def valid(epoch):
    """Evaluate the weight-averaged network on the CLEVR 'val' split.

    Accuracy is accumulated as a per-batch mean and averaged over the
    number of batches (drop_last=True keeps batches uniform); the LR
    scheduler, when configured, is stepped with that epoch-level metric.
    `epoch` is accepted for interface parity with train() but unused.
    """
    clevr = CLEVR(cfg.DATALOADER.FEATURES_PATH, 'val', transform=None)
    loader = DataLoader(
        clevr,
        batch_size=cfg.DATALOADER.BATCH_SIZE,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
        collate_fn=collate_data,
        drop_last=True,
    )
    batches = iter(loader)
    net_running.train(False)
    with torch.no_grad():
        acc_sum = 0
        for image, question, q_len, answer, _, _ in tqdm(batches):
            image, question = image.to(device), question.to(device)
            output = net_running(image, question, q_len)
            hits = output.detach().argmax(1) == answer.to(device)
            acc_sum += hits.float().mean().item()
    # NOTE(review): indentation was ambiguous in the source paste; stepping
    # the scheduler once per epoch with the mean accuracy matches the
    # ReduceLROnPlateau convention — confirm against the caller.
    if scheduler:
        scheduler.step(acc_sum / len(batches))
    print('Avg Acc: {:.5f}'.format(acc_sum / len(batches)))
    clevr.close()
def valid(accum_net, clevr_dir, epoch):
    """Evaluate `accum_net` on CLEVR 'val', logging per-family accuracy.

    Writes one 'family: accuracy' line per question family to
    log/log_<epoch+1>.txt and prints the overall accuracy.
    """
    clevr = CLEVR(clevr_dir, "val", transform=None)
    loader = DataLoader(clevr, batch_size=batch_size, num_workers=4,
                        collate_fn=collate_data)
    accum_net.train(False)
    family_correct = Counter()
    family_total = Counter()
    with torch.no_grad():
        for image, question, q_len, answer, family in tqdm(iter(loader)):
            image, question = image.to(device), question.to(device)
            output = accum_net(image, question, q_len)
            hits = output.detach().argmax(1) == answer.to(device)
            for hit, fam in zip(hits, family):
                if hit:
                    family_correct[fam] += 1
                family_total[fam] += 1
    with open("log/log_{}.txt".format(str(epoch + 1).zfill(2)), "w") as w:
        for fam, total in family_total.items():
            w.write("{}: {:.5f}\n".format(fam, family_correct[fam] / total))
    print("Avg Acc: {:.5f}".format(
        sum(family_correct.values()) / sum(family_total.values())))
    clevr.close()
def valid(epoch):
    """Evaluate the global `net_running` on CLEVR 'val'.

    Mirrors the accum_net variant but reads dataset root from sys.argv[1]
    and uses module-level globals; per-family accuracies are written to
    log/log_<epoch+1>.txt and the overall accuracy printed.
    """
    clevr = CLEVR(sys.argv[1], 'val', transform=None)
    loader = DataLoader(clevr, batch_size=batch_size, num_workers=4,
                        collate_fn=collate_data)
    net_running.train(False)
    family_correct = Counter()
    family_total = Counter()
    with torch.no_grad():
        for image, question, q_len, answer, family in tqdm(iter(loader)):
            image, question = image.to(device), question.to(device)
            output = net_running(image, question, q_len)
            hits = output.detach().argmax(1) == answer.to(device)
            for hit, fam in zip(hits, family):
                if hit:
                    family_correct[fam] += 1
                family_total[fam] += 1
    with open('log/log_{}.txt'.format(str(epoch + 1).zfill(2)), 'w') as w:
        for fam, total in family_total.items():
            w.write('{}: {:.5f}\n'.format(fam, family_correct[fam] / total))
    print('Avg Acc: {:.5f}'.format(
        sum(family_correct.values()) / sum(family_total.values())))
    clevr.close()
def train(epoch, dataset_type): root = args.root if dataset_type == "CLEVR": dataset_object = CLEVR(root, transform=transform) else: dataset_object = GQA(root, transform=transform) train_set = DataLoader(dataset_object, batch_size=BATCH_SIZE, num_workers=multiprocessing.cpu_count(), collate_fn=collate_data) dataset = iter(train_set) pbar = tqdm(dataset) running_loss = 0 correct_counts = 0 total_counts = 0 net.train() for image, question, q_len, answer in pbar: image, question, answer = ( image.to(DEVICE), question.to(DEVICE), answer.to(DEVICE), ) net.zero_grad() output = net(image, question, q_len) loss = criterion(output, answer) loss.backward() optimizer.step() correct = output.detach().argmax(1) == answer correct_counts += sum(correct).item() total_counts += image.size(0) correct = correct.clone().type( torch.FloatTensor).detach().sum() / BATCH_SIZE running_loss += loss.item() / BATCH_SIZE pbar.set_description( '[Training] Epoch: {}; Loss: {:.8f}; Acc: {:.5f}'.format( epoch + 1, loss.item(), correct)) print('[Training] loss: {:8f}, accuracy: {:5f}'.format( running_loss / len(train_set.dataset), correct_counts / total_counts)) dataset_object.close() return running_loss / len(train_set.dataset), correct_counts / total_counts
def train(net, accum_net, optimizer, criterion, clevr_dir, epoch):
    """Train `net` for one epoch on CLEVR, folding weights into `accum_net`.

    Gradients of the MAC read unit are clipped to norm 1 each step, and
    accumulate(accum_net, net) maintains an exponential moving average of
    the weights after every optimizer step.  The progress bar shows an
    exponentially-smoothed per-batch accuracy.
    """
    clevr = CLEVR(clevr_dir, transform=transform)
    train_set = DataLoader(clevr, batch_size=batch_size, num_workers=4,
                           collate_fn=collate_data)
    pbar = tqdm(iter(train_set))
    moving_acc = 0
    net.train(True)
    for i, (image, question, q_len, answer, _) in enumerate(pbar):
        image, question, answer = (image.to(device), question.to(device),
                                   answer.to(device))
        net.zero_grad()
        output = net(image, question, q_len)
        loss = criterion(output, answer)
        loss.backward()
        # If wrapped in a DataParallel, the actual net is at DataParallel.module.
        m = net.module if isinstance(net, nn.DataParallel) else net
        torch.nn.utils.clip_grad_norm_(m.mac.read.parameters(), 1)
        optimizer.step()
        correct = output.detach().argmax(1) == answer
        # FIX: torch.tensor(correct, dtype=...) re-wraps an existing tensor,
        # which is deprecated (warns and copies); .float() is the supported
        # dtype conversion and yields the same value.
        correct = correct.float().sum() / batch_size
        if moving_acc == 0:
            moving_acc = correct
        else:
            moving_acc = moving_acc * 0.99 + correct * 0.01
        pbar.set_description("Epoch: {}; Loss: {:.5f}; Acc: {:.5f}".format(
            epoch + 1, loss.item(), moving_acc))
        # EMA update of accum_net after every step (placement inferred from
        # the sibling MAC training loops — confirm against original repo).
        accumulate(accum_net, net)
    clevr.close()
def test(accum_net, clevr_dir):
    """Evaluate `accum_net` on CLEVR 'val' and dump per-family accuracy.

    WARNING: contains live debug scaffolding — the saved-attention
    inspection below calls sys.exit() on the first MAC step of the first
    batch, so the full evaluation never runs while it is present.
    """
    print("Starting tests!")
    print(accum_net)
    clevr = CLEVR(clevr_dir, "val", transform=None)
    loader = DataLoader(clevr, batch_size=batch_size, num_workers=4,
                        collate_fn=collate_data)
    accum_net.train(False)
    family_correct = Counter()
    family_total = Counter()
    with torch.no_grad():
        for image, question, q_len, answer, family in tqdm(iter(loader)):
            image, question = image.to(device), question.to(device)
            output = accum_net(image, question, q_len)
            # If wrapped in a DataParallel, the actual net is at DataParallel.module.
            m = accum_net.module if isinstance(accum_net, nn.DataParallel) else accum_net
            # Shape presumed [{read, write}, n_steps, batch_size, n_memories]
            # per the original author's note — verify.
            attentions = m.saved_attns
            for i, step in enumerate(attentions):
                print(f"Step {i}")
                print("Read attn shape:", torch.tensor(step["read"][0]).shape)
                print(image.shape)
                sys.exit()  # debug: terminates the whole process here
            hits = output.detach().argmax(1) == answer.to(device)
            for hit, fam in zip(hits, family):
                if hit:
                    family_correct[fam] += 1
                family_total[fam] += 1
    with open("log/test_log.txt", "w") as w:
        for fam, total in family_total.items():
            w.write("{}: {:.5f}\n".format(fam, family_correct[fam] / total))
    print("Avg Acc: {:.5f}".format(
        sum(family_correct.values()) / sum(family_total.values())))
    clevr.close()
def valid(epoch, dataset_type):
    """Validate the global `net` on CLEVR or GQA 'val'.

    Args:
        epoch: zero-based epoch index (displayed as epoch + 1).
        dataset_type: "CLEVR" selects CLEVR; anything else selects GQA.

    Returns:
        (mean_loss, accuracy) over the validation set.
    """
    root = args.root
    if dataset_type == "CLEVR":
        dataset_object = CLEVR(root, 'val', transform=None)
    else:
        dataset_object = GQA(root, 'val', transform=None)
    valid_set = DataLoader(dataset_object, batch_size=BATCH_SIZE,
                           num_workers=multiprocessing.cpu_count(),
                           collate_fn=collate_data)
    net.eval()
    correct_counts = 0
    total_counts = 0
    running_loss = 0.0
    with torch.no_grad():
        pbar = tqdm(iter(valid_set))
        for image, question, q_len, answer in pbar:
            image, question, answer = (
                image.to(DEVICE),
                question.to(DEVICE),
                answer.to(DEVICE),
            )
            output = net(image, question, q_len)
            loss = criterion(output, answer)
            hits = output.detach().argmax(1) == answer
            correct_counts += hits.sum().item()
            total_counts += image.size(0)
            running_loss += loss.item() / BATCH_SIZE
            pbar.set_description(
                '[Val] Epoch: {}; Loss: {:.8f}; Acc: {:.5f}'.format(
                    epoch + 1, loss.item(), correct_counts / total_counts))
    print('[Val] loss: {:8f}, accuracy: {:5f}'.format(
        running_loss / len(valid_set.dataset),
        correct_counts / total_counts))
    dataset_object.close()
    return running_loss / len(valid_set.dataset), correct_counts / total_counts
def train(epoch):
    """Train the global `net` for one epoch on CLEVR (root = sys.argv[1]).

    accumulate(net_running, net) keeps an exponential moving average of
    the weights after each optimizer step; the progress bar shows an
    exponentially-smoothed per-batch accuracy.
    """
    clevr = CLEVR(sys.argv[1], transform=transform)
    train_set = DataLoader(
        clevr, batch_size=batch_size, num_workers=4, collate_fn=collate_data
    )
    pbar = tqdm(iter(train_set))
    moving_acc = 0
    net.train(True)
    for image, question, q_len, answer, _ in pbar:
        image, question, answer = (
            image.to(device),
            question.to(device),
            answer.to(device),
        )
        net.zero_grad()
        output = net(image, question, q_len)
        loss = criterion(output, answer)
        loss.backward()
        optimizer.step()
        correct = output.detach().argmax(1) == answer
        # FIX: torch.tensor(correct, dtype=...) re-wraps an existing tensor,
        # which is deprecated (warns and copies); .float() is the supported
        # conversion and yields the same value.
        correct = correct.float().sum() / batch_size
        if moving_acc == 0:
            moving_acc = correct
        else:
            moving_acc = moving_acc * 0.99 + correct * 0.01
        pbar.set_description(
            'Epoch: {}; Loss: {:.5f}; Acc: {:.5f}'.format(
                epoch + 1, loss.item(), moving_acc
            )
        )
        # EMA update after every step (placement inferred from the sibling
        # MAC training loops — confirm against original repo).
        accumulate(net_running, net)
    clevr.close()
def train(epoch):
    """Train the global `net` for one epoch, configured via `cfg`.

    Optionally clips gradient norms to cfg.SOLVER.GRAD_CLIP; the EMA copy
    `net_running` is refreshed via accumulate() after every step.  Note
    the progress bar shows `epoch` as-is (no +1), matching the source.
    """
    clevr = CLEVR(cfg.DATALOADER.FEATURES_PATH, transform=transform)
    loader = DataLoader(
        clevr,
        batch_size=cfg.DATALOADER.BATCH_SIZE,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
        collate_fn=collate_data,
        drop_last=True,
    )
    pbar = tqdm(iter(loader))
    smoothed_acc = 0
    net.train(True)
    for image, question, q_len, answer, _, _ in pbar:
        image, question, answer = (
            image.to(device),
            question.to(device),
            answer.to(device),
        )
        net.zero_grad()
        output = net(image, question, q_len)
        loss = criterion(output, answer)
        loss.backward()
        if cfg.SOLVER.GRAD_CLIP:
            nn.utils.clip_grad_norm_(net.parameters(), cfg.SOLVER.GRAD_CLIP)
        optimizer.step()
        hits = output.detach().argmax(1) == answer
        accuracy = hits.float().mean().item()
        if smoothed_acc == 0:
            smoothed_acc = accuracy
        else:
            smoothed_acc = smoothed_acc * 0.99 + accuracy * 0.01
        pbar.set_description(
            'Epoch: {}; Loss: {:.5f}; Acc: {:.5f}'.format(
                epoch, loss.item(), smoothed_acc
            )
        )
        # EMA update after every step (placement inferred from the sibling
        # MAC training loops — confirm against original repo).
        accumulate(net_running, net)
    clevr.close()
def valid(epoch, dataset_type):
    """Validate the global `net` on CLEVR or GQA using hard-coded roots.

    Writes the final accuracy to log/log_<epoch+1>.txt; note the summary
    line is printed with the label 'Training loss' (kept verbatim).
    """
    if dataset_type == "CLEVR":
        dataset_object = CLEVR('data/CLEVR_v1.0', 'val', transform=None)
    else:
        dataset_object = GQA('data/gqa', 'val', transform=None)
    valid_set = DataLoader(dataset_object, batch_size=BATCH_SIZE,
                           num_workers=multiprocessing.cpu_count(),
                           collate_fn=collate_data)
    net.eval()
    correct_counts = 0
    total_counts = 0
    running_loss = 0.0
    with torch.no_grad():
        pbar = tqdm(iter(valid_set))
        for image, question, q_len, answer in pbar:
            image, question, answer = (
                image.to(DEVICE),
                question.to(DEVICE),
                answer.to(DEVICE),
            )
            output = net(image, question, q_len)
            loss = criterion(output, answer)
            hits = output.detach().argmax(1) == answer
            correct_counts += hits.sum().item()
            total_counts += image.size(0)
            running_loss += loss.item() / BATCH_SIZE
            pbar.set_description(
                'Epoch: {}; Loss: {:.8f}; Acc: {:.5f}'.format(
                    epoch + 1, loss.item(), correct_counts / total_counts))
    with open('log/log_{}.txt'.format(str(epoch + 1).zfill(2)), 'w') as w:
        w.write('{:.5f}\n'.format(correct_counts / total_counts))
    print('Training loss: {:8f}, accuracy: {:5f}'.format(
        running_loss / len(valid_set.dataset),
        correct_counts / total_counts))
    dataset_object.close()
def train(epoch):
    """Train the global `relnet` for one epoch on preprocessed CLEVR.

    Gradient norms are clipped to `clip_norm`; the progress bar shows an
    exponentially-smoothed accuracy and the current learning rate.
    """
    train_set = DataLoader(
        CLEVR(
            sys.argv[1],
            transform=transform,
            reverse_question=reverse_question,
            use_preprocessed=True,
        ),
        batch_size=batch_size,
        num_workers=n_worker,
        shuffle=True,
        collate_fn=collate_data,
    )
    pbar = tqdm(iter(train_set))
    smoothed_acc = 0
    relnet.train(True)
    for i, (image, question, q_len, answer, _) in enumerate(pbar):
        image, question, q_len, answer = (
            image.to(device),
            question.to(device),
            torch.tensor(q_len),
            answer.to(device),
        )
        relnet.zero_grad()
        output = relnet(image, question, q_len)
        loss = criterion(output, answer)
        loss.backward()
        nn.utils.clip_grad_norm_(relnet.parameters(), clip_norm)
        optimizer.step()
        batch_hits = (output.data.cpu().numpy().argmax(1)
                      == answer.data.cpu().numpy())
        batch_acc = batch_hits.sum() / batch_size
        if smoothed_acc == 0:
            smoothed_acc = batch_acc
        else:
            smoothed_acc = smoothed_acc * 0.99 + batch_acc * 0.01
        pbar.set_description(
            'Epoch: {}; Loss: {:.5f}; Acc: {:.5f}; LR: {:.6f}'.format(
                epoch + 1,
                loss.detach().item(),
                smoothed_acc,
                optimizer.param_groups[0]['lr'],
            ))
def valid(epoch):
    """Run one validation epoch for `relnet` and append metrics to CSV.

    FIX: the original called relnet.train(True) during validation, leaving
    dropout / batch-norm layers in training mode and corrupting the
    reported accuracy; a validation pass must run in eval mode (the
    sibling valid() variants call .eval()).

    Appends (epoch, mean loss, mean accuracy) to logs_valid.csv.
    """
    valid_set = DataLoader(CLEVR(args.data_dir, args.segs_dir, 'val'),
                           batch_size=args.batch_size, shuffle=True,
                           num_workers=16, collate_fn=collate_data,
                           pin_memory=args.cuda)
    relnet.eval()
    avg_loss = 0
    avg_acc = 0
    for step, (apps, masks, num_layers, question, q_len, answer, family) in enumerate(valid_set):
        q_len = torch.LongTensor(np.array(q_len))
        num_layers = torch.LongTensor(np.array(num_layers))
        if args.cuda:
            apps, masks, num_layers, question, answer, q_len = \
                Variable(apps).cuda(), Variable(masks).cuda(), \
                Variable(num_layers).cuda(), Variable(question).cuda(), \
                Variable(answer).cuda(), Variable(q_len).cuda()
        else:
            apps, masks, num_layers, question, answer, q_len = \
                Variable(apps), Variable(masks), Variable(num_layers), \
                Variable(question), Variable(answer), Variable(q_len)
        output = relnet(apps, masks, num_layers, question, q_len)
        pred_answer = output.data.cpu().numpy().argmax(1)
        accuracy = np.mean(answer.data.cpu().numpy() == pred_answer)
        avg_acc += accuracy
        loss = torch.sum(criterion(output, answer))
        # NOTE(review): .data[0] only works on torch < 0.4; use .item() if
        # this ever runs on a newer torch.
        avg_loss += loss.data[0]
        if step % args.log_interval == 0:
            print('Epoch: {}; Step: {:d}; Loss: {:.5f}; Avg_Accuracy: {:.5f}'.
                  format(epoch, step, loss.data[0], avg_acc / (step + 1)))
    with open('logs_valid.csv', 'a') as csvfile_valid:
        fieldnames_valid = ['epoch', 'valid_loss', 'valid_acc']
        writer_valid = csv.DictWriter(csvfile_valid,
                                      fieldnames=fieldnames_valid)
        writer_valid.writerow({
            'epoch': epoch,
            'valid_loss': avg_loss / (step + 1),
            'valid_acc': avg_acc / (step + 1)
        })
    print('Epoch: {:d}; Avg Acc: {:.5f}'.format(epoch, avg_acc / (step + 1)))
def valid(epoch):
    """Evaluate `relnet` on CLEVR 'val', logging per-class accuracy.

    Uses half the training batch size, tallies correctness per answer
    class plus an aggregate 'total' row, and writes the breakdown to
    log/log_<epoch+1>.txt.
    """
    valid_set = DataLoader(
        CLEVR(
            sys.argv[1],
            'val',
            transform=None,
            reverse_question=reverse_question,
            use_preprocessed=True,
        ),
        batch_size=batch_size // 2,
        num_workers=4,
        collate_fn=collate_data,
    )
    relnet.eval()
    class_correct = Counter()
    class_total = Counter()
    with torch.no_grad():
        for image, question, q_len, answer, answer_class in tqdm(iter(valid_set)):
            image, question, q_len = (
                image.to(device),
                question.to(device),
                torch.tensor(q_len),
            )
            output = relnet(image, question, q_len)
            hits = output.data.cpu().numpy().argmax(1) == answer.numpy()
            for hit, cls in zip(hits, answer_class):
                if hit:
                    class_correct[cls] += 1
                class_total[cls] += 1
    class_correct['total'] = sum(class_correct.values())
    class_total['total'] = sum(class_total.values())
    with open('log/log_{}.txt'.format(str(epoch + 1).zfill(3)), 'w') as w:
        for cls, total in class_total.items():
            w.write('{}: {:.5f}\n'.format(cls, class_correct[cls] / total))
    print('Avg Acc: {:.5f}'.format(
        class_correct['total'] / class_total['total']))
import sys
import pickle

from tqdm import tqdm
from torch.utils.data import DataLoader

from dataset import CLEVR, collate_data, transform
from model import RelationNetworks

batch_size = 64
n_epoch = 180

# NOTE(review): collate_data is imported but not passed to this DataLoader,
# and pickle / RelationNetworks are unused here — presumably a raw
# data-loading throughput benchmark; confirm before extending.
train_set = DataLoader(
    CLEVR(sys.argv[1], transform=transform),
    batch_size=batch_size,
    num_workers=4,
)

for epoch in range(n_epoch):
    pbar = tqdm(iter(train_set))
    for image, question, q_len, answer in pbar:
        pass  # iterate only: the tqdm rate is the measurement
import sys
import pickle
from collections import Counter

import torch
from tqdm import tqdm
from torch.utils.data import DataLoader

from dataset import CLEVR, collate_data, transform

batch_size = 64
n_epoch = 180

train_set = DataLoader(
    CLEVR(sys.argv[1], 'val', transform=None),
    batch_size=batch_size,
    num_workers=4,
    collate_fn=collate_data,
)

# Load a fully pickled model (torch.load on a whole module object).
net = torch.load(sys.argv[2])
net.eval()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

for epoch in range(n_epoch):
    pbar = tqdm(iter(train_set))
    family_correct = Counter()
    family_total = Counter()
    for image, question, q_len, answer, family in pbar:
        # NOTE(review): the visible loop body ends after the device
        # transfer — the counters above are never updated in this chunk;
        # looks truncated, confirm against the full file.
        image, question = image.to(device), question.to(device)
def train(epoch):
    """Train `relnet` for one epoch on segmented CLEVR (old Variable API).

    Logs a smoothed loss, running averages, and examples/sec every
    args.log_interval steps, then appends (epoch, mean loss, mean acc)
    to logs_train.csv.  Written against torch < 0.4 (Variable, .data[0]).
    """
    train_set = DataLoader(CLEVR(args.data_dir, args.segs_dir, 'train'),
                           batch_size=args.batch_size, shuffle=True,
                           num_workers=16, collate_fn=collate_data,
                           pin_memory=args.cuda)
    moving_loss = 0
    relnet.train(True)
    avg_loss = 0
    avg_acc = 0
    for step, (apps, masks, num_layers, question, q_len, answer, family) in enumerate(train_set):
        start_time = time.time()
        q_len = torch.LongTensor(np.array(q_len))
        num_layers = torch.LongTensor(np.array(num_layers))
        if args.cuda:
            apps, masks, num_layers, question, answer, q_len = \
                Variable(apps).cuda(), Variable(masks).cuda(), \
                Variable(num_layers).cuda(), Variable(question).cuda(), \
                Variable(answer).cuda(), Variable(q_len).cuda()
        else:
            apps, masks, num_layers, question, answer, q_len = \
                Variable(apps), Variable(masks), Variable(num_layers), \
                Variable(question), Variable(answer), Variable(q_len)
        relnet.zero_grad()
        output = relnet(apps, masks, num_layers, question, q_len)
        pred_answer = output.data.cpu().numpy().argmax(1)
        accuracy = np.mean(answer.data.cpu().numpy() == pred_answer)
        avg_acc += accuracy
        loss = torch.sum(criterion(output, answer))
        loss.backward()
        optimizer.step()
        # Exponentially-smoothed loss for display.
        if moving_loss == 0:
            moving_loss = loss.data[0]
        else:
            moving_loss = moving_loss * 0.9 + loss.data[0] * 0.1
        avg_loss += loss.data[0]
        exm_per_sec = args.batch_size / (time.time() - start_time)
        if step % args.log_interval == 0:
            print(
                '{}; Epoch: {}; Step: {:d}; Loss: {:.5f}; Avg: {:.5f}; Avg_Accuracy: {:.5f}; Example/sec: {:.5f}'
                .format(datetime.datetime.now(), epoch, step, loss.data[0],
                        avg_loss / (step + 1), avg_acc / (step + 1),
                        exm_per_sec))
    with open('logs_train.csv', 'a') as csvfile_train:
        fieldnames_train = ['epoch', 'train_loss', 'train_acc']
        writer_train = csv.DictWriter(csvfile_train,
                                      fieldnames=fieldnames_train)
        writer_train.writerow({
            'epoch': epoch,
            'train_loss': avg_loss / (step + 1),
            'train_acc': avg_acc / (step + 1)
        })
import sys
import pickle
from collections import Counter

import torch
from tqdm import tqdm
from torch.utils.data import DataLoader

from dataset import CLEVR, collate_data, transform

batch_size = 64
n_epoch = 180

train_set = DataLoader(
    CLEVR(sys.argv[1], "val", transform=None),
    batch_size=batch_size,
    num_workers=4,
    collate_fn=collate_data,
)

# Load a fully pickled model (torch.load on a whole module object).
net = torch.load(sys.argv[2])
net.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for epoch in range(n_epoch):
    pbar = tqdm(iter(train_set))
    family_correct = Counter()
    family_total = Counter()
    for image, question, q_len, answer, family in pbar:
        # NOTE(review): the visible loop body ends after the device
        # transfer — the counters above are never updated in this chunk;
        # looks truncated, confirm against the full file.
        image, question = image.to(device), question.to(device)