def valid(epoch):
    """Validate `net_running` on the CLEVR val split.

    Writes per-question-family accuracy to log/log_<epoch+1>.txt and prints
    the overall average accuracy. Relies on module-level globals:
    net_running, batch_size, device, collate_data.
    """
    clevr = CLEVR(sys.argv[1], 'val', transform=None)
    loader = DataLoader(
        clevr, batch_size=batch_size, num_workers=4, collate_fn=collate_data
    )
    net_running.train(False)

    family_correct = Counter()
    family_total = Counter()
    with torch.no_grad():
        for image, question, q_len, answer, family in tqdm(iter(loader)):
            image, question = image.to(device), question.to(device)
            output = net_running(image, question, q_len)
            hits = output.detach().argmax(1) == answer.to(device)
            # Tally correctness per question family.
            for hit, fam in zip(hits, family):
                if hit:
                    family_correct[fam] += 1
                family_total[fam] += 1

    with open('log/log_{}.txt'.format(str(epoch + 1).zfill(2)), 'w') as w:
        for fam, total in family_total.items():
            w.write('{}: {:.5f}\n'.format(fam, family_correct[fam] / total))

    print('Avg Acc: {:.5f}'.format(
        sum(family_correct.values()) / sum(family_total.values())))
    clevr.close()
def valid(accum_net, clevr_dir, epoch):
    """Validate `accum_net` on the CLEVR val split.

    Args:
        accum_net: model to evaluate (typically the EMA copy).
        clevr_dir: root directory of the CLEVR dataset.
        epoch: zero-based epoch index; log file is named with epoch + 1.

    Writes per-question-family accuracy to log/log_<epoch+1>.txt and prints
    the overall average. Relies on module-level globals: batch_size,
    device, collate_data.
    """
    clevr = CLEVR(clevr_dir, "val", transform=None)
    loader = DataLoader(
        clevr, batch_size=batch_size, num_workers=4, collate_fn=collate_data
    )
    accum_net.train(False)

    family_correct = Counter()
    family_total = Counter()
    with torch.no_grad():
        for image, question, q_len, answer, family in tqdm(iter(loader)):
            image, question = image.to(device), question.to(device)
            output = accum_net(image, question, q_len)
            hits = output.detach().argmax(1) == answer.to(device)
            # Tally correctness per question family.
            for hit, fam in zip(hits, family):
                if hit:
                    family_correct[fam] += 1
                family_total[fam] += 1

    with open("log/log_{}.txt".format(str(epoch + 1).zfill(2)), "w") as w:
        for fam, total in family_total.items():
            w.write("{}: {:.5f}\n".format(fam, family_correct[fam] / total))

    print("Avg Acc: {:.5f}".format(
        sum(family_correct.values()) / sum(family_total.values())))
    clevr.close()
def valid(epoch):
    """Validate `net_running` on the CLEVR val split, stepping the scheduler.

    Computes the mean of per-batch accuracies (batches are uniform because
    drop_last=True), feeds it to `scheduler.step` when a scheduler exists,
    and prints it. Relies on module-level globals: cfg, net_running,
    device, collate_data, scheduler.
    """
    clevr = CLEVR(cfg.DATALOADER.FEATURES_PATH, 'val', transform=None)
    loader = DataLoader(
        clevr,
        batch_size=cfg.DATALOADER.BATCH_SIZE,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
        collate_fn=collate_data,
        drop_last=True,
    )
    batches = iter(loader)
    net_running.train(False)

    with torch.no_grad():
        acc_sum = 0
        for image, question, q_len, answer, _, _ in tqdm(batches):
            image, question = image.to(device), question.to(device)
            output = net_running(image, question, q_len)
            hits = output.detach().argmax(1) == answer.to(device)
            acc_sum += hits.float().mean().item()

        avg_acc = acc_sum / len(batches)
        if scheduler:
            # Plateau-style scheduler driven by validation accuracy.
            scheduler.step(avg_acc)
        print('Avg Acc: {:.5f}'.format(avg_acc))

    clevr.close()
def train(epoch, dataset_type):
    """Train `net` for one epoch on CLEVR or GQA.

    Args:
        epoch: zero-based epoch index (displayed as epoch + 1).
        dataset_type: "CLEVR" selects the CLEVR dataset; anything else GQA.

    Returns:
        Tuple of (accumulated loss / dataset size, overall accuracy).

    Relies on module-level globals: args, transform, BATCH_SIZE, DEVICE,
    net, criterion, optimizer, collate_data.
    """
    root = args.root
    if dataset_type == "CLEVR":
        dataset_object = CLEVR(root, transform=transform)
    else:
        dataset_object = GQA(root, transform=transform)
    train_set = DataLoader(
        dataset_object,
        batch_size=BATCH_SIZE,
        num_workers=multiprocessing.cpu_count(),
        collate_fn=collate_data,
    )
    pbar = tqdm(iter(train_set))

    running_loss = 0
    correct_counts = 0
    total_counts = 0
    net.train()
    for image, question, q_len, answer in pbar:
        image, question, answer = (
            image.to(DEVICE),
            question.to(DEVICE),
            answer.to(DEVICE),
        )
        net.zero_grad()
        output = net(image, question, q_len)
        loss = criterion(output, answer)
        loss.backward()
        optimizer.step()

        correct = output.detach().argmax(1) == answer
        batch_hits = correct.sum().item()
        correct_counts += batch_hits
        total_counts += image.size(0)
        # FIX: divide by the actual batch size rather than BATCH_SIZE, so the
        # displayed accuracy is correct for a smaller final batch (the loader
        # does not use drop_last). Also replaces the verbose
        # clone().type(torch.FloatTensor).detach() chain.
        batch_acc = batch_hits / image.size(0)
        running_loss += loss.item() / BATCH_SIZE
        pbar.set_description(
            '[Training] Epoch: {}; Loss: {:.8f}; Acc: {:.5f}'.format(
                epoch + 1, loss.item(), batch_acc))

    print('[Training] loss: {:8f}, accuracy: {:5f}'.format(
        running_loss / len(train_set.dataset),
        correct_counts / total_counts))
    dataset_object.close()
    return (running_loss / len(train_set.dataset),
            correct_counts / total_counts)
def train(net, accum_net, optimizer, criterion, clevr_dir, epoch):
    """Train `net` for one epoch on CLEVR, EMA-updating `accum_net`.

    Args:
        net: model being optimized (possibly wrapped in nn.DataParallel).
        accum_net: exponential-moving-average copy updated via accumulate().
        optimizer: optimizer stepping `net`'s parameters.
        criterion: loss function applied to (output, answer).
        clevr_dir: root directory of the CLEVR dataset.
        epoch: zero-based epoch index (displayed as epoch + 1).

    Relies on module-level globals: transform, batch_size, device,
    collate_data, accumulate.
    """
    clevr = CLEVR(clevr_dir, transform=transform)
    train_set = DataLoader(clevr, batch_size=batch_size, num_workers=4,
                           collate_fn=collate_data)
    pbar = tqdm(iter(train_set))
    moving_acc = 0

    net.train(True)
    for i, (image, question, q_len, answer, _) in enumerate(pbar):
        image, question, answer = (image.to(device), question.to(device),
                                   answer.to(device))
        net.zero_grad()
        output = net(image, question, q_len)
        loss = criterion(output, answer)
        loss.backward()

        # If wrapped in a DataParallel, the actual net is at .module.
        m = net.module if isinstance(net, nn.DataParallel) else net
        # Clip only the MAC read-unit gradients (max norm 1).
        torch.nn.utils.clip_grad_norm_(m.mac.read.parameters(), 1)
        optimizer.step()

        correct = output.detach().argmax(1) == answer
        # FIX: use correct.float() instead of torch.tensor(correct, dtype=...),
        # which copy-constructs from an existing tensor and raises a
        # UserWarning (PyTorch recommends .float()/.detach().clone()).
        batch_acc = correct.float().sum() / batch_size
        # Exponential moving average of per-batch accuracy for the progress bar.
        if moving_acc == 0:
            moving_acc = batch_acc
        else:
            moving_acc = moving_acc * 0.99 + batch_acc * 0.01

        pbar.set_description("Epoch: {}; Loss: {:.5f}; Acc: {:.5f}".format(
            epoch + 1, loss.item(), moving_acc))
        accumulate(accum_net, net)

    clevr.close()
def test(accum_net, clevr_dir):
    """Evaluate `accum_net` on the CLEVR val split and log per-family accuracy.

    Writes one line per question family to log/test_log.txt and prints the
    overall average accuracy. Relies on module-level globals: batch_size,
    device, collate_data.
    """
    print("Starting tests!")
    print(accum_net)
    clevr = CLEVR(clevr_dir, "val", transform=None)
    test_set = DataLoader(clevr, batch_size=batch_size, num_workers=4,
                          collate_fn=collate_data)
    accum_net.train(False)

    family_correct = Counter()
    family_total = Counter()
    with torch.no_grad():
        for image, question, q_len, answer, family in tqdm(iter(test_set)):
            image, question = image.to(device), question.to(device)
            output = accum_net(image, question, q_len)
            # FIX: removed leftover debug code that unwrapped DataParallel to
            # print saved_attns shapes and then called sys.exit() on the very
            # first batch — it killed the process and made the accuracy
            # accounting and log writing below unreachable.
            correct = output.detach().argmax(1) == answer.to(device)
            for c, fam in zip(correct, family):
                if c:
                    family_correct[fam] += 1
                family_total[fam] += 1

    with open("log/test_log.txt", "w") as w:
        for k, v in family_total.items():
            w.write("{}: {:.5f}\n".format(k, family_correct[k] / v))

    print("Avg Acc: {:.5f}".format(
        sum(family_correct.values()) / sum(family_total.values())))
    clevr.close()
def valid(epoch, dataset_type):
    """Evaluate `net` on the validation split of CLEVR or GQA.

    Args:
        epoch: zero-based epoch index (displayed as epoch + 1).
        dataset_type: "CLEVR" selects the CLEVR dataset; anything else GQA.

    Returns:
        Tuple of (accumulated loss / dataset size, overall accuracy).

    Relies on module-level globals: args, BATCH_SIZE, DEVICE, net,
    criterion, collate_data.
    """
    root = args.root
    dataset_object = (CLEVR(root, 'val', transform=None)
                      if dataset_type == "CLEVR"
                      else GQA(root, 'val', transform=None))
    valid_set = DataLoader(dataset_object, batch_size=BATCH_SIZE,
                           num_workers=multiprocessing.cpu_count(),
                           collate_fn=collate_data)
    net.eval()

    correct_counts = 0
    total_counts = 0
    running_loss = 0.0
    with torch.no_grad():
        pbar = tqdm(iter(valid_set))
        for image, question, q_len, answer in pbar:
            image = image.to(DEVICE)
            question = question.to(DEVICE)
            answer = answer.to(DEVICE)

            output = net(image, question, q_len)
            loss = criterion(output, answer)

            hits = output.detach().argmax(1) == answer
            correct_counts += hits.sum().item()
            total_counts += image.size(0)
            running_loss += loss.item() / BATCH_SIZE
            pbar.set_description(
                '[Val] Epoch: {}; Loss: {:.8f}; Acc: {:.5f}'.format(
                    epoch + 1, loss.item(),
                    correct_counts / total_counts))

    print('[Val] loss: {:8f}, accuracy: {:5f}'.format(
        running_loss / len(valid_set.dataset),
        correct_counts / total_counts))
    dataset_object.close()
    return (running_loss / len(valid_set.dataset),
            correct_counts / total_counts)
def train(epoch):
    """Train `net` for one epoch on CLEVR, EMA-updating `net_running`.

    Args:
        epoch: zero-based epoch index (displayed as epoch + 1).

    Relies on module-level globals: sys.argv[1] (dataset root), transform,
    batch_size, device, net, net_running, criterion, optimizer,
    collate_data, accumulate.
    """
    clevr = CLEVR(sys.argv[1], transform=transform)
    train_set = DataLoader(
        clevr, batch_size=batch_size, num_workers=4, collate_fn=collate_data
    )
    pbar = tqdm(iter(train_set))
    moving_acc = 0

    net.train(True)
    for image, question, q_len, answer, _ in pbar:
        image, question, answer = (
            image.to(device),
            question.to(device),
            answer.to(device),
        )
        net.zero_grad()
        output = net(image, question, q_len)
        loss = criterion(output, answer)
        loss.backward()
        optimizer.step()

        correct = output.detach().argmax(1) == answer
        # FIX: use correct.float() instead of torch.tensor(correct, dtype=...),
        # which copy-constructs from an existing tensor and raises a
        # UserWarning (PyTorch recommends .float()/.detach().clone()).
        batch_acc = correct.float().sum() / batch_size
        # Exponential moving average of per-batch accuracy for the progress bar.
        if moving_acc == 0:
            moving_acc = batch_acc
        else:
            moving_acc = moving_acc * 0.99 + batch_acc * 0.01

        pbar.set_description(
            'Epoch: {}; Loss: {:.5f}; Acc: {:.5f}'.format(
                epoch + 1, loss.item(), moving_acc
            )
        )
        accumulate(net_running, net)

    clevr.close()
def train(epoch):
    """Train `net` for one epoch on CLEVR features, EMA-updating `net_running`.

    Note: the progress bar shows `epoch` as-is (no +1 offset).
    Relies on module-level globals: cfg, transform, device, net,
    net_running, criterion, optimizer, collate_data, accumulate.
    """
    clevr = CLEVR(cfg.DATALOADER.FEATURES_PATH, transform=transform)
    loader = DataLoader(
        clevr,
        batch_size=cfg.DATALOADER.BATCH_SIZE,
        num_workers=cfg.DATALOADER.NUM_WORKERS,
        collate_fn=collate_data,
        drop_last=True,
    )
    pbar = tqdm(iter(loader))
    ema_acc = 0

    net.train(True)
    for image, question, q_len, answer, _, _ in pbar:
        image = image.to(device)
        question = question.to(device)
        answer = answer.to(device)

        net.zero_grad()
        output = net(image, question, q_len)
        loss = criterion(output, answer)
        loss.backward()
        # Optional global-norm gradient clipping controlled by config.
        if cfg.SOLVER.GRAD_CLIP:
            nn.utils.clip_grad_norm_(net.parameters(), cfg.SOLVER.GRAD_CLIP)
        optimizer.step()

        batch_acc = (output.detach().argmax(1) == answer).float().mean().item()
        # Exponential moving average of per-batch accuracy for the progress bar.
        ema_acc = batch_acc if ema_acc == 0 else ema_acc * 0.99 + batch_acc * 0.01

        pbar.set_description(
            'Epoch: {}; Loss: {:.5f}; Acc: {:.5f}'.format(
                epoch, loss.item(), ema_acc
            )
        )
        accumulate(net_running, net)

    clevr.close()
def valid(epoch, dataset_type):
    """Evaluate `net` on the validation split of CLEVR or GQA.

    Args:
        epoch: zero-based epoch index; log file is named with epoch + 1.
        dataset_type: "CLEVR" selects the CLEVR dataset; anything else GQA.

    Returns:
        Tuple of (accumulated loss / dataset size, overall accuracy).
        (Return added for consistency with the sibling valid/train variants;
        previous callers ignoring the None return are unaffected.)

    Relies on module-level globals: BATCH_SIZE, DEVICE, net, criterion,
    collate_data.
    """
    if dataset_type == "CLEVR":
        dataset_object = CLEVR('data/CLEVR_v1.0', 'val', transform=None)
    else:
        dataset_object = GQA('data/gqa', 'val', transform=None)
    valid_set = DataLoader(dataset_object, batch_size=BATCH_SIZE,
                           num_workers=multiprocessing.cpu_count(),
                           collate_fn=collate_data)
    net.eval()

    correct_counts = 0
    total_counts = 0
    running_loss = 0.0
    with torch.no_grad():
        pbar = tqdm(iter(valid_set))
        for image, question, q_len, answer in pbar:
            image, question, answer = (
                image.to(DEVICE),
                question.to(DEVICE),
                answer.to(DEVICE),
            )
            output = net(image, question, q_len)
            loss = criterion(output, answer)

            correct = output.detach().argmax(1) == answer
            correct_counts += correct.sum().item()
            total_counts += image.size(0)
            running_loss += loss.item() / BATCH_SIZE
            pbar.set_description(
                'Epoch: {}; Loss: {:.8f}; Acc: {:.5f}'.format(
                    epoch + 1, loss.item(),
                    correct_counts / total_counts))

    with open('log/log_{}.txt'.format(str(epoch + 1).zfill(2)), 'w') as w:
        w.write('{:.5f}\n'.format(correct_counts / total_counts))

    # FIX: this is a validation pass — the message previously said
    # 'Training loss', which was misleading in the logs.
    print('Validation loss: {:8f}, accuracy: {:5f}'.format(
        running_loss / len(valid_set.dataset),
        correct_counts / total_counts))
    dataset_object.close()
    return (running_loss / len(valid_set.dataset),
            correct_counts / total_counts)