import csv

import torch
from tqdm import tqdm

# Project-local names assumed to be defined/importable at module level
# throughout this section: get_audio_video_dataset, Dataset, Model,
# OneShotLearning, VideoMatchingLoss, Loss, hparams, device, data_directory.


def main(batch_size):
    dataset = get_audio_video_dataset(
        data_directory, max_length_in_seconds=1, pad_and_truncate=True
    )
    # dataset[0][0] is an example (audio, video) pair, used to size the model.
    eg_data = dataset[0][0]
    test_dataset = Dataset(dataset)
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset, batch_size=batch_size, shuffle=True,
        num_workers=hparams.num_workers, pin_memory=False
    )

    model = Model(audio_size=eg_data[0].size(), video_size=eg_data[1].size(),
                  loss_type='bce')
    model = model.to(device)
    if hparams.model == 'video_transformer':
        checkpt = torch.load("/work/sbali/VideoSound-Matching/audio_classification/model_state/bce_video_transformer.pt")
        model.load_state_dict(checkpt)
    loss_fn = VideoMatchingLoss().to(device)

    # Evaluation only: score the loaded checkpoint on the test split.
    with open(f'nk_results_bce2_{hparams.model}.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["test loss", "test accuracy"])
        model.eval()
        test_loss = 0
        test_correct = 0
        with torch.no_grad():
            for sample_idx, (audio1, audio2, video, target) in tqdm(enumerate(test_dataloader)):
                b = audio1.shape[0]
                # audio2 (the non-matching audio) is unused in the BCE path.
                audio1, video, target = audio1.to(device), video.to(device), target.to(device)
                audio1_enc, video_enc = model(audio1, video)
                loss, pred = loss_fn(audio1_enc, video_enc, target)
                test_loss += b * loss.mean().item()
                predicted = (pred >= 0.5).float()
                test_correct += (predicted == target).sum().item()
                print(test_correct)
        print(f"Evaluation loss: {test_loss / len(test_dataset)}")
        print(f"Evaluation accuracy: {100 * test_correct / len(test_dataset)}")
        writer.writerow([test_loss / len(test_dataset),
                         100 * test_correct / len(test_dataset)])
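# --- Sketch: assumed shared setup --------------------------------------------
# Every script in this section reads module-level names (hparams, device,
# data_directory) that are defined elsewhere in the project. The block below
# is only an illustration of what that setup might look like; every field and
# default here is an assumption except the hparams attributes the scripts
# actually read (model, loss, optimizer, lr, momentum, num_workers, checkpoint).
import argparse

import torch

parser = argparse.ArgumentParser()
parser.add_argument("--model", default="video_transformer")
parser.add_argument("--loss", default="bce")
parser.add_argument("--optimizer", default="Adam")
parser.add_argument("--lr", type=float, default=1e-4)
parser.add_argument("--momentum", type=float, default=0.9)
parser.add_argument("--num_workers", type=int, default=1)
parser.add_argument("--num_epochs", type=int, default=10)
parser.add_argument("--batch_size", type=int, default=16)
parser.add_argument("--checkpoint", default=None)
hparams = parser.parse_args()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
data_directory = "./data"  # placeholder; the real path is project-specific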
def main(batch_size):
    dataset = get_audio_video_dataset(
        data_directory, max_length_in_seconds=1, pad_and_truncate=True
    )
    # dataset[0][0] is an example (audio, video) pair, used to size the model.
    eg_data = dataset[0][0]
    test_dataset = Dataset(dataset)
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset, batch_size=batch_size, shuffle=True,
        num_workers=1, pin_memory=False
    )

    model = Model(audio_size=eg_data[0].size(), video_size=eg_data[1].size(),
                  loss_type='bce')
    model = model.to(device)
    checkpt = torch.load(hparams.checkpoint)
    model.load_state_dict(checkpt)
    model = OneShotLearning(model).to(device)

    with open(f'test_bce_{hparams.model}.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["test loss", "test accuracy"])
        model.eval()
        # No loss is computed in this one-shot pass; the column is kept at 0
        # so the CSV schema matches the header.
        test_loss = 0
        test_correct = 0
        with torch.no_grad():
            for sample_idx, (val, comp_vals) in tqdm(enumerate(test_dataloader)):
                sim_vals = model(val, comp_vals)
                # Alternative decision rule (unused): torch.argmax(sim_vals)
                # A prediction counts as correct when the similarity to the
                # matching sample clears the 0.5 threshold.
                res = (sim_vals >= 0.5).float()
                test_correct += res.sum().item()
                print(sim_vals, test_correct)
        print(f"Evaluation loss: {test_loss / len(test_dataset)}")
        print(f"Evaluation accuracy: {100 * test_correct / len(test_dataset)}")
        writer.writerow([test_loss / len(test_dataset),
                         100 * test_correct / len(test_dataset)])
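# --- Sketch: a possible OneShotLearning wrapper ------------------------------
# The one-shot test above calls model(val, comp_vals) and thresholds the
# returned similarities. The real OneShotLearning class is project code; this
# hypothetical version only illustrates a shape that fits that call site:
# score the query against each candidate with the wrapped base model and
# return one cosine similarity per candidate.
import torch
import torch.nn as nn
import torch.nn.functional as F


class OneShotLearningSketch(nn.Module):
    def __init__(self, base_model):
        super().__init__()
        self.base = base_model

    def forward(self, val, comp_vals):
        # val: a query (audio, video) pair; comp_vals: candidate pairs.
        query_audio, _query_video = val
        sims = []
        for _cand_audio, cand_video in comp_vals:
            audio_enc, video_enc = self.base(query_audio, cand_video)
            sims.append(F.cosine_similarity(audio_enc, video_enc, dim=-1))
        return torch.stack(sims)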
def main(num_epochs, batch_size):
    dataset = get_audio_video_dataset(
        data_directory, max_length_in_seconds=1, pad_and_truncate=True
    )
    # dataset[0][0] is an example (audio, video) pair, used to size the model.
    eg_data = dataset[0][0]
    train_dataset = Dataset(dataset, True)
    test_dataset = Dataset(dataset)
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True,
        num_workers=1, pin_memory=False
    )
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset, batch_size=batch_size, shuffle=True,
        num_workers=1, pin_memory=False
    )
    train_dataloader_len = len(train_dataloader)

    model = Model(audio_size=eg_data[0].size(), video_size=eg_data[1].size(),
                  loss_type='multi')
    model = model.to(device)
    # Optionally resume from a checkpoint:
    # if hparams.model == 'video_transformer':
    #     checkpt = torch.load("/work/sbali/VideoSound-Matching/audio_classification/model_state/mult_video_transformer.pt")
    #     model.load_state_dict(checkpt)
    # elif hparams.model == 'video_cnn_lstm':
    #     checkpt = torch.load("/work/sbali/VideoSound-Matching/audio_classification/model_state/mult_video_cnn_lstm.pt")
    #     model.load_state_dict(checkpt)
    loss_fn = VideoMatchingLoss().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=hparams.lr)

    with open(f'2multi_{hparams.model}results.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["epoch", "train loss", "train accuracy",
                         "test loss", "test accuracy"])
        for epoch in tqdm(range(num_epochs)):
            # Run the model on the training data.
            model.train()
            train_loss = 0
            train_correct = 0
            for sample_idx, (audio, video, target) in tqdm(enumerate(train_dataloader)):
                b = audio[0].shape[0]
                optimizer.zero_grad()
                audio[0], audio[1] = audio[0].to(device), audio[1].to(device)
                video[0], video[1] = video[0].to(device), video[1].to(device)
                target = target.to(device)
                pred, audio1_enc, audio2_enc, video1_enc, video2_enc = model(audio, video)
                loss, predicted = loss_fn(pred, audio1_enc, audio2_enc,
                                          video1_enc, video2_enc, target)
                loss.backward()
                optimizer.step()
                train_loss += b * loss.mean().item()
                predicted = (predicted >= 0.5).float()
                train_correct += (predicted == target).sum().item()
                print(
                    f"{epoch:06d}-[{sample_idx + 1}/{train_dataloader_len}]: "
                    f"{loss.mean().item()} : {train_correct}",
                    flush=True
                )
            print(f"Train loss: {train_loss / len(train_dataset)}")
            print(f"Train accuracy: {100 * train_correct / len(train_dataset)}")

            # Save the model after every epoch (in case training ends early).
            torch.save(model.state_dict(),
                       f"/work/sbali/VideoSound-Matching/audio_classification/model_state/3mult_{hparams.model}.pt")

            # Run the model on the test data.
            model.eval()
            test_loss = 0
            test_correct = 0
            with torch.no_grad():
                for sample_idx, (audio, video, target) in tqdm(enumerate(test_dataloader)):
                    b = audio[0].shape[0]
                    audio[0], audio[1] = audio[0].to(device), audio[1].to(device)
                    video[0], video[1] = video[0].to(device), video[1].to(device)
                    target = target.to(device)
                    pred, audio1_enc, audio2_enc, video1_enc, video2_enc = model(audio, video)
                    loss, predicted = loss_fn(pred, audio1_enc, audio2_enc,
                                              video1_enc, video2_enc, target)
                    test_loss += b * loss.mean().item()
                    predicted = (predicted >= 0.5).float()
                    test_correct += (predicted == target).sum().item()
            print(f"Evaluation loss: {test_loss / len(test_dataset)}")
            print(f"Evaluation accuracy: {100 * test_correct / len(test_dataset)}")
            writer.writerow([epoch,
                             train_loss / len(train_dataset),
                             100 * train_correct / len(train_dataset),
                             test_loss / len(test_dataset),
                             100 * test_correct / len(test_dataset)])
def main(num_epochs, batch_size):
    dataset = get_audio_video_dataset(
        data_directory, max_length_in_seconds=1, pad_and_truncate=True
    )
    # dataset[0][0] is an example (audio, video) pair, used to size the model.
    eg_data = dataset[0][0]
    # An alternative 80/20 split, currently unused:
    # dataset = Dataset(dataset)
    # dataset_len = len(dataset)
    # train_len = round(dataset_len * 0.8)
    # test_len = dataset_len - train_len
    train_dataset = Dataset(dataset, True)
    test_dataset = Dataset(dataset)
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True,
        num_workers=1, pin_memory=False
    )
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset, batch_size=batch_size, shuffle=True,
        num_workers=1, pin_memory=False
    )
    train_dataloader_len = len(train_dataloader)

    model = Model(audio_size=eg_data[0].size(), video_size=eg_data[1].size(),
                  loss_type='triplet')
    model = model.to(device)
    loss_fn = VideoMatchingLoss().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    with open('results.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["epoch", "train loss", "train accuracy",
                         "test loss", "test accuracy"])
        for epoch in tqdm(range(num_epochs)):
            # Run the model on the training data.
            model.train()
            train_loss = 0
            train_correct = 0
            for sample_idx, (audio, video) in tqdm(enumerate(train_dataloader)):
                b = audio[0].shape[0]
                optimizer.zero_grad()
                audio[0], audio[1] = audio[0].to(device), audio[1].to(device)
                video = video.to(device)
                audio1_enc, audio2_enc, video1_enc = model(audio, video)
                loss, predicted = loss_fn(audio1_enc, audio2_enc, video1_enc)
                loss.backward()
                optimizer.step()
                train_loss += b * loss.mean().item()
                # predicted == 1 means the matching audio ranked closer to the
                # video than the non-matching audio.
                train_correct += (predicted == 1).sum().item()
                print(
                    f"{epoch:06d}-[{sample_idx + 1}/{train_dataloader_len}]: "
                    f"{loss.mean().item()} : {train_correct}",
                    flush=True
                )
            print(f"Train loss: {train_loss / len(train_dataset)}")
            print(f"Train accuracy: {100 * train_correct / len(train_dataset)}")

            # Save the model after every epoch (in case training ends early).
            torch.save(model.state_dict(), f"model_state/triplet_{hparams.model}.pt")

            # Run the model on the test data.
            model.eval()
            test_loss = 0
            test_correct = 0
            with torch.no_grad():
                for sample_idx, (audio, video) in tqdm(enumerate(test_dataloader)):
                    b = audio[0].shape[0]
                    audio[0], audio[1] = audio[0].to(device), audio[1].to(device)
                    video = video.to(device)
                    audio1_enc, audio2_enc, video1_enc = model(audio, video)
                    loss, predicted = loss_fn(audio1_enc, audio2_enc, video1_enc)
                    test_loss += b * loss.mean().item()
                    test_correct += (predicted == 1).sum().item()
            print(f"Evaluation loss: {test_loss / len(test_dataset)}")
            print(f"Evaluation accuracy: {100 * test_correct / len(test_dataset)}")
            writer.writerow([epoch,
                             train_loss / len(train_dataset),
                             100 * train_correct / len(train_dataset),
                             test_loss / len(test_dataset),
                             100 * test_correct / len(test_dataset)])
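# --- Sketch: a triplet criterion consistent with the calls above -------------
# VideoMatchingLoss is project code; this hypothetical stand-in shows the
# standard margin-based triplet loss that fits the (audio1_enc, audio2_enc,
# video1_enc) call signature, with the video as anchor, the matching audio as
# positive, and the non-matching audio as negative. "predicted" is 1 when the
# positive ranks closer than the negative, matching the (predicted == 1)
# accuracy count in the loops above. The margin value is an assumption.
import torch
import torch.nn.functional as F


def triplet_loss_sketch(audio_pos_enc, audio_neg_enc, video_enc, margin=0.2):
    d_pos = F.pairwise_distance(video_enc, audio_pos_enc)
    d_neg = F.pairwise_distance(video_enc, audio_neg_enc)
    loss = F.relu(d_pos - d_neg + margin).mean()
    predicted = (d_pos < d_neg).float()
    return loss, predicted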
def train(num_epochs, batch_size):
    # Load and set up the data.
    dataset = get_audio_video_dataset(
        data_directory, max_length_in_seconds=1, pad_and_truncate=True
    )
    audio_size = dataset[0][0][0].size()
    video_size = dataset[0][0][1].size()
    train_dataset = Dataset(dataset, True)
    val_dataset = Dataset(dataset)
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True,
        num_workers=hparams.num_workers, pin_memory=False
    )
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset, batch_size=batch_size, shuffle=True,
        num_workers=hparams.num_workers, pin_memory=False
    )
    train_dataloader_len = len(train_dataloader)

    # Set up the training structures.
    model = Model(audio_size=audio_size, video_size=video_size,
                  loss_type=hparams.loss).to(device)
    loss_fn = Loss().to(device)
    if hparams.optimizer == "SGD":
        optimizer = torch.optim.SGD(model.parameters(), lr=hparams.lr,
                                    momentum=hparams.momentum)
    else:  # Adam is the default optimizer
        optimizer = torch.optim.Adam(model.parameters(), lr=hparams.lr)

    with open(f'{hparams.model}_{hparams.loss}.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["epoch", "train loss", "train accuracy",
                         "test loss", "test accuracy"])

        # Start training.
        for epoch in tqdm(range(num_epochs)):
            # Run the model on the training data.
            model.train()
            train_loss = 0
            train_correct = 0
            for sample_idx, (audio1, audio2, video1, video2, target) in tqdm(enumerate(train_dataloader)):
                b = audio1.shape[0]
                optimizer.zero_grad()
                if hparams.loss == "multi":
                    audio1, audio2 = audio1.to(device), audio2.to(device)
                    video1, video2 = video1.to(device), video2.to(device)
                    target = target.to(device)
                elif hparams.loss == "triplet":
                    audio1, audio2, video1, target = (audio1.to(device), audio2.to(device),
                                                      video1.to(device), target.to(device))
                else:
                    audio1, video2, target = audio1.to(device), video2.to(device), target.to(device)

                # Run and update the model.
                if hparams.loss == "bce":
                    audio_enc, video_enc = model(audio1, video2)
                    loss, pred = loss_fn(audio_enc, video_enc, target)
                elif hparams.loss == "cos":
                    pred, enc = model(audio1, video2)
                    loss, pred = loss_fn(pred, enc, target)
                elif hparams.loss == "multi":
                    pred, audio1_enc, audio2_enc, video1_enc, video2_enc = model((audio1, audio2), (video1, video2))
                    loss, pred = loss_fn(pred, audio1_enc, audio2_enc,
                                         video1_enc, video2_enc, target)
                elif hparams.loss == "triplet":
                    audio1_enc, audio2_enc, video1_enc = model((audio1, audio2), video1)
                    loss, predicted = loss_fn(audio1_enc, audio2_enc, video1_enc)
                loss.backward()
                optimizer.step()

                # Update the running loss and accuracy. The triplet criterion
                # returns hard predictions directly; the other losses return a
                # score that is thresholded at 0.5.
                train_loss += b * loss.mean().item()
                if hparams.loss != "triplet":
                    predicted = (pred >= 0.5).float()
                train_correct += (predicted == target).sum().item()
                print(
                    f"{epoch:06d}-[{sample_idx + 1}/{train_dataloader_len}]: "
                    f"{loss.mean().item()} : {train_correct}",
                    flush=True
                )
            print(f"Train loss: {train_loss / len(train_dataset)}")
            print(f"Train accuracy: {100 * train_correct / len(train_dataset)}")

            # Save the model after every epoch.
            torch.save(model.state_dict(),
                       f"model_state/{hparams.model}_{hparams.loss}_{epoch}.pt")

            # Run the model on the validation data.
            model.eval()
            val_loss = 0
            val_correct = 0
            with torch.no_grad():
                for sample_idx, (audio1, audio2, video1, video2, target) in tqdm(enumerate(val_dataloader)):
                    b = audio1.shape[0]
                    if hparams.loss == "multi":
                        audio1, audio2 = audio1.to(device), audio2.to(device)
                        video1, video2 = video1.to(device), video2.to(device)
                        target = target.to(device)
                    elif hparams.loss == "triplet":
                        audio1, audio2, video1, target = (audio1.to(device), audio2.to(device),
                                                          video1.to(device), target.to(device))
                    else:
                        audio1, video2, target = audio1.to(device), video2.to(device), target.to(device)

                    # Run the model.
                    if hparams.loss == "bce":
                        audio_enc, video_enc = model(audio1, video2)
                        loss, pred = loss_fn(audio_enc, video_enc, target)
                    elif hparams.loss == "cos":
                        pred, enc = model(audio1, video2)
                        loss, pred = loss_fn(pred, enc, target)
                    elif hparams.loss == "multi":
                        pred, audio1_enc, audio2_enc, video1_enc, video2_enc = model((audio1, audio2), (video1, video2))
                        loss, pred = loss_fn(pred, audio1_enc, audio2_enc,
                                             video1_enc, video2_enc, target)
                    elif hparams.loss == "triplet":
                        audio1_enc, audio2_enc, video1_enc = model((audio1, audio2), video1)
                        loss, predicted = loss_fn(audio1_enc, audio2_enc, video1_enc)

                    # Update the running loss and accuracy.
                    val_loss += b * loss.mean().item()
                    if hparams.loss != "triplet":
                        predicted = (pred >= 0.5).float()
                    val_correct += (predicted == target).sum().item()
            print(f"Evaluation loss: {val_loss / len(val_dataset)}")
            print(f"Evaluation accuracy: {100 * val_correct / len(val_dataset)}")

            # Save this epoch's results.
            writer.writerow([epoch,
                             train_loss / len(train_dataset),
                             100 * train_correct / len(train_dataset),
                             val_loss / len(val_dataset),
                             100 * val_correct / len(val_dataset)])
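# --- Sketch: a possible entry point for the unified script -------------------
# How train() might be invoked, assuming hparams carries num_epochs and
# batch_size fields as in the setup sketch near the top of this section
# (the field names are assumptions; the real CLI may differ).
if __name__ == "__main__":
    train(num_epochs=hparams.num_epochs, batch_size=hparams.batch_size)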