Example #1
def main(num_epochs, batch_size):
    torch.device(device)

    dataset = get_audio_video_dataset(
        data_directory, max_length_in_seconds=1, pad_and_truncate=True
    )

    # dataset[0][0] holds the (audio, video) pair of the first sample,
    # used below to infer the model's input sizes
    eg_data = dataset[0][0]
    train_dataset = Dataset(dataset, True)
    test_dataset = Dataset(dataset)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, num_workers=hparams.num_workers, pin_memory=False
    )
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset, batch_size=batch_size, shuffle=True, num_workers=hparams.num_workers, pin_memory=False
    )
    train_dataloader_len = len(train_dataloader)
    model = Model(audio_size=eg_data[0].size(), video_size=eg_data[1].size(), loss_type='bce')
    model = model.to(device)
    if hparams.model == 'video_transformer':
        checkpt = torch.load("/work/sbali/VideoSound-Matching/audio_classification/model_state/bce_video_transformer.pt")
        model.load_state_dict(checkpt)
    loss_fn = VideoMatchingLoss().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    with open(f'nk_results_bce2_{hparams.model}.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["epoch", "train loss", "train accuracy", "test loss", "test accuracy"])
        test_loss = 0
        test_correct = 0
        if True:
            with torch.no_grad():
                for sample_idx, (audio1, audio2, video, target) in tqdm(enumerate(test_dataloader)):
                    b = audio1.shape[0]
                    audio1, audio2, video, target = audio1.to(device), audio2, video.to(device), target.to(device)
                    audio1_enc, video_enc = model(audio1, video)
                    loss, pred = loss_fn(audio1_enc, video_enc, target)
                    test_loss += b * loss.mean().item()
                    predicted = (pred >= 0.5) * torch.ones(pred.shape).to(device)
                    test_correct += (predicted == target).sum().item()
                    print(test_correct)

                print(f"Evaluation loss: {test_loss / test_dataset.__len__()}")
                print(f"Evaluation accuracy: {100 * test_correct / test_dataset.__len__()}")
            
            writer.writerow([epoch, (train_loss / train_dataset.__len__()), (100 * train_correct / train_dataset.__len__()),
                                    (test_loss / test_dataset.__len__()), (100 * test_correct / test_dataset.__len__())])
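VideoMatchingLoss is imported from elsewhere in the project, so its 'bce' formulation is not shown in this example. Given that the loop above thresholds pred at 0.5 and compares it against a binary target, a rough, assumed sketch of the kind of audio/video matching loss it could compute (cosine similarity squashed through a sigmoid, then binary cross-entropy) is the following; the function name and internals are illustrative only:

import torch
import torch.nn.functional as F

def bce_matching_loss_sketch(audio_enc, video_enc, target):
    # illustrative only: the real VideoMatchingLoss may differ
    sim = F.cosine_similarity(audio_enc, video_enc, dim=-1)   # similarity in [-1, 1]
    pred = torch.sigmoid(sim)                                 # squash to a (0, 1) match score
    loss = F.binary_cross_entropy(pred, target.float())       # binary "same source?" objective
    return loss, pred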
Example #2
def main():
    torch.device(device)
    dataset = get_audio_video_dataset(data_directory,
                                      max_length_in_seconds=1,
                                      pad_and_truncate=True)

    eg_data = dataset[0][0]
    test_dataset = Dataset(dataset)

    test_dataloader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=batch_size,
                                                  shuffle=True,
                                                  num_workers=1,
                                                  pin_memory=False)
    test_dataloader_len = len(test_dataloader)
    model = Model(audio_size=eg_data[0].size(),
                  video_size=eg_data[1].size(),
                  loss_type='bce')
    model = model.to(device)
    checkpt = torch.load(hparams.checkpoint)
    model.load_state_dict(checkpt)
    model = OneShotLearning(model).to(device)

    loss_fn = VideoMatchingLoss().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    with open(f'test_bce_{hparams.model}.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["test loss", "test accuracy"])
        test_loss = 0
        test_correct = 0
        with torch.no_grad():
            for sample_idx, (val,
                             comp_vals) in tqdm(enumerate(test_dataloader)):
                # val / comp_vals are used as loaded; move them to `device` here if they are plain tensors
                sim_vals = model(val, comp_vals)
                # count a match whenever the predicted similarity clears the 0.5 threshold
                res = (sim_vals >= 0.5).float()
                test_correct += res.sum().item()
                print(sim_vals, test_correct)
            print(f"Evaluation loss: {test_loss / test_dataset.__len__()}")
            print(
                f"Evaluation accuracy: {100 * test_correct / test_dataset.__len__()}"
            )

        writer.writerow([(test_loss / len(test_dataset)),
                         (100 * test_correct / len(test_dataset))])
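One practical note on the checkpoint load above: torch.load restores tensors onto the device they were saved from, so a checkpoint written on a GPU machine can fail to load on a CPU-only box. Passing map_location avoids that; a small variant of the same lines, using only names already defined in the example:

    # remap saved tensors onto whatever `device` this evaluation runs on
    checkpt = torch.load(hparams.checkpoint, map_location=device)
    model.load_state_dict(checkpt)
    model = OneShotLearning(model).to(device)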
Example #3
def main(num_epochs, batch_size):
    torch.device(device)

    dataset = get_audio_video_dataset(
        data_directory, max_length_in_seconds=1, pad_and_truncate=True
    )

    # dataset[0][0] holds the (audio, video) pair of the first sample,
    # used below to infer the model's input sizes
    eg_data = dataset[0][0]

    train_dataset = Dataset(dataset, True)
    test_dataset = Dataset(dataset)


    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, num_workers=1, pin_memory=False
    )
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset, batch_size=batch_size, shuffle=True, num_workers=1, pin_memory=False
    )
    train_dataloader_len = len(train_dataloader)
    model = Model(audio_size=eg_data[0].size(), video_size=eg_data[1].size(), loss_type='multi')
    model = model.to(device)
    '''
    if hparams.model == 'video_transformer':
        checkpt = torch.load("/work/sbali/VideoSound-Matching/audio_classification/model_state/mult_video_transformer.pt")
        model.load_state_dict(checkpt)
    elif hparams.model == 'video_cnn_lstm':
        checkpt = torch.load("/work/sbali/VideoSound-Matching/audio_classification/model_state/mult_video_cnn_lstm.pt")
        model.load_state_dict(checkpt)
    '''
    loss_fn = VideoMatchingLoss().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=hparams.lr)
    with open(f'2multi_{hparams.model}results.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["epoch", "train loss", "train accuracy", "test loss", "test accuracy"])
        
        for epoch in tqdm(range(num_epochs)):
            model.train()
            train_loss = 0
            train_correct = 0
            for sample_idx, (audio, video, target) in tqdm(enumerate(train_dataloader)):
                b = audio[0].shape[0]
                optimizer.zero_grad()
                audio[0], audio[1] = audio[0].to(device), audio[1].to(device)
                video[0], video[1] = video[0].to(device), video[1].to(device)
                target = target.to(device)
                pred, audio1_enc, audio2_enc, video1_enc, video2_enc = model(audio, video)
                #pred, audio1_enc, audio2_enc, video1_enc, video2_enc, label
                loss, predicted = loss_fn(pred, audio1_enc, audio2_enc, video1_enc, video2_enc, target)
                loss.backward()
                optimizer.step()
                train_loss += b * loss.mean().item()
                #pred = pred.cpu()
                
                predicted = (predicted >= 0.5).float()
                
                train_correct += (predicted == target).sum().item()
                print(
                    f"{epoch:06d}-[{sample_idx + 1}/{train_dataloader_len}]: {loss.mean().item()} : {train_correct}", flush=True
                )

            print(f"Train loss: {train_loss / train_dataset.__len__()}")
            print(f"Train accuracy: {100 * train_correct / train_dataset.__len__()}")

            # Save the model after every epoch (in case training stops before num_epochs)
            torch.save(model.state_dict(), f"/work/sbali/VideoSound-Matching/audio_classification/model_state/3mult_{hparams.model}.pt")

            total_length = len(test_dataset)

            model.eval()

            test_loss = 0
            test_correct = 0
            with torch.no_grad():
                for sample_idx, (audio, video, target) in tqdm(enumerate(test_dataloader)):
                    b = audio[0].shape[0]
                    audio[0], audio[1] = audio[0].to(device), audio[1].to(device)
                    video[0], video[1] = video[0].to(device), video[1].to(device)
                    target = target.to(device)
                    pred, audio1_enc, audio2_enc, video1_enc, video2_enc = model(audio, video)
                    #pred, audio1_enc, audio2_enc, video1_enc, video2_enc, label
                    loss, predicted = loss_fn(pred, audio1_enc, audio2_enc, video1_enc, video2_enc, target)
                    test_loss += b * loss.mean().item()
                    print(predicted.shape, len(test_dataset))
                    predicted = (predicted >= 0.5).float()

                    test_correct += (predicted == target).sum().item()

                print(f"Evaluation loss: {test_loss / test_dataset.__len__()}")
                print(f"Evaluation accuracy: {100 * test_correct / test_dataset.__len__()}")
            
            writer.writerow([epoch, (train_loss / len(train_dataset)), (100 * train_correct / len(train_dataset)),
                                    (test_loss / len(test_dataset)), (100 * test_correct / len(test_dataset))])
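The training loop above saves only model.state_dict() after each epoch. If a run may need to be resumed rather than just evaluated, a common PyTorch pattern is to bundle the optimizer state and epoch counter into the same file; a sketch using the names from this example (the file name is illustrative):

            # save a resumable checkpoint instead of the bare state_dict
            checkpoint = {
                "epoch": epoch,
                "model_state": model.state_dict(),
                "optimizer_state": optimizer.state_dict(),
            }
            torch.save(checkpoint, f"model_state/3mult_{hparams.model}_resume.pt")

            # ...and restore it later before continuing training
            checkpoint = torch.load(f"model_state/3mult_{hparams.model}_resume.pt", map_location=device)
            model.load_state_dict(checkpoint["model_state"])
            optimizer.load_state_dict(checkpoint["optimizer_state"])
            start_epoch = checkpoint["epoch"] + 1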
Example #4
def main(num_epochs, batch_size):
    torch.device(device)

    dataset = get_audio_video_dataset(data_directory,
                                      max_length_in_seconds=1,
                                      pad_and_truncate=True)

    # dataset[0][0] holds the (audio, video) pair of the first sample,
    # used below to infer the model's input sizes
    eg_data = dataset[0][0]
    '''
    dataset = Dataset(dataset)
    dataset_len = len(dataset)
    train_len = round(dataset_len * 0.8)
    test_len = dataset_len - train_len
    '''
    train_dataset = Dataset(dataset, True)
    test_dataset = Dataset(dataset)

    train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=True,
                                                   num_workers=1,
                                                   pin_memory=False)
    test_dataloader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=batch_size,
                                                  shuffle=True,
                                                  num_workers=1,
                                                  pin_memory=False)
    train_dataloader_len = len(train_dataloader)
    model = Model(audio_size=eg_data[0].size(),
                  video_size=eg_data[1].size(),
                  loss_type='triplet')
    model = model.to(device)
    loss_fn = VideoMatchingLoss().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    with open('results.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([
            "epoch", "train loss", "train accuracy", "test loss",
            "test accuracy"
        ])

        for epoch in tqdm(range(num_epochs)):
            model.train()
            train_loss = 0
            train_correct = 0
            for sample_idx, (audio,
                             video) in tqdm(enumerate(train_dataloader)):
                b = audio[0].shape[0]
                optimizer.zero_grad()
                audio[0], audio[1] = audio[0].to(device), audio[1].to(device)
                video = video.to(device)
                audio1_enc, audio2_enc, video1_enc = model(audio, video)
                loss, predicted = loss_fn(audio1_enc, audio2_enc, video1_enc)
                loss.backward()
                optimizer.step()
                train_loss += b * loss.mean().item()
                #pred = pred.cpu()
                #torch.argmin(pred, dim=1)
                train_correct += (predicted == 1).sum().item()
                print(
                    f"{epoch:06d}-[{sample_idx + 1}/{train_dataloader_len}]: {loss.mean().item()} : {train_correct}",
                    flush=True)

            print(f"Train loss: {train_loss / train_dataset.__len__()}")
            print(
                f"Train accuracy: {100 * train_correct / train_dataset.__len__()}"
            )

            # Save the model after every epoch (in case training stops before num_epochs)
            torch.save(model.state_dict(),
                       f"model_state/triplet_{hparams.model}.pt")

            total_length = len(test_dataset)

            model.eval()

            test_loss = 0
            test_correct = 0
            with torch.no_grad():
                for sample_idx, (audio,
                                 video) in tqdm(enumerate(test_dataloader)):
                    b = audio[0].shape[0]
                    audio[0], audio[1] = audio[0].to(device), audio[1].to(
                        device)
                    video = video.to(device)
                    audio1_enc, audio2_enc, video1_enc = model(audio, video)
                    loss, predicted = loss_fn(audio1_enc, audio2_enc,
                                              video1_enc)
                    test_loss += b * loss.mean().item()
                    test_correct += (predicted).sum().item()

                print(f"Evaluation loss: {test_loss / test_dataset.__len__()}")
                print(
                    f"Evaluation accuracy: {100 * test_correct / test_dataset.__len__()}"
                )

            writer.writerow([
                epoch, (train_loss / len(train_dataset)),
                (100 * train_correct / len(train_dataset)),
                (test_loss / len(test_dataset)),
                (100 * test_correct / len(test_dataset))
            ])
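As in the other examples, VideoMatchingLoss is defined elsewhere, so its triplet formulation is not shown here. Purely as an assumed illustration of what the three encodings could feed, treating video1_enc as the anchor, audio1_enc as the matching clip and audio2_enc as the mismatched one (the function and margin value are hypothetical):

import torch.nn.functional as F

def triplet_loss_sketch(audio1_enc, audio2_enc, video1_enc, margin=0.2):
    # hinge loss on the gap between positive and negative distances
    loss = F.triplet_margin_loss(video1_enc, audio1_enc, audio2_enc, margin=margin)
    # call a sample "correct" when the true audio sits closer to the video anchor
    d_pos = F.pairwise_distance(video1_enc, audio1_enc)
    d_neg = F.pairwise_distance(video1_enc, audio2_enc)
    predicted = (d_pos < d_neg).float()
    return loss, predicted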
Example #5
def train(num_epochs, batch_size):
    torch.device(device)

    # Load and set up data
    dataset = get_audio_video_dataset(
        data_directory, max_length_in_seconds=1, pad_and_truncate=True
    )

    audio_size = dataset[0][0][0].size()
    video_size = dataset[0][0][1].size()
    train_dataset = Dataset(dataset, True)
    val_dataset = Dataset(dataset)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, num_workers=hparams.num_workers, pin_memory=False
    )
    val_dataloader = torch.utils.data.DataLoader(
        val_dataset, batch_size=batch_size, shuffle=True, num_workers=hparams.num_workers, pin_memory=False
    )
    train_dataloader_len = len(train_dataloader)
    
    # Set up training structures
    model = Model(audio_size=audio_size, video_size=video_size, loss_type=hparams.loss).to(device)
    loss_fn = Loss().to(device)
    if hparams.optimizer == "SGD":
        optimizer = torch.optim.SGD(model.parameters(), lr=hparams.lr, momentum=hparams.momentum)
    else:  # Adam is the default optimizer
        optimizer = torch.optim.Adam(model.parameters(), lr=hparams.lr)

    with open(f'{hparams.model}_{hparams.loss}.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(["epoch", "train loss", "train accuracy", "test loss", "test accuracy"])
        
        # Start training
        for epoch in tqdm(range(num_epochs)):
            # Run model on training data
            model.train()
            train_loss = 0
            train_correct = 0
            for sample_idx, (audio1, audio2, video1, video2, target) in tqdm(enumerate(train_dataloader)):
                b = audio1.shape[0]
                optimizer.zero_grad()
                if hparams.loss == "multi":
                    audio1, audio2 = audio1.to(device), audio2.to(device)
                    video1, video2 = video1.to(device), video2.to(device)
                    target = target.to(device)
                elif hparams.loss == "triplet":
                    audio1, audio2, video1, target = audio1.to(device), audio2.to(device), video1.to(device), target.to(device)
                else:
                    audio1, video2, target = audio1.to(device), video2.to(device), target.to(device)
                    
                
                # Run and update model
                if hparams.loss == "bce":
                    audio_enc, video_enc = model(audio1, video2)
                    loss, pred = loss_fn(audio_enc, video_enc, target)
                elif hparams.loss == "cos":
                    pred, enc = model(audio1, video2)
                    loss, pred = loss_fn(pred, enc, target)
                elif hparams.loss == "multi":
                    pred, audio1_enc, audio2_enc, video1_enc, video2_enc = model((audio1, audio2), (video1, video2))
                    loss, pred = loss_fn(pred, audio1_enc, audio2_enc, video1_enc, video2_enc, target)
                elif hparams.loss == "triplet":
                    audio1_enc, audio2_enc, video1_enc = model((audio1, audio2), video1)
                    loss, predicted = loss_fn(audio1_enc, audio2_enc, video1_enc)

                loss.backward()
                optimizer.step()
                
                # Update loss and accuracy
                train_loss += b * loss.mean().item()
                if hparams.loss == "triplet":
                    # the triplet branch has no pred/target; count correct samples as in the triplet example above
                    train_correct += (predicted == 1).sum().item()
                else:
                    predicted = (pred >= 0.5).float()
                    train_correct += (predicted == target).sum().item()
                print(
                    f"{epoch:06d}-[{sample_idx + 1}/{train_dataloader_len}]: {loss.mean().item()} : {train_correct}", flush=True
                )

            print(f"Train loss: {train_loss / train_dataset.__len__()}")
            print(f"Train accuracy: {100 * train_correct / train_dataset.__len__()}")

            # Save the model after every epoch
            torch.save(model.state_dict(), f"model_state/{hparams.model}_{hparams.loss}_{epoch}.pt")

            # Run model on validation data
            model.eval()
            val_loss = 0
            val_correct = 0
            with torch.no_grad():
                for sample_idx, (audio1, audio2, video1, video2, target) in tqdm(enumerate(val_dataloader)):
                    b = audio1.shape[0]
                    if hparams.loss == "multi":
                        audio1, audio2 = audio1.to(device), audio2.to(device)
                        video1, video2 = video1.to(device), video2.to(device)
                        target = target.to(device)
                    elif hparams.loss == "triplet":
                        audio1, audio2, video1, target = audio1.to(device), audio2.to(device), video1.to(device), target.to(device)
                    else:
                        audio1, video2, target = audio1.to(device), video2.to(device), target.to(device)
                        
                    
                    # Run model
                    if hparams.loss == "bce":
                        audio_enc, video_enc = model(audio1, video2)
                        loss, pred = loss_fn(audio_enc, video_enc, target)
                    elif hparams.loss == "cos":
                        pred, enc = model(audio1, video2)
                        loss, pred = loss_fn(pred, enc, target)
                    elif hparams.loss == "multi":
                        pred, audio1_enc, audio2_enc, video1_enc, video2_enc = model((audio1, audio2), (video1, video2))
                        loss, pred = loss_fn(pred, audio1_enc, audio2_enc, video1_enc, video2_enc, target)
                    elif hparams.loss == "triplet":
                        audio1_enc, audio2_enc, video1_enc = model((audio1, audio2), video1)
                        loss, predicted = loss_fn(audio1_enc, audio2_enc, video1_enc)
                    
                    # Update loss and accuracy
                    val_loss += b * loss.mean().item()
                    if hparams.loss == "triplet":
                        val_correct += (predicted == 1).sum().item()
                    else:
                        predicted = (pred >= 0.5).float()
                        val_correct += (predicted == target).sum().item()

                print(f"Evaluation loss: {val_loss / val_dataset.__len__()}")
                print(f"Evaluation accuracy: {100 * val_correct / val_dataset.__len__()}")
            
            # Save this epoch's results
            writer.writerow([epoch, (train_loss / len(train_dataset)), (100 * train_correct / len(train_dataset)),
                                    (val_loss / len(val_dataset)), (100 * val_correct / len(val_dataset))])
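This unified train() reads several fields off a global hparams object (model, loss, optimizer, lr, momentum, num_workers) along with the device and data_directory globals, none of which are shown in the listing. A minimal, assumed scaffolding for invoking it, with placeholder defaults rather than the project's real ones:

import argparse
import torch

# assumed globals; the real project defines these elsewhere
device = "cuda" if torch.cuda.is_available() else "cpu"
data_directory = "data/"  # placeholder path

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", default="video_transformer")
    parser.add_argument("--loss", default="bce", choices=["bce", "cos", "multi", "triplet"])
    parser.add_argument("--optimizer", default="Adam", choices=["Adam", "SGD"])
    parser.add_argument("--lr", type=float, default=1e-4)
    parser.add_argument("--momentum", type=float, default=0.9)
    parser.add_argument("--num_workers", type=int, default=1)
    parser.add_argument("--num_epochs", type=int, default=10)
    parser.add_argument("--batch_size", type=int, default=16)
    hparams = parser.parse_args()
    train(hparams.num_epochs, hparams.batch_size)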