def __init__(self, num_classes=5, pretrained=True):
    super().__init__(num_classes=2, sequence_length=8, contains_dropout=False)

    self.r2plus1 = r2plus1d_18(pretrained=pretrained)
    self.r2plus1.fc = nn.Identity()
    self._set_requires_grad_for_module(self.r2plus1, requires_grad=False)

    self.sync_net = PretrainedSyncNet()
    # note: sync_net is deliberately left trainable here (freeze call disabled)

    self.relu = nn.ReLU()
    self.out = nn.Sequential(
        nn.Linear(512 + 1024, 50), nn.ReLU(), nn.Linear(50, self.num_classes)
    )

    self._init = False
class NoisySyncAudioNet(BinaryEvaluationMixin, SequenceClassificationModel):
    def __init__(self, num_classes, pretrained=True):
        super().__init__(num_classes=2, sequence_length=8, contains_dropout=False)

        self.r2plus1 = r2plus1d_18(pretrained=pretrained)
        self.r2plus1.layer2 = nn.Identity()
        self.r2plus1.layer3 = nn.Identity()
        self.r2plus1.layer4 = nn.Identity()
        self.r2plus1.fc = nn.Identity()

        self.sync_net = PretrainedSyncNet()
        # note: sync_net is deliberately left trainable here (freeze call disabled)

        self.relu = nn.ReLU()
        self.out = nn.Sequential(
            nn.Linear(64 + 1024, 50), nn.ReLU(), nn.Linear(50, self.num_classes)
        )

        self._init = False

    def forward(self, x):
        video, audio = x  # bs x 8 x 3 x 112 x 112, bs x 8 x 4 x 13 (MFCC)

        video = video.transpose(1, 2)  # channels first for the 3D CNN
        video = self.r2plus1(video)

        # syncnet only uses 5 frames
        audio = audio[:, 2:-1]
        audio = audio.reshape((audio.shape[0], -1, 13)).unsqueeze(1).transpose(-2, -1)
        audio = self.sync_net.audio_extractor(audio)

        flat = torch.cat((video, audio), dim=1)
        out = self.out(self.relu(flat))
        return out

    def training_step(self, batch, batch_nb, system):
        x, (target, aud_noisy) = batch
        # train on the audio-noise label instead of the manipulation label
        return super().training_step((x, aud_noisy), batch_nb, system)

    def aggregate_outputs(self, outputs, system):
        if not self._init:
            self._init = True
            system.file_list.class_to_idx = {"fake": 0, "youtube": 1}
            system.file_list.classes = ["fake", "youtube"]
        for x in outputs:
            x["target"] = x["target"][1]
        return super().aggregate_outputs(outputs, system)
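# Added sketch (not in the original source): a quick shape smoke test for
# NoisySyncAudioNet. The helper name is hypothetical; it assumes the input
# layout implied by forward() above and that the pretrained SyncNet weights
# are available locally.
def _smoke_test_noisy_sync_audio_net():
    model = NoisySyncAudioNet(num_classes=2, pretrained=False).eval()
    video = torch.randn(4, 8, 3, 112, 112)  # bs x frames x C x H x W
    audio = torch.randn(4, 8, 4, 13)        # bs x frames x MFCC windows x 13
    with torch.no_grad():
        logits = model((video, audio))
    assert logits.shape == (4, 2)  # binary fake/youtube head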
class SyncAudioNet(SequenceClassificationModel):
    def __init__(self, num_classes=5, pretrained=True):
        super().__init__(
            num_classes=num_classes, sequence_length=8, contains_dropout=False
        )

        self.r2plus1 = torch.hub.load(
            "moabitcoin/ig65m-pytorch",
            "r2plus1d_34_8_kinetics",
            num_classes=400,
            pretrained=pretrained,
        )
        self.r2plus1.layer3 = nn.Identity()
        self.r2plus1.layer4 = nn.Identity()
        self.r2plus1.fc = nn.Identity()

        self.sync_net = PretrainedSyncNet()
        self._set_requires_grad_for_module(self.sync_net, requires_grad=False)

        self.relu = nn.ReLU()
        self.out = nn.Sequential(
            nn.Linear(128 + 1024, 50), nn.ReLU(), nn.Linear(50, self.num_classes)
        )

    def forward(self, x):
        video, audio = x  # bs x 8 x 3 x 112 x 112, bs x 8 x 4 x 13 (MFCC)

        video = video.transpose(1, 2)
        video = self.r2plus1(video)

        # syncnet only uses 5 frames
        audio = audio[:, 2:-1]
        audio = audio.reshape((audio.shape[0], -1, 13)).unsqueeze(1).transpose(-2, -1)
        audio = self.sync_net.audio_extractor(audio)

        flat = torch.cat((video, audio), dim=1)
        out = self.out(self.relu(flat))
        return out

    def training_step(self, batch, batch_nb, system):
        x, (target, _) = batch
        return super().training_step((x, target), batch_nb, system)

    def aggregate_outputs(self, outputs, system):
        for x in outputs:
            x["target"] = x["target"][0]
        return super().aggregate_outputs(outputs, system)
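# Added walk-through of the audio reshaping used in the forward passes above:
# the 8-frame MFCC block is cropped to the 5 frames SyncNet consumes, then
# rearranged into the bs x 1 x 13 x 20 layout its audio extractor expects.
def _audio_reshape_walkthrough():
    audio = torch.randn(2, 8, 4, 13)                 # bs x 8 x 4 x 13
    audio = audio[:, 2:-1]                           # bs x 5 x 4 x 13
    audio = audio.reshape((audio.shape[0], -1, 13))  # bs x 20 x 13
    audio = audio.unsqueeze(1).transpose(-2, -1)     # bs x 1 x 13 x 20
    assert audio.shape == (2, 1, 13, 20)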
class SmallEmbeddingSpace(BinaryEvaluationMixin, SequenceClassificationModel):
    def __init__(self, num_classes, sequence_length=8, pretrained=True):
        super().__init__(
            num_classes=2, sequence_length=sequence_length, contains_dropout=False
        )
        self.r2plus1 = SmallVideoNetworkPooledEmbedding(pretrained=pretrained)

        self.sync_net = PretrainedSyncNet()
        self._set_requires_grad_for_module(self.sync_net, requires_grad=False)

        # max-pool the standard 1024-d audio_extractor output down to 8 values
        self.audio_pooling = nn.Sequential(
            SqueezeModule(1, squeeze=False, dim=1),
            nn.MaxPool1d(128, 128),
            SqueezeModule(1, squeeze=True, dim=1),
        )

        self.out = nn.Sequential(
            nn.Linear(16, 50), nn.BatchNorm1d(50), nn.LeakyReLU(0.2), nn.Linear(50, 2)
        )

    def forward(self, x):
        video, audio = x  # bs x 8 x 3 x 112 x 112, bs x 8 x 4 x 13

        # syncnet only uses 5 frames
        audio = audio[:, 2:-1]
        audio = audio.reshape((audio.shape[0], -1, 13)).unsqueeze(1).transpose(-2, -1)
        audio = self.sync_net.audio_extractor(audio)  # bs x 1024
        audio = self.audio_pooling(audio)  # bs x 8

        video = self.r2plus1(video)  # bs x 8

        embedding = torch.cat((video, audio), dim=1)
        return self.out(embedding)

    def training_step(self, batch, batch_nb, system):
        x, target = batch
        # collapse the 5-way label: manipulations (0-3) -> 0, youtube (4) -> 1
        return super().training_step((x, target[0] // 4), batch_nb, system)

    def aggregate_outputs(self, outputs, system):
        for output in outputs:
            output["target"] = output["target"][0] // 4
        return super().aggregate_outputs(outputs, system)
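# Added sketch of the audio_pooling stage in plain torch (SqueezeModule is
# project-specific; the assumption here is that it merely unsqueezes/squeezes
# dim 1 around the 1-d max pool):
def _audio_pooling_sketch():
    emb = torch.randn(2, 1024)                         # audio_extractor output
    pooled = nn.MaxPool1d(128, 128)(emb.unsqueeze(1))  # bs x 1 x 8
    pooled = pooled.squeeze(1)                         # bs x 8: 1024 / 128 = 8
    assert pooled.shape == (2, 8)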
def __init__(self, num_classes=5, pretrained=True):
    super().__init__(
        num_classes=num_classes, sequence_length=8, contains_dropout=False
    )
    self.r2plus1 = r2plus1d_18(pretrained=pretrained)
    self.r2plus1.layer3 = nn.Identity()
    self.r2plus1.layer4 = nn.Identity()
    self.r2plus1.fc = nn.Identity()

    self.sync_net = PretrainedSyncNet()
    self._set_requires_grad_for_module(self.sync_net, requires_grad=False)

    self.relu = nn.ReLU()
    self.padding = nn.ReflectionPad2d((0, 1, 0, 0))
    self.upsample = nn.Upsample(size=(8, 56, 56))
    self.merge_conv: nn.Module = nn.Sequential(
        Conv2Plus1D(128, 64, 144, 1), nn.BatchNorm3d(64), nn.ReLU(inplace=True)
    )
    self.out = nn.Sequential(
        nn.Linear(128, 50), nn.ReLU(), nn.Linear(50, self.num_classes)
    )
def __init__(self, num_classes=5, sequence_length=8, pretrained=True):
    super().__init__(
        num_classes=num_classes,
        sequence_length=sequence_length,
        contains_dropout=False,
    )
    self.r2plus1 = r2plus1d_18(pretrained=pretrained)
    self.r2plus1.layer2 = nn.Identity()
    self.r2plus1.layer3 = nn.Identity()
    self.r2plus1.layer4 = nn.Identity()
    self.r2plus1.fc = nn.Identity()

    # project the 64-d video feature into the 1024-d audio embedding space
    self.video_mlp = nn.Sequential(
        nn.Linear(64, 512), nn.BatchNorm1d(512), nn.ReLU(), nn.Linear(512, 1024)
    )

    self.sync_net = PretrainedSyncNet()
    self._set_requires_grad_for_module(self.sync_net, requires_grad=False)
    self.audio_extractor = self.sync_net.audio_extractor

    self.c_loss = ContrastiveLoss(20)
    self.log_class_loss = False
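# Added illustration: a minimal margin-based contrastive loss of the kind
# ContrastiveLoss(20) above presumably implements (Hadsell-style), pulling
# in-sync video/audio embedding pairs together and pushing shuffled pairs
# apart; the project's own class may differ in signature and details.
def _contrastive_loss_sketch(video_emb, audio_emb, same, margin=20.0):
    # same: 1 for in-sync video/audio pairs, 0 for shuffled pairs
    dist = F.pairwise_distance(video_emb, audio_emb)
    pos = same * dist.pow(2)                                     # pull matches together
    neg = (1 - same) * torch.clamp(margin - dist, min=0).pow(2)  # push mismatches past the margin
    return (pos + neg).mean()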
# flake8: noqa
#%%
from importlib import reload

import forgery_detection.data.file_lists
from forgery_detection.data import set

reload(set)
from forgery_detection.data.utils import resized_crop
from forgery_detection.models.audio.similarity_stuff import (
    PretrainedSimilarityNet,
    PretrainedSyncNet,
)

p = PretrainedSyncNet().eval()  # .to("cuda:2")
p._shuffle_audio = lambda x: x

# f = FileList.load("/data/ssd1/file_lists/c40/tracked_resampled_faces.json")
f = forgery_detection.data.file_lists.FileList.load(
    "/data/ssd1/file_lists/c40/tracked_resampled_faces_yt_only_112_16_sequence_length.json"
)

#%%
import torch
from forgery_detection.data import loading

reload(loading)
from torchvision import transforms

d = f.get_dataset(
    "test",
    sequence_length=5,
class SyncAudioNetRegularized(SequenceClassificationModel):
    def __init__(self, num_classes=5, pretrained=True):
        super().__init__(
            num_classes=num_classes, sequence_length=8, contains_dropout=False
        )
        self.r2plus1 = r2plus1d_18(pretrained=pretrained)
        self.r2plus1.layer2 = nn.Identity()
        self.r2plus1.layer3 = nn.Identity()
        self.r2plus1.layer4 = nn.Identity()
        self.r2plus1.fc = nn.Identity()

        self.sync_net = PretrainedSyncNet()
        self._set_requires_grad_for_module(self.sync_net, requires_grad=False)

        self.relu = nn.ReLU()
        self.out = nn.Sequential(
            nn.Linear(64 + 1024, 50), nn.ReLU(), nn.Linear(50, self.num_classes)
        )

    def forward(self, x):
        video, audio = x  # bs x 8 x 3 x 112 x 112, bs x 8 x 4 x 13 (MFCC)

        video = video.transpose(1, 2)
        video = self.r2plus1(video)

        # syncnet only uses 5 frames
        audio = audio[:, 2:-1]
        audio = audio.reshape((audio.shape[0], -1, 13)).unsqueeze(1).transpose(-2, -1)
        audio = self.sync_net.audio_extractor(audio)

        flat = torch.cat((video, audio), dim=1)
        out = self.out(self.relu(flat))
        return out, (video, audio)

    def weight_loss(self):
        # regularize the first linear layer so the video (first 64) and audio
        # (remaining 1024) columns keep a similar weight spread
        vid_weights = self.out[0].weight[:, :64].std()
        aud_weights = self.out[0].weight[:, 64:].std()
        return torch.norm(vid_weights - aud_weights, 2) * 1e3

    def training_step(self, batch, batch_nb, system):
        x, (target, _) = batch
        pred, embeddings = self.forward(x)

        classification_loss = self.loss(pred, target)
        weight_loss = self.weight_loss()
        lightning_log = {"loss": classification_loss + weight_loss}

        with torch.no_grad():
            train_acc = self.calculate_accuracy(pred, target)
            tensorboard_log = {
                "loss": {"train": classification_loss + weight_loss},
                "classification_loss": classification_loss,
                "weight_loss": weight_loss,
                "acc": {"train": train_acc},
                "vid_std": torch.std(self.out[0].weight[:, :64]),
                "aud_std": torch.std(self.out[0].weight[:, 64:]),
            }
        return tensorboard_log, lightning_log

    def aggregate_outputs(self, outputs, system):
        if len(system.val_dataloader()) > 1:
            outputs = outputs[0]

        with torch.no_grad():
            pred = torch.cat([x["pred"][0] for x in outputs], 0)
            target = torch.cat([x["target"][0] for x in outputs], 0)

            loss_mean_classification = self.loss(pred, target)
            pred = pred.cpu()
            target = target.cpu()
            pred = F.softmax(pred, dim=1)
            acc_mean = self.calculate_accuracy(pred, target)

            # confusion matrix
            class_accuracies = system.log_confusion_matrix(target, pred)

            weight_loss = self.weight_loss()
            tensorboard_log = {
                "loss": loss_mean_classification + weight_loss,
                "acc": acc_mean,
                "class_acc": class_accuracies,
                "classification_loss": loss_mean_classification,
                "weight_loss": weight_loss,
                "vid_std": torch.std(self.out[0].weight[:, :64]),
                "aud_std": torch.std(self.out[0].weight[:, 64:]),
            }

        self.log_class_loss = True
        return tensorboard_log, {}
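# Added standalone check of the weight_loss regularizer above: on a freshly
# initialized head the video (first 64) and audio (last 1024) column stds are
# nearly equal, so the penalty starts small and only grows if one modality's
# weights come to dominate the classifier.
def _weight_loss_check():
    layer = nn.Linear(64 + 1024, 50)
    vid_std = layer.weight[:, :64].std()
    aud_std = layer.weight[:, 64:].std()
    penalty = torch.norm(vid_std - aud_std, 2) * 1e3
    print(float(penalty))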