def cost(self, speech, simembed):
    """Return the contrastive loss between encoded speech and ``simembed``.

    ``speech`` is run through ``self.SpeechEncoderBottom`` and then
    ``self.SpeechEncoderTop``. ``simembed`` is used exactly as passed in,
    with no encoder applied to it. The two are scored against each other
    with ``loss.cosine_matrix`` and the scores are handed to
    ``loss.contrastive`` with the margin read from
    ``self.config['margin_size']``.
    """
    bottom_out = self.SpeechEncoderBottom(speech)
    speech_repr = self.SpeechEncoderTop(bottom_out)
    # The embedding side is already in its final form; no encoder is applied.
    similarity = loss.cosine_matrix(speech_repr, simembed)
    return loss.contrastive(similarity, margin=self.config['margin_size'])
def cost(self, speech, text):
    """Return the contrastive loss between encoded speech and encoded text.

    ``speech`` goes through ``self.SpeechEncoderBottom`` followed by
    ``self.SpeechEncoderTop``; ``text`` goes through
    ``self.TextEncoderBottom`` followed by ``self.TextEncoderTop``. The two
    encodings are scored with ``loss.cosine_matrix`` and the scores are
    passed to ``loss.contrastive`` with the margin taken from
    ``self.config['margin_size']``.
    """
    # Speech is encoded before text, matching the original call order.
    speech_repr = self.SpeechEncoderTop(self.SpeechEncoderBottom(speech))
    text_repr = self.TextEncoderTop(self.TextEncoderBottom(text))
    similarity = loss.cosine_matrix(speech_repr, text_repr)
    return loss.contrastive(similarity, margin=self.config['margin_size'])
def cost(self, speech, image):
    """Return the contrastive loss between encoded speech and encoded image.

    ``speech`` goes through ``self.SpeechEncoderBottom`` followed by
    ``self.SpeechEncoderTop``; ``image`` goes through
    ``self.ImageEncoder``. The two encodings are scored with
    ``loss.cosine_matrix`` and the scores are passed to
    ``loss.contrastive`` with the margin taken from
    ``self.config['margin_size']``.
    """
    # Speech is encoded before the image, matching the original call order.
    speech_repr = self.SpeechEncoderTop(self.SpeechEncoderBottom(speech))
    image_repr = self.ImageEncoder(image)
    similarity = loss.cosine_matrix(speech_repr, image_repr)
    return loss.contrastive(similarity, margin=self.config['margin_size'])
def cost(self, text, image):
    """Return the contrastive loss between encoded text and encoded image.

    ``text`` goes through ``self.TextEncoderBottom`` followed by
    ``self.TextEncoderTop``; ``image`` goes through ``self.ImageEncoder``.
    The two encodings are scored with ``loss.cosine_matrix`` and the scores
    are passed to ``loss.contrastive`` with the margin taken from
    ``self.config['margin_size']``.
    """
    # Text is encoded before the image, matching the original call order.
    text_repr = self.TextEncoderTop(self.TextEncoderBottom(text))
    image_repr = self.ImageEncoder(image)
    similarity = loss.cosine_matrix(text_repr, image_repr)
    return loss.contrastive(similarity, margin=self.config['margin_size'])
def cost(self, beg, end):
    """Return the contrastive loss between projected ``beg`` and ``end``.

    Both inputs are passed through the shared ``self.Encode``. The ``beg``
    encoding is then projected with ``self.ProjBeg`` and the ``end``
    encoding with ``self.ProjEnd``. Each projection is normalized with
    ``F.normalize`` using ``p=2`` along ``dim=1``. The normalized
    projections are scored with ``cosine_matrix`` and the scores are passed
    to ``contrastive`` with the margin taken from
    ``self.config['audio']['margin_size']``.
    """
    # The ``beg`` branch is computed in full before the ``end`` branch,
    # matching the original call order.
    beg_proj = self.ProjBeg(self.Encode(beg))
    beg_unit = F.normalize(beg_proj, p=2, dim=1)

    end_proj = self.ProjEnd(self.Encode(end))
    end_unit = F.normalize(end_proj, p=2, dim=1)

    similarity = cosine_matrix(beg_unit, end_unit)
    return contrastive(similarity, margin=self.config['audio']['margin_size'])