def __init__(self, args):
    """Build the Low-rank Multimodal Fusion (LMF) module.

    Args:
        args: configuration namespace providing
            feature_dims - length-3 iterable unpacked as
                (text_dim, audio_dim, video_dim)
            hidden_dim_t / hidden_dim_a / hidden_dim_v - hidden sizes of the
                text, audio and video sub-networks
            dropouts_a / dropouts_v / dropouts_t / dropouts_f - dropout
                probabilities for the audio, video, text sub-networks and the
                post-fusion stage
            num_classes - size of the fusion output
            rank - rank of the low-rank fusion factors
            use_softmax - stored on the instance; presumably consumed in
                forward (not visible here — confirm)

    Output: (return value in forward) the fused prediction; shape is
    determined in forward (not shown in this chunk).
    """
    super(LMF, self).__init__()

    # Dimensions are unpacked in the order text, audio, video.
    self.text_in, self.audio_in, self.video_in = args.feature_dims
    self.text_hidden = args.hidden_dim_t
    self.audio_hidden = args.hidden_dim_a
    self.video_hidden = args.hidden_dim_v

    self.audio_prob = args.dropouts_a
    self.video_prob = args.dropouts_v
    self.text_prob = args.dropouts_t
    self.post_fusion_prob = args.dropouts_f

    # The text sub-network output is half its hidden size.
    self.text_out = self.text_hidden // 2
    # NOTE(review): unlike TFN, output_dim is always num_classes here (no
    # regression special case) — confirm that is intended.
    self.output_dim = args.num_classes
    self.rank = args.rank
    self.use_softmax = args.use_softmax

    # Pre-fusion unimodal sub-networks.
    self.audio_subnet = SubNet(self.audio_in, self.audio_hidden, self.audio_prob)
    self.video_subnet = SubNet(self.video_in, self.video_hidden, self.video_prob)
    self.text_subnet = TextSubNet(self.text_in, self.text_hidden, self.text_out,
                                  dropout=self.text_prob)

    # Post-fusion: low-rank factor matrices take the place of a full tensor
    # fusion layer. The "+1" widens each modality by one dimension
    # (presumably for an appended constant feature — done in forward, not
    # visible here).
    self.post_fusion_dropout = nn.Dropout(p=self.post_fusion_prob)
    self.audio_factor = Parameter(torch.Tensor(self.rank, self.audio_hidden + 1, self.output_dim))
    self.video_factor = Parameter(torch.Tensor(self.rank, self.video_hidden + 1, self.output_dim))
    self.text_factor = Parameter(torch.Tensor(self.rank, self.text_out + 1, self.output_dim))
    self.fusion_weights = Parameter(torch.Tensor(1, self.rank))
    self.fusion_bias = Parameter(torch.Tensor(1, self.output_dim))

    # Initialize the factors; the bias starts at zero.
    xavier_normal_(self.audio_factor)
    xavier_normal_(self.video_factor)
    xavier_normal_(self.text_factor)
    xavier_normal_(self.fusion_weights)
    self.fusion_bias.data.fill_(0)
def __init__(self, args):
    """Build the Tensor Fusion Network (TFN) module.

    Args:
        args: configuration namespace providing
            feature_dims - length-3 iterable unpacked as
                (text_dim, audio_dim, video_dim)
            hidden_dims - length-3 iterable unpacked as
                (text_hidden, audio_hidden, video_hidden)
            text_out - output dimension of the text sub-network
            dropouts - length-4 iterable unpacked as
                (audio_dropout, video_dropout, text_dropout,
                 post_fusion_dropout)
            post_fusion_dim - hidden size of the post-fusion MLP
            num_classes - output size in classification mode
            train_mode - "classification" selects num_classes outputs;
                anything else selects a single regression output

    Output: (return value in forward) in regression mode, a value constrained
    to (-3, 3) via the non-trainable output_range/output_shift parameters
    registered below (a sigmoid in forward presumably maps to (0, 1) first —
    forward not visible here).
    """
    super(TFN, self).__init__()

    # Dimensions are unpacked in the order text, audio, video.
    self.text_in, self.audio_in, self.video_in = args.feature_dims
    self.text_hidden, self.audio_hidden, self.video_hidden = args.hidden_dims
    # Regression collapses the output to a single value.
    self.output_dim = args.num_classes if args.train_mode == "classification" else 1
    self.text_out = args.text_out
    self.post_fusion_dim = args.post_fusion_dim
    self.audio_prob, self.video_prob, self.text_prob, self.post_fusion_prob = args.dropouts

    # Pre-fusion unimodal sub-networks.
    self.audio_subnet = SubNet(self.audio_in, self.audio_hidden, self.audio_prob)
    self.video_subnet = SubNet(self.video_in, self.video_hidden, self.video_prob)
    self.text_subnet = TextSubNet(self.text_in, self.text_hidden, self.text_out,
                                  dropout=self.text_prob)

    # Post-fusion MLP over the flattened (text+1) x (video+1) x (audio+1)
    # outer-product tensor; the "+1" per modality accounts for an appended
    # constant feature (presumably added in forward — confirm).
    self.post_fusion_dropout = nn.Dropout(p=self.post_fusion_prob)
    self.post_fusion_layer_1 = nn.Linear(
        (self.text_out + 1) * (self.video_hidden + 1) * (self.audio_hidden + 1),
        self.post_fusion_dim)
    self.post_fusion_layer_2 = nn.Linear(self.post_fusion_dim, self.post_fusion_dim)
    self.post_fusion_layer_3 = nn.Linear(self.post_fusion_dim, self.output_dim)

    # Regression output is constrained to (-3, 3): a sigmoid shrinks it to
    # (0, 1), then these fixed constants scale/shift it back. They are
    # Parameters (so they move with .to(device)) but are not trained.
    self.output_range = Parameter(torch.FloatTensor([6]), requires_grad=False)
    self.output_shift = Parameter(torch.FloatTensor([-3]), requires_grad=False)