def __init__(self, opt):
    super(MoreSupWeightModel, self).__init__()
    self.vocab_size = opt.vocab_size
    self.input_encoding_size = opt.input_encoding_size
    self.rnn_type = opt.rnn_type
    self.rnn_size = opt.rnn_size
    self.num_layers = opt.num_layers
    self.drop_prob_lm = opt.drop_prob_lm
    self.seq_length = opt.seq_length
    self.fc_feat_size = opt.fc_feat_size
    self.att_feat_size = opt.att_feat_size
    self.att_size = opt.att_size
    self.batch_size = opt.batch_size * opt.seq_per_img
    self.rnn_atten = opt.rnn_atten
    self.num_parallels = opt.num_parallels
    self.sample_rate = opt.sample_rate
    self.use_linear = opt.use_linear
    self.rnn_size_list = opt.rnn_size_list
    self.gram_num = opt.gram_num
    self.logprob_pool_type = opt.logprob_pool_type  # 0: mean-pool, 1: max-pool

    # review net
    self.use_reviewnet = opt.use_reviewnet
    if self.use_reviewnet == 1:
        self.review_length = opt.review_length
        self.review_nets = nn.ModuleList()
        for i in range(self.review_length):
            # use append: an empty nn.ModuleList cannot be assigned by index
            self.review_nets.append(
                LSTM.LSTM_SOFT_ATT_NOX(self.rnn_size, self.att_size, self.drop_prob_lm))
        opt.att_size = self.review_length

    # LSTM core
    # opt.input_encoding_size = opt.input_encoding_size * 2
    self.core = rnn_utils.get_lstm(opt)

    if self.rnn_atten == "ATT_LSTM":
        self.atten = LSTM.LSTM_ATTEN_LAYER(self.rnn_size)

    # word embedding: self.vocab_size + 1 -> self.input_encoding_size
    if self.gram_num > 0:
        self.embed = nn.Sequential(
            nn.Embedding(self.vocab_size + 1, self.input_encoding_size),
            Embed.WordEmbed(self.gram_num))
        # note: self.embed_tc is only constructed in the else branch
        # self.embed_tc = nn.Sequential(nn.Embedding(self.vocab_size + 1, self.input_encoding_size),
        #                               Embed.WordEmbed(self.gram_num))
        # self.embed = nn.Embedding(self.vocab_size + 1, self.input_encoding_size)
        # self.embed_tc = nn.Embedding(self.vocab_size + 1, self.input_encoding_size)
    else:
        self.embed = nn.Embedding(self.vocab_size + 1, self.input_encoding_size)
        self.embed_tc = nn.Embedding(self.vocab_size + 1, self.input_encoding_size)

    if self.use_linear:
        # (batch_size, fc_feat_size) -> (batch_size, input_encoding_size)
        self.img_embed = nn.Linear(self.fc_feat_size, self.input_encoding_size)
        self.att_embed = nn.Linear(self.att_feat_size, self.input_encoding_size)

    # self.relu = nn.RReLU(inplace=True)
    self.relu = nn.ReLU()
    self.init_weight()
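
# Instantiation sketch (assumptions: every value below is hypothetical, the
# attribute names simply mirror what __init__ above reads from opt, and the
# repo's LSTM / rnn_utils / Embed modules are importable; rnn_utils.get_lstm
# may read additional opt fields not shown here).
from types import SimpleNamespace

opt = SimpleNamespace(
    vocab_size=9487, input_encoding_size=512, rnn_type="LSTM_SOFT_ATT",
    rnn_size=512, num_layers=1, drop_prob_lm=0.5, seq_length=16,
    fc_feat_size=2048, att_feat_size=2048, att_size=196,
    batch_size=16, seq_per_img=5, rnn_atten="ATT_LSTM", num_parallels=1,
    sample_rate=0.0, use_linear=1, rnn_size_list=[512], gram_num=0,
    logprob_pool_type=0, use_reviewnet=0)
model = MoreSupWeightModel(opt)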
def __init__(self, opt):
    super(SCSTModel, self).__init__()
    self.vocab_size = opt.vocab_size
    self.input_encoding_size = opt.input_encoding_size
    self.rnn_type = opt.rnn_type
    self.rnn_size = opt.rnn_size
    self.num_layers = opt.num_layers
    self.drop_prob_lm = opt.drop_prob_lm
    self.seq_length = opt.seq_length
    self.fc_feat_size = opt.fc_feat_size
    self.att_feat_size = opt.att_feat_size
    self.att_size = opt.att_size
    self.batch_size = opt.batch_size * opt.seq_per_img
    self.rnn_atten = opt.rnn_atten

    # attention over LSTM states
    if self.rnn_atten == "ATT_LSTM":
        self.atten = LSTM.LSTM_ATTEN_LAYER(self.rnn_size)

    # LSTM core, selected by rnn_type
    if self.rnn_type == "LSTM":
        self.core = LSTM.LSTM(self.input_encoding_size * 2, self.vocab_size + 1,
                              self.rnn_size, dropout=self.drop_prob_lm)
    elif self.rnn_type == "LSTM_SOFT_ATT":
        self.core = LSTM.LSTM_SOFT_ATT(self.input_encoding_size * 2, self.vocab_size + 1,
                                       self.rnn_size, self.att_size, dropout=self.drop_prob_lm)
    elif self.rnn_type == "LSTM_DOUBLE_ATT":
        self.core = LSTM.LSTM_DOUBLE_ATT(self.input_encoding_size * 2, self.vocab_size + 1,
                                         self.rnn_size, self.att_size, dropout=self.drop_prob_lm)

    # word embedding: self.vocab_size + 1 -> self.input_encoding_size
    self.embed = nn.Embedding(self.vocab_size + 1, self.input_encoding_size)
    self.embed_tc = nn.Embedding(self.vocab_size + 1, self.input_encoding_size)

    # (batch_size, fc_feat_size) -> (batch_size, input_encoding_size)
    self.img_embed = nn.Linear(self.fc_feat_size, self.input_encoding_size)
    self.att_embed = nn.Linear(self.att_feat_size, self.input_encoding_size)

    self.init_weights()
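
# Shape sketch (assumption: a standalone illustration, not part of SCSTModel):
# how nn.Embedding and nn.Linear map the inputs that the constructors above
# prepare, with hypothetical sizes.
import torch
import torch.nn as nn

vocab_size, input_encoding_size = 9487, 512   # hypothetical sizes
fc_feat_size = 2048

embed = nn.Embedding(vocab_size + 1, input_encoding_size)   # +1 for the special token
img_embed = nn.Linear(fc_feat_size, input_encoding_size)

tokens = torch.randint(0, vocab_size + 1, (16, 5))  # (batch, seq) of word indices
fc_feats = torch.randn(16, fc_feat_size)            # (batch, fc_feat_size) CNN features

print(embed(tokens).shape)       # torch.Size([16, 5, 512])
print(img_embed(fc_feats).shape)  # torch.Size([16, 512])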
def __init__(self, opt):
    super(DoubleAttenMModel, self).__init__()
    self.vocab_size = opt.vocab_size
    self.input_encoding_size = opt.input_encoding_size
    self.rnn_type = opt.rnn_type
    self.rnn_size = opt.rnn_size
    self.num_layers = opt.num_layers
    self.drop_prob_lm = opt.drop_prob_lm
    self.seq_length = opt.seq_length
    self.fc_feat_size = opt.fc_feat_size
    self.att_feat_size = opt.att_feat_size
    self.att_size = opt.att_size
    self.batch_size = opt.batch_size * opt.seq_per_img
    self.rnn_atten = opt.rnn_atten
    self.num_parallels = opt.num_parallels
    self.sample_rate = opt.sample_rate
    self.use_linear = opt.use_linear
    self.rnn_size_list = opt.rnn_size_list

    # review net
    self.use_reviewnet = opt.use_reviewnet
    if self.use_reviewnet == 1:
        self.review_length = opt.review_length
        self.review_nets = nn.ModuleList()
        for i in range(self.review_length):
            # use append: an empty nn.ModuleList cannot be assigned by index
            self.review_nets.append(
                LSTM.LSTM_SOFT_ATT_NOX(self.rnn_size, self.att_size, self.drop_prob_lm))
        opt.att_size = self.review_length

    # LSTM core
    self.core = rnn_utils.get_lstm(opt)

    if self.rnn_atten == "ATT_LSTM":
        self.atten = LSTM.LSTM_ATTEN_LAYER(self.rnn_size)

    # word embedding: self.vocab_size + 1 -> self.input_encoding_size
    self.embed = nn.Embedding(self.vocab_size + 1, self.input_encoding_size)
    # self.embed_tc = nn.Embedding(self.vocab_size + 1, self.input_encoding_size)

    if self.use_linear:
        # (batch_size, fc_feat_size) -> (batch_size, rnn_size); unlike the
        # models above, these projections target rnn_size
        self.img_embed = nn.Linear(self.fc_feat_size, self.rnn_size)
        self.att_embed = nn.Linear(self.att_feat_size, self.rnn_size)

    # self.relu = nn.RReLU(inplace=True)
    self.relu = nn.ReLU()
    self.init_weight()
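
# ModuleList sketch (assumption: a standalone illustration of the review-net
# fix above, using nn.LSTMCell as a stand-in for LSTM.LSTM_SOFT_ATT_NOX): an
# empty nn.ModuleList cannot be index-assigned, so submodules are added with
# append(), which also registers their parameters with the parent module.
import torch.nn as nn

class ReviewStack(nn.Module):
    def __init__(self, review_length, rnn_size):
        super(ReviewStack, self).__init__()
        self.review_nets = nn.ModuleList()
        for _ in range(review_length):
            self.review_nets.append(nn.LSTMCell(rnn_size, rnn_size))

stack = ReviewStack(review_length=8, rnn_size=512)
print(len(stack.review_nets))                          # 8
print(sum(p.numel() for p in stack.parameters()) > 0)  # True: params registered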