def __init__(self, phase, cfg, size, base, extras, head, num_classes):
    super(association_lstm, self).__init__()
    self.phase = phase
    self.num_classes = num_classes
    self.cfg = vid  # NOTE: uses the module-level `vid` config dict rather than the `cfg` argument
    self.priorbox = PriorBox(self.cfg)
    self.priors = Variable(self.priorbox.forward(), volatile=True)
    self.size = size

    # SSD network
    self.vgg = nn.ModuleList(base)
    # Layer learns to scale the l2 normalized features from conv4_3
    self.L2Norm = L2Norm(512, 20)
    self.extras = nn.ModuleList(extras)

    self.loc = nn.ModuleList(head[0])
    self.conf = nn.ModuleList(head[1])

    # RoI feature extraction on the 1/16-resolution feature map
    self.roi_pool = _RoIPooling(self.cfg['POOLING_SIZE'],
                                self.cfg['POOLING_SIZE'], 1.0 / 16.0)
    self.roi_align = RoIAlignAvg(self.cfg['POOLING_SIZE'],
                                 self.cfg['POOLING_SIZE'], 1.0 / 16.0)
    self.grid_size = self.cfg['POOLING_SIZE'] * 2 \
        if self.cfg['CROP_RESIZE_WITH_MAX_POOL'] else self.cfg['POOLING_SIZE']
    self.roi_crop = _RoICrop()
    self.img_shape = (self.cfg['min_dim'], self.cfg['min_dim'])

    # Per-box feature length: 4 box coords + num_classes scores + 49 association values
    self.tensor_len = 4 + self.num_classes + 49
    self.bnlstm1 = BNLSTM(input_size=84, hidden_size=150,
                          batch_first=False, bidirectional=False)
    self.bnlstm2 = BNLSTM(input_size=150, hidden_size=300,
                          batch_first=False, bidirectional=False)
    self.cls_pred = nn.Linear(300, self.num_classes)
    self.bbox_pred = nn.Linear(300, 4)
    self.association_pred = nn.Linear(300, 49)
    self.MultiProjectLoss = MultiProjectLoss(self.num_classes, 0, True, 3, 0.5)

    if phase == 'vid_train':
        self.softmax = nn.Softmax(dim=-1)
        # self.detect = Trnsform_target(num_classes, 200, 0.5, 0.01, 0.45)
        self.detect = train_target(num_classes, 200, 0.5, 0.01, 0.45)
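# A minimal sketch (not the repository's actual forward pass) of how the two BNLSTM
# layers and the prediction heads above might be chained. It assumes a BNLSTM with the
# same call convention as torch.nn.LSTM (batch_first=False, returning (output, (h, c)))
# and a hypothetical `box_features` tensor of shape (seq_len, batch, 84), i.e. the
# 4 + num_classes + 49 layout of self.tensor_len.
def _association_head_sketch(self, box_features):
    out1, _ = self.bnlstm1(box_features)   # (seq_len, batch, 150)
    out2, _ = self.bnlstm2(out1)           # (seq_len, batch, 300)
    cls_scores = self.cls_pred(out2)       # (seq_len, batch, num_classes)
    bbox_deltas = self.bbox_pred(out2)     # (seq_len, batch, 4)
    assoc = self.association_pred(out2)    # (seq_len, batch, 49)
    return cls_scores, bbox_deltas, assoc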
def build(self, conf):
    conf.check()
    wscale = 1.0

    embed_id = EmbedID(conf.n_vocab, conf.embed_size, ignore_label=-1)
    if conf.use_gpu:
        embed_id.to_gpu()

    # LSTM stack: (embed_size -> h0), (h0 -> h1), ...
    lstm_attributes = {}
    lstm_units = [(conf.embed_size, conf.lstm_hidden_units[0])]
    lstm_units += zip(conf.lstm_hidden_units[:-1], conf.lstm_hidden_units[1:])
    for i, (n_in, n_out) in enumerate(lstm_units):
        if conf.lstm_apply_batchnorm:
            lstm_attributes["layer_%i" % i] = BNLSTM(n_in, n_out)
        else:
            lstm_attributes["layer_%i" % i] = L.LSTM(n_in, n_out)
    lstm = LSTMNetwork(**lstm_attributes)
    lstm.n_layers = len(lstm_units)
    lstm.apply_dropout = conf.lstm_apply_dropout
    if conf.use_gpu:
        lstm.to_gpu()

    # Fully connected head mapping the last LSTM state to the output space
    fc_attributes = {}
    fc_units = [(conf.lstm_hidden_units[-1], conf.fc_hidden_units[0])]
    fc_units += zip(conf.fc_hidden_units[:-1], conf.fc_hidden_units[1:])
    if conf.fc_output_type == self.OUTPUT_TYPE_EMBED_VECTOR:
        fc_units += [(conf.fc_hidden_units[-1], conf.embed_size)]
    elif conf.fc_output_type == self.OUTPUT_TYPE_SOFTMAX:
        fc_units += [(conf.fc_hidden_units[-1], conf.n_vocab)]
    else:
        raise Exception()
    for i, (n_in, n_out) in enumerate(fc_units):
        fc_attributes["layer_%i" % i] = L.Linear(n_in, n_out, wscale=wscale)
        fc_attributes["batchnorm_%i" % i] = L.BatchNormalization(n_in)
    fc = FullyConnectedNetwork(**fc_attributes)
    fc.n_layers = len(fc_units)
    fc.activation_function = conf.fc_activation_function
    fc.apply_batchnorm = conf.fc_apply_batchnorm
    fc.apply_dropout = conf.fc_apply_dropout
    if conf.use_gpu:
        fc.to_gpu()

    return embed_id, lstm, fc
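# A hedged usage example (assumption, not from the source): the configuration fields
# read by build() above, with illustrative values. `conf` and `model` stand in for the
# project's own config and model objects; only the attribute names are taken from build().
conf.n_vocab = 10000
conf.embed_size = 256
conf.lstm_hidden_units = [512, 512]          # one BNLSTM / L.LSTM layer per entry
conf.lstm_apply_batchnorm = True             # True -> BNLSTM, False -> plain L.LSTM
conf.lstm_apply_dropout = False
conf.fc_hidden_units = [512]
conf.fc_output_type = model.OUTPUT_TYPE_SOFTMAX
conf.fc_activation_function = "elu"
conf.fc_apply_batchnorm = False
conf.fc_apply_dropout = False
conf.use_gpu = False
embed_id, lstm, fc = model.build(conf)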
def __init__(self, p, n_units, train=True):
    super(LSTM, self).__init__(
        embed=L.EmbedID(p + 1, n_units),
        l1=BNLSTM(n_units, n_units),
        l2=L.Linear(n_units, p + 1),
    )
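# A minimal sketch (assumption) of one forward step for the language model above;
# the original class's forward is not shown in the source, so this `__call__` is
# hypothetical. It embeds a batch of token ids, advances the stateful BNLSTM, and
# projects back to logits over the p + 1 vocabulary entries.
def __call__(self, x):
    h = self.embed(x)   # token ids -> n_units-dimensional embeddings
    h = self.l1(h)      # one step of the batch-normalized LSTM (keeps its own state)
    return self.l2(h)   # logits over p + 1 classes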
def build(self, conf):
    conf.check()
    wscale = 0.1

    embed_id = EmbedID(conf.n_vocab, conf.char_embed_size, ignore_label=-1)
    if conf.gpu_enabled:
        embed_id.to_gpu()

    # encoder
    lstm_attributes = {}
    lstm_units = [(conf.char_embed_size, conf.word_encoder_lstm_units[0])]
    lstm_units += zip(conf.word_encoder_lstm_units[:-1], conf.word_encoder_lstm_units[1:])
    for i, (n_in, n_out) in enumerate(lstm_units):
        if conf.word_encoder_lstm_apply_batchnorm:
            lstm_attributes["layer_%i" % i] = BNLSTM(n_in, n_out)
        else:
            lstm_attributes["layer_%i" % i] = LSTM(n_in, n_out)
    word_encoder_lstm = LSTMEncoder(**lstm_attributes)
    word_encoder_lstm.n_layers = len(lstm_units)
    if conf.gpu_enabled:
        word_encoder_lstm.to_gpu()

    # decoder
    lstm_attributes = {}
    lstm_units = [(conf.char_embed_size + conf.word_embed_size, conf.word_decoder_lstm_units[0])]
    lstm_units += zip(conf.word_decoder_lstm_units[:-1], conf.word_decoder_lstm_units[1:])
    for i, (n_in, n_out) in enumerate(lstm_units):
        if conf.word_encoder_lstm_apply_batchnorm:
            lstm_attributes["layer_%i" % i] = BNLSTM(n_in, n_out)
        else:
            lstm_attributes["layer_%i" % i] = LSTM(n_in, n_out)
    lstm_attributes["layer_output"] = L.Linear(conf.word_decoder_lstm_units[-1], conf.n_vocab, wscale=wscale)
    word_decoder_lstm = LSTMDecoder(**lstm_attributes)
    word_decoder_lstm.n_layers = len(lstm_units)
    if conf.gpu_enabled:
        word_decoder_lstm.to_gpu()

    # word n-gram
    lstm_attributes = {}
    lstm_units = [(conf.word_embed_size, conf.word_ngram_lstm_units[0])]
    lstm_units += zip(conf.word_ngram_lstm_units[:-1], conf.word_ngram_lstm_units[1:])
    for i, (n_in, n_out) in enumerate(lstm_units):
        if conf.word_encoder_lstm_apply_batchnorm:
            lstm_attributes["layer_%i" % i] = BNLSTM(n_in, n_out)
        else:
            lstm_attributes["layer_%i" % i] = LSTM(n_in, n_out)
    word_ngram_lstm = LSTMEncoder(**lstm_attributes)
    word_ngram_lstm.n_layers = len(lstm_units)
    if conf.gpu_enabled:
        word_ngram_lstm.to_gpu()

    # variational encoder for word n-gram
    fc_attributes = {}
    fc_units = []
    if len(conf.word_ngram_fc_hidden_units) > 0:
        fc_units = [(conf.word_ngram_lstm_units[-1], conf.word_ngram_fc_hidden_units[0])]
        fc_units += zip(conf.word_ngram_fc_hidden_units[:-1], conf.word_ngram_fc_hidden_units[1:])
        for i, (n_in, n_out) in enumerate(fc_units):
            fc_attributes["layer_%i" % i] = L.Linear(n_in, n_out, wscale=wscale)
            fc_attributes["batchnorm_%i" % i] = L.BatchNormalization(n_out)
        fc_attributes["layer_mean"] = L.Linear(conf.word_ngram_fc_hidden_units[-1], conf.word_embed_size, wscale=wscale)
        fc_attributes["layer_var"] = L.Linear(conf.word_ngram_fc_hidden_units[-1], conf.word_embed_size, wscale=wscale)
    else:
        fc_attributes["layer_mean"] = L.Linear(conf.word_ngram_lstm_units[-1], conf.word_embed_size, wscale=wscale)
        fc_attributes["layer_var"] = L.Linear(conf.word_ngram_lstm_units[-1], conf.word_embed_size, wscale=wscale)
    word_ngram_fc = GaussianNetwork(**fc_attributes)
    word_ngram_fc.n_layers = len(fc_units)
    word_ngram_fc.nonlinear = conf.word_ngram_fc_nonlinear
    word_ngram_fc.apply_batchnorm = conf.word_ngram_fc_apply_batchnorm
    word_ngram_fc.apply_dropout = conf.word_ngram_fc_apply_dropout
    if conf.gpu_enabled:
        word_ngram_fc.to_gpu()

    # variational encoder
    fc_attributes = {}
    fc_units = []
    if len(conf.word_encoder_fc_hidden_units) > 0:
        fc_units = [(conf.word_encoder_lstm_units[-1], conf.word_encoder_fc_hidden_units[0])]
        fc_units += zip(conf.word_encoder_fc_hidden_units[:-1], conf.word_encoder_fc_hidden_units[1:])
        for i, (n_in, n_out) in enumerate(fc_units):
            fc_attributes["layer_%i" % i] = L.Linear(n_in, n_out, wscale=wscale)
            fc_attributes["batchnorm_%i" % i] = L.BatchNormalization(n_out)
        fc_attributes["layer_mean"] = L.Linear(conf.word_encoder_fc_hidden_units[-1], conf.word_embed_size, wscale=wscale)
        fc_attributes["layer_var"] = L.Linear(conf.word_encoder_fc_hidden_units[-1], conf.word_embed_size, wscale=wscale)
    else:
        fc_attributes["layer_mean"] = L.Linear(conf.word_encoder_lstm_units[-1], conf.word_embed_size, wscale=wscale)
        fc_attributes["layer_var"] = L.Linear(conf.word_encoder_lstm_units[-1], conf.word_embed_size, wscale=wscale)
    word_encoder_fc = GaussianNetwork(**fc_attributes)
    word_encoder_fc.n_layers = len(fc_units)
    word_encoder_fc.nonlinear = conf.word_encoder_fc_nonlinear
    word_encoder_fc.apply_batchnorm = conf.word_encoder_fc_apply_batchnorm
    word_encoder_fc.apply_dropout = conf.word_encoder_fc_apply_dropout
    if conf.gpu_enabled:
        word_encoder_fc.to_gpu()

    # discriminator
    fc_attributes = {}
    fc_units = [(conf.word_embed_size, conf.discriminator_hidden_units[0])]
    fc_units += zip(conf.discriminator_hidden_units[:-1], conf.discriminator_hidden_units[1:])
    fc_units += [(conf.discriminator_hidden_units[-1], 2)]
    for i, (n_in, n_out) in enumerate(fc_units):
        fc_attributes["layer_%i" % i] = L.Linear(n_in, n_out, wscale=wscale)
        fc_attributes["batchnorm_%i" % i] = L.BatchNormalization(n_out)
    discriminator = MultiLayerPerceptron(**fc_attributes)
    discriminator.n_layers = len(fc_units)
    discriminator.nonlinear = conf.word_encoder_fc_nonlinear
    discriminator.apply_batchnorm = conf.word_encoder_fc_apply_batchnorm
    discriminator.apply_dropout = conf.word_encoder_fc_apply_dropout
    if conf.gpu_enabled:
        discriminator.to_gpu()

    return embed_id, word_encoder_lstm, word_encoder_fc, word_decoder_lstm, discriminator, word_ngram_lstm, word_ngram_fc
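# A hedged sketch (assumption) of how a GaussianNetwork built above is typically used:
# its layer_mean / layer_var heads parameterize a diagonal Gaussian over word embeddings,
# and chainer.functions.gaussian draws a reparameterized sample. The assumption that the
# network's call returns the (mean, ln_var) pair, and the names below, are illustrative.
import chainer.functions as F

def sample_word_embedding(word_encoder_fc, encoder_hidden_state):
    mean, ln_var = word_encoder_fc(encoder_hidden_state)  # assumed to return both heads
    return F.gaussian(mean, ln_var)                        # z = mean + exp(ln_var / 2) * eps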
def __init__(self, conf, name="mono"):
    self.name = name
    conf.check()
    wscale = 0.1

    # forward character-level RNN
    forward_lstm_attributes = {}
    forward_lstm_units = [(conf.ndim_char_embed, conf.lstm_hidden_units[0])]
    forward_lstm_units += zip(conf.lstm_hidden_units[:-1], conf.lstm_hidden_units[1:])
    for i, (n_in, n_out) in enumerate(forward_lstm_units):
        if conf.rnn_type == "dsgu":
            forward_lstm_attributes["layer_%i" % i] = StatefulDSGU(n_in, n_out)
        elif conf.rnn_type == "lstm":
            if conf.lstm_apply_batchnorm:
                forward_lstm_attributes["layer_%i" % i] = BNLSTM(n_in, n_out)
            else:
                forward_lstm_attributes["layer_%i" % i] = L.LSTM(n_in, n_out)
        elif conf.rnn_type == "gru":
            forward_lstm_attributes["layer_%i" % i] = L.StatefulGRU(n_in, n_out)
        else:
            raise NotImplementedError()
    self.forward_lstm = StackedLSTM(**forward_lstm_attributes)
    self.forward_lstm.n_layers = len(forward_lstm_units)
    self.forward_lstm.apply_dropout = conf.lstm_apply_dropout

    # backward character-level RNN
    backward_lstm_attributes = {}
    backward_lstm_units = [(conf.ndim_char_embed, conf.lstm_hidden_units[0])]
    backward_lstm_units += zip(conf.lstm_hidden_units[:-1], conf.lstm_hidden_units[1:])
    for i, (n_in, n_out) in enumerate(backward_lstm_units):
        if conf.rnn_type == "dsgu":
            backward_lstm_attributes["layer_%i" % i] = StatefulDSGU(n_in, n_out)
        elif conf.rnn_type == "lstm":
            if conf.lstm_apply_batchnorm:
                backward_lstm_attributes["layer_%i" % i] = BNLSTM(n_in, n_out)
            else:
                backward_lstm_attributes["layer_%i" % i] = L.LSTM(n_in, n_out)
        elif conf.rnn_type == "gru":
            backward_lstm_attributes["layer_%i" % i] = L.StatefulGRU(n_in, n_out)
        else:
            raise NotImplementedError()
    self.backward_lstm = StackedLSTM(**backward_lstm_attributes)
    self.backward_lstm.n_layers = len(backward_lstm_units)
    self.backward_lstm.apply_dropout = conf.lstm_apply_dropout

    self.char_embed = L.EmbedID(conf.n_vocab, conf.ndim_char_embed, ignore_label=-1)

    self.f_ym = L.Linear(conf.lstm_hidden_units[-1], conf.ndim_m, nobias=True)
    self.f_um = L.Linear(conf.lstm_hidden_units[-1], conf.ndim_m, nobias=True)

    # attention network
    attention_fc_attributes = {}
    if len(conf.attention_fc_hidden_units) == 0:
        attention_fc_hidden_units = [(conf.ndim_m, 1)]
    else:
        attention_fc_hidden_units = [(conf.ndim_m, conf.attention_fc_hidden_units[0])]
        attention_fc_hidden_units += zip(conf.attention_fc_hidden_units[:-1], conf.attention_fc_hidden_units[1:])
        attention_fc_hidden_units += [(conf.attention_fc_hidden_units[-1], 1)]
    for i, (n_in, n_out) in enumerate(attention_fc_hidden_units):
        attention_fc_attributes["layer_%i" % i] = L.Linear(n_in, n_out, wscale=wscale)
    self.attention_fc = FullyConnectedNetwork(**attention_fc_attributes)
    self.attention_fc.n_layers = len(attention_fc_hidden_units)
    self.attention_fc.hidden_activation_function = conf.attention_fc_hidden_activation_function
    self.attention_fc.output_activation_function = conf.attention_fc_output_activation_function
    self.attention_fc.apply_dropout = conf.attention_fc_apply_dropout

    self.f_rg = L.Linear(conf.lstm_hidden_units[-1], conf.ndim_g, nobias=True)
    self.f_ug = L.Linear(conf.lstm_hidden_units[-1], conf.ndim_g, nobias=True)

    # reader network
    reader_fc_attributes = {}
    if len(conf.reader_fc_hidden_units) == 0:
        reader_fc_hidden_units = [(conf.ndim_g, conf.n_vocab)]
    else:
        reader_fc_hidden_units = [(conf.ndim_g, conf.reader_fc_hidden_units[0])]
        reader_fc_hidden_units += zip(conf.reader_fc_hidden_units[:-1], conf.reader_fc_hidden_units[1:])
        reader_fc_hidden_units += [(conf.reader_fc_hidden_units[-1], conf.n_vocab)]
    for i, (n_in, n_out) in enumerate(reader_fc_hidden_units):
        reader_fc_attributes["layer_%i" % i] = L.Linear(n_in, n_out, wscale=wscale)
    self.reader_fc = FullyConnectedNetwork(**reader_fc_attributes)
    self.reader_fc.n_layers = len(reader_fc_hidden_units)
    self.reader_fc.hidden_activation_function = conf.reader_fc_hidden_activation_function
    self.reader_fc.output_activation_function = conf.reader_fc_output_activation_function
    self.reader_fc.apply_dropout = conf.attention_fc_apply_dropout

    if conf.use_gpu:
        self.forward_lstm.to_gpu()
        self.backward_lstm.to_gpu()
        self.char_embed.to_gpu()
        self.attention_fc.to_gpu()
        self.reader_fc.to_gpu()
        self.f_ym.to_gpu()
        self.f_um.to_gpu()
        self.f_rg.to_gpu()
        self.f_ug.to_gpu()

    # one Adam optimizer (with gradient clipping) per component
    self.optimizer_char_embed = optimizers.Adam(alpha=conf.learning_rate, beta1=conf.gradient_momentum)
    self.optimizer_char_embed.setup(self.char_embed)
    self.optimizer_char_embed.add_hook(GradientClipping(10.0))

    self.optimizer_forward_lstm = optimizers.Adam(alpha=conf.learning_rate, beta1=conf.gradient_momentum)
    self.optimizer_forward_lstm.setup(self.forward_lstm)
    self.optimizer_forward_lstm.add_hook(GradientClipping(10.0))

    self.optimizer_backward_lstm = optimizers.Adam(alpha=conf.learning_rate, beta1=conf.gradient_momentum)
    self.optimizer_backward_lstm.setup(self.backward_lstm)
    self.optimizer_backward_lstm.add_hook(GradientClipping(10.0))

    self.optimizer_f_um = optimizers.Adam(alpha=conf.learning_rate, beta1=conf.gradient_momentum)
    self.optimizer_f_um.setup(self.f_um)
    self.optimizer_f_um.add_hook(GradientClipping(10.0))

    self.optimizer_f_ym = optimizers.Adam(alpha=conf.learning_rate, beta1=conf.gradient_momentum)
    self.optimizer_f_ym.setup(self.f_ym)
    self.optimizer_f_ym.add_hook(GradientClipping(10.0))

    self.optimizer_attention_fc = optimizers.Adam(alpha=conf.learning_rate, beta1=conf.gradient_momentum)
    self.optimizer_attention_fc.setup(self.attention_fc)
    self.optimizer_attention_fc.add_hook(GradientClipping(10.0))

    self.optimizer_f_rg = optimizers.Adam(alpha=conf.learning_rate, beta1=conf.gradient_momentum)
    self.optimizer_f_rg.setup(self.f_rg)
    self.optimizer_f_rg.add_hook(GradientClipping(10.0))

    self.optimizer_f_ug = optimizers.Adam(alpha=conf.learning_rate, beta1=conf.gradient_momentum)
    self.optimizer_f_ug.setup(self.f_ug)
    self.optimizer_f_ug.add_hook(GradientClipping(10.0))

    self.optimizer_reader_fc = optimizers.Adam(alpha=conf.learning_rate, beta1=conf.gradient_momentum)
    self.optimizer_reader_fc.setup(self.reader_fc)
    self.optimizer_reader_fc.add_hook(GradientClipping(10.0))
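# A hedged sketch (assumption, not from the source) of how one training step could drive
# the per-component optimizers defined above: clear gradients on every link, backpropagate
# a scalar Chainer loss variable, then let each Adam instance apply its clipped update.
# `update_sketch` and the way the loss is obtained are hypothetical; the Link/Optimizer
# calls (cleargrads, backward, update) are standard Chainer.
def update_sketch(self, loss):
    links = [self.char_embed, self.forward_lstm, self.backward_lstm,
             self.f_um, self.f_ym, self.attention_fc,
             self.f_rg, self.f_ug, self.reader_fc]
    opts = [self.optimizer_char_embed, self.optimizer_forward_lstm,
            self.optimizer_backward_lstm, self.optimizer_f_um,
            self.optimizer_f_ym, self.optimizer_attention_fc,
            self.optimizer_f_rg, self.optimizer_f_ug,
            self.optimizer_reader_fc]
    for link in links:
        link.cleargrads()   # reset accumulated gradients on each component
    loss.backward()         # backprop through the shared computation graph
    for opt in opts:
        opt.update()        # apply one Adam step using the clipped gradients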