def make_encoder(opt, embeddings): """ Various encoder dispatcher function. Args: opt: the option in current environment. embeddings (Embeddings): vocab embeddings for this encoder. """ if opt.encoder_type == "transformer": return TransformerEncoder(opt.enc_layers, opt.rnn_size, opt.dropout, embeddings) elif opt.encoder_type == "cnn": return CNNEncoder(opt.enc_layers, opt.rnn_size, opt.cnn_kernel_width, opt.dropout, embeddings) elif opt.encoder_type == "mean": return MeanEncoder(opt.enc_layers, embeddings) elif opt.encoder_type == "trigramrnn": return RNNTrigramsEncoder(opt.rnn_type, True, opt.enc_layers, opt.rnn_size, opt.dropout, embeddings, opt.bridge) else: # NOTE: THIS IS WHAT GETS TRIGGERED BY DEFAULT EXPERIMENT # "rnn" or "brnn" print('About to make encoder') print(f"opt.rnn_type={opt.rnn_type}") print(f"opt.brnn={opt.brnn}") print(f"opt.enc_layers={opt.enc_layers}") print(f"opt.rnn_size ={opt.rnn_size}") print(f"opt.dropout={opt.dropout}") print(f"embeddings={embeddings}") print(f"opt.bridge={opt.bridge}") return RNNEncoder(opt.rnn_type, opt.brnn, opt.enc_layers, opt.rnn_size, opt.dropout, embeddings, opt.bridge)
def make_encoder(opt, embeddings, morph_embeddings=None):
    """
    Various encoder dispatcher function.
    Args:
        opt: the option in current environment.
        embeddings (Embeddings): vocab embeddings for this encoder.
        morph_embeddings (Embeddings): optional morphological embeddings,
            passed only to the GCN encoder.
    """
    if opt.encoder_type == "transformer":
        return TransformerEncoder(opt.enc_layers, opt.rnn_size,
                                  opt.dropout, embeddings)
    elif opt.encoder_type == "cnn":
        return CNNEncoder(opt.enc_layers, opt.rnn_size,
                          opt.cnn_kernel_width, opt.dropout, embeddings)
    elif opt.encoder_type == "mean":
        return MeanEncoder(opt.enc_layers, embeddings)
    elif opt.encoder_type == "gcn":
        print('use gates = ', opt.gcn_use_gates)
        return GCNEncoder(embeddings,
                          opt.gcn_num_inputs,
                          opt.gcn_num_units,
                          opt.gcn_num_labels,
                          opt.gcn_num_layers,
                          opt.gcn_in_arcs,
                          opt.gcn_out_arcs,
                          opt.gcn_batch_first,
                          opt.gcn_residual,
                          opt.gcn_use_gates,
                          opt.gcn_use_glus,
                          morph_embeddings)
    else:
        # "rnn" or "brnn"
        return RNNEncoder(opt.rnn_type, opt.brnn, opt.enc_layers,
                          opt.rnn_size, opt.dropout, embeddings, opt.bridge)
def make_encoder(opt, embeddings, embeddings_inter=None):
    """
    Various encoder dispatcher function.
    Args:
        opt: the option in current environment.
        embeddings (Embeddings): vocab embeddings for this encoder.
        embeddings_inter (Embeddings): optional second set of embeddings,
            used only by the double encoder.
    """
    if opt.encoder_type == "transformer":
        return TransformerEncoder(opt.enc_layers, opt.rnn_size,
                                  opt.dropout, embeddings)
    elif opt.encoder_type == "cnn":
        return CNNEncoder(opt.enc_layers, opt.rnn_size,
                          opt.cnn_kernel_width, opt.dropout, embeddings)
    elif opt.encoder_type == "mean":
        return MeanEncoder(opt.enc_layers, embeddings)
    elif opt.encoder_type == "double_encoder":
        print("The double encoder will be needed here")
        opt.brnn = True
        # NOTE: opt.dec_layers (not opt.enc_layers) sets the encoder depth.
        return DoubleRNNEncoder(opt.rnn_type, opt.brnn, opt.dec_layers,
                                opt.rnn_size, opt.dropout, embeddings,
                                embeddings_inter)
    else:
        # "rnn" or "brnn": this fork forces a bidirectional encoder and,
        # as above, uses opt.dec_layers for the encoder depth.
        opt.brnn = True
        return RNNEncoder(opt.rnn_type, opt.brnn, opt.dec_layers,
                          opt.rnn_size, opt.dropout, embeddings)
def make_encoder(opt, embeddings): """ Various encoder dispatcher function. Args: opt: the option in current environment. embeddings (Embeddings): vocab embeddings for this encoder. """ if opt.encoder_type == "transformer": return TransformerEncoder(opt.enc_layers, opt.rnn_size, opt.dropout, embeddings) elif opt.encoder_type == "cnn": return CNNEncoder(opt.enc_layers, opt.rnn_size, opt.cnn_kernel_width, opt.dropout, embeddings) elif opt.encoder_type == "mean": return MeanEncoder(opt.enc_layers, embeddings) elif opt.encoder_type == "trigramrnn": return RNNTrigramsEncoder(opt.rnn_type, True, opt.enc_layers, opt.rnn_size, opt.dropout, embeddings, opt.bridge) else: # "rnn" or "brnn" return RNNEncoder(opt.rnn_type, opt.brnn, opt.enc_layers, opt.rnn_size, opt.dropout, embeddings, opt.bridge)
def make_encoder(opt, embeddings, for_vae=False):
    """
    Various encoder dispatcher function.
    Args:
        opt: the option in current environment.
        embeddings (Embeddings): vocab embeddings for this encoder.
        for_vae (bool): if True, use the VAE-specific RNN size and dropout.
    """
    enc_layers = opt.enc_layers
    if for_vae:
        rnn_size = opt.rnn_size_vae
        dropout = opt.dropout_vae
    else:
        rnn_size = opt.rnn_size
        dropout = opt.dropout

    # NOTE: the VAE-specific sizes only affect the RNN branch below; the
    # transformer/cnn/mean encoders always use the standard options.
    if opt.encoder_type == "transformer":
        return TransformerEncoder(opt.enc_layers, opt.rnn_size,
                                  opt.dropout, embeddings)
    elif opt.encoder_type == "cnn":
        return CNNEncoder(opt.enc_layers, opt.rnn_size,
                          opt.cnn_kernel_width, opt.dropout, embeddings)
    elif opt.encoder_type == "mean":
        return MeanEncoder(opt.enc_layers, embeddings)
    else:
        # "rnn" or "brnn"
        return RNNEncoder(opt.rnn_type, opt.brnn, enc_layers,
                          rnn_size, dropout, embeddings)
def make_encoder(opt, embeddings, mmod_imgw=False):
    """
    Various encoder dispatcher function.
    Args:
        opt: the option in current environment.
        embeddings (Embeddings): vocab embeddings for this encoder.
        mmod_imgw (bool): if True and the encoder is a transformer, build
            the multimodal variant that takes opt.img_feat_dim image features.
    """
    if opt.encoder_type == "transformer":
        if mmod_imgw:
            return multimodal.MultiModalTransformerEncoder(
                opt.enc_layers, opt.rnn_size, opt.img_feat_dim,
                opt.dropout, embeddings)
        else:
            return TransformerEncoder(opt.enc_layers, opt.rnn_size,
                                      opt.dropout, embeddings)
    elif opt.encoder_type == "cnn":
        return CNNEncoder(opt.enc_layers, opt.rnn_size,
                          opt.cnn_kernel_width, opt.dropout, embeddings)
    elif opt.encoder_type == "mean":
        return MeanEncoder(opt.enc_layers, embeddings)
    else:
        # "rnn" or "brnn"
        return RNNEncoder(opt.rnn_type, opt.brnn, opt.enc_layers,
                          opt.rnn_size, opt.dropout, embeddings, opt.bridge)
def make_encoder(opt, embeddings): """ Various encoder dispatcher function. Args: opt: the option in current environment. embeddings (Embeddings): vocab embeddings for this encoder. """ if opt.encoder_type == "transformer": return TransformerEncoder(opt.enc_layers, opt.rnn_size, opt.dropout, embeddings) elif opt.encoder_type == "cnn": return CNNEncoder(opt.enc_layers, opt.rnn_size, opt.cnn_kernel_width, opt.dropout, embeddings) elif opt.encoder_type == "mean": return MeanEncoder(opt.enc_layers, embeddings) else: # "rnn" or "brnn" return RNNEncoder(opt.rnn_type, opt.brnn, opt.enc_layers, opt.rnn_size, opt.dropout, embeddings, opt.bridge, opt.elmo, opt.elmo_size, opt.elmo_options, opt.elmo_weight, opt.subword_elmo, opt.subword_elmo_size, opt.subword_elmo_options, opt.subword_weight, opt.subword_spm_model, opt.node2vec, opt.node2vec_emb_size, opt.node2vec_weight, use_gpu(opt))
def make_encoder(opt, embeddings):
    """
    Various encoder dispatcher function.
    Args:
        opt: the option in current environment.
        embeddings (Embeddings): vocab embeddings for this encoder.
    """
    if opt.encoder_type == "transformer":
        return TransformerEncoder(opt.enc_layers, opt.rnn_size,
                                  opt.dropout, embeddings)
    elif opt.encoder_type == "cnn":
        return CNNEncoder(opt.enc_layers, opt.rnn_size,
                          opt.cnn_kernel_width, opt.dropout, embeddings)
    elif opt.encoder_type == "mean":
        return MeanEncoder(opt.enc_layers, embeddings)
    else:
        # "rnn" or "brnn"
        return RNNEncoder(opt.rnn_type, opt.brnn, opt.enc_layers,
                          opt.rnn_size, opt.dropout, embeddings, opt.bridge)
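# Minimal usage sketch for the dispatcher above. The Namespace stands in for
# OpenNMT's parsed options, and the concrete values (2 layers, size 500,
# dropout 0.3) are illustrative defaults, not values mandated by the code;
# `embeddings` would normally come from make_embeddings().
from argparse import Namespace

def _example_make_rnn_encoder(embeddings):
    opt = Namespace(encoder_type="brnn", rnn_type="LSTM", brnn=True,
                    enc_layers=2, rnn_size=500, dropout=0.3, bridge=False)
    return make_encoder(opt, embeddings)  # falls through to RNNEncoder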
def make_encoder(opt, embeddings, stage1=True):
    """
    Various encoder dispatcher function.
    Args:
        opt: the option in current environment.
        embeddings (Embeddings): vocab embeddings for this encoder.
        stage1 (bool): if True, build the stage-1 (mean) encoder;
            otherwise build the stage-2 (rnn/brnn) encoder.
    """
    if stage1:
        return MeanEncoder(opt.enc_layers1, embeddings,
                           opt.src_word_vec_size, opt.attn_hidden,
                           opt.dropout)
    else:
        # "rnn" or "brnn"
        return RNNEncoder(opt.rnn_type, opt.brnn2, opt.enc_layers2,
                          opt.rnn_size, opt.dropout, embeddings, opt.bridge)
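# Hypothetical caller for the two-stage variant above: stage 1 yields the
# mean encoder, stage 2 the (b)rnn encoder. Sharing one set of embeddings
# between the two stages here is purely for illustration.
def _example_two_stage_encoders(opt, embeddings):
    encoder1 = make_encoder(opt, embeddings, stage1=True)
    encoder2 = make_encoder(opt, embeddings, stage1=False)
    return encoder1, encoder2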
def __init__(self, inference_network_type, src_embeddings, tgt_embeddings, rnn_type, src_layers, tgt_layers, rnn_size, dropout, attn_type="general", dist_type="none", scoresF=F.softplus): super(InferenceNetwork, self).__init__() self.inference_network_type = inference_network_type self.attn_type = attn_type self.dist_type = dist_type self.scoresF = scoresF if dist_type == "none": self.mask_val = float("-inf") elif dist_type == "categorical": self.mask_val = -float('inf') else: raise Exception("Invalid distribution type") if inference_network_type == 'embedding_only': self.src_encoder = MeanEncoder(src_layers, src_embeddings) self.tgt_encoder = MeanEncoder(tgt_layers, tgt_embeddings) elif inference_network_type == 'brnn': self.src_encoder = RNNEncoder(rnn_type, True, src_layers, rnn_size, rnn_size, dropout, src_embeddings, False) self.tgt_encoder = RNNEncoder(rnn_type, True, tgt_layers, rnn_size, rnn_size, dropout, tgt_embeddings, False) elif inference_network_type == 'bigbrnn': self.src_encoder = RNNEncoder(rnn_type, True, src_layers, 2 * rnn_size, 2 * rnn_size, dropout, src_embeddings, False) self.tgt_encoder = RNNEncoder(rnn_type, True, tgt_layers, 2 * rnn_size, 2 * rnn_size, dropout, tgt_embeddings, False) elif inference_network_type == 'rnn': self.src_encoder = RNNEncoder(rnn_type, True, src_layers, rnn_size, dropout, src_embeddings, False) self.tgt_encoder = RNNEncoder(rnn_type, False, tgt_layers, rnn_size, dropout, tgt_embeddings, False) if inference_network_type == "bigbrnn": self.W = torch.nn.Linear(rnn_size * 2, rnn_size * 2, bias=False) else: self.W = torch.nn.Linear(rnn_size, rnn_size, bias=False) self.rnn_size = rnn_size
def __init__(self, inference_network_type, src_embeddings, tgt_embeddings, rnn_type, src_layers, tgt_layers, rnn_size, dropout, attn_type="mlp", dist_type="none", norm_alpha=1.0, norm_beta=1.0, normalization="bn"): super(InferenceNetwork, self).__init__() self.attn_type = attn_type self.dist_type = dist_type self.inference_network_type = inference_network_type self.normalization = normalization # trainable alpha and beta self.mean_norm_alpha = nn.Parameter(torch.FloatTensor([1.])) self.mean_norm_beta = nn.Parameter(torch.FloatTensor([0.])) self.std_norm_alpha = nn.Parameter(torch.FloatTensor([1.])) self.std_norm_beta = nn.Parameter(torch.FloatTensor([0.])) if dist_type == "none": self.mask_val = float("-inf") else: self.mask_val = 1e-2 if inference_network_type == 'embedding_only': #self.src_encoder = src_embeddings self.tgt_encoder = tgt_embeddings elif inference_network_type == 'brnn': #self.src_encoder = RNNEncoder(rnn_type, True, src_layers, rnn_size, # dropout, src_embeddings, False) self.tgt_encoder = RNNEncoder(rnn_type, True, tgt_layers, rnn_size, dropout, tgt_embeddings, False) elif inference_network_type == 'rnn': #self.src_encoder = RNNEncoder(rnn_type, False, src_layers, rnn_size, # dropout, src_embeddings, False) self.tgt_encoder = RNNEncoder(rnn_type, False, tgt_layers, rnn_size, dropout, tgt_embeddings, False) self.W = torch.nn.Linear(rnn_size, rnn_size) self.rnn_size = rnn_size # to parametrize log normal distribution H = rnn_size if self.attn_type == "general": self.linear_in = nn.Linear(H, H, bias=False) if self.dist_type == "normal": self.W_mu = self.linear_in self.W_sigma = nn.Linear(H, H, bias=False) elif self.attn_type == "mlpadd": self.linear_context = nn.Linear(H, H, bias=False) self.linear_query = nn.Linear(H, H, bias=True) self.v = nn.Linear(H, 1, bias=False) if self.dist_type == "normal": self.v_mu = self.v self.v_sigma = nn.Linear(H, 1, bias=False) elif self.attn_type == "mlp": if self.dist_type == "normal": # TODO(demi): make 100 configurable self.linear_1 = nn.Linear(rnn_size + rnn_size, 500) self.linear_2 = nn.Linear(500, 500) self.mean_out = nn.Linear(500, 1) self.std_out = nn.Linear(500, 1) self.softplus = nn.Softplus() elif self.attn_type == "dotmlp": self.linear_in = nn.Linear(H, H, bias=False) if self.dist_type == "normal": self.W_mu = self.linear_in pass # unfinished if self.normalization == "bn": if self.dist_type == "normal": self.bn_mu = nn.BatchNorm1d(1, affine=True) self.bn_std = nn.BatchNorm1d(1, affine=True) elif self.normalization == "ln": if self.dist_type == "normal": self.mean_norm_alpha = nn.Parameter(torch.Tensor([1])) self.std_norm_alpha = nn.Parameter(torch.Tensor([1])) self.mean_norm_beta = nn.Parameter(torch.Tensor([0])) self.std_norm_beta = nn.Parameter(torch.Tensor([0])) elif self.normalization == "lnsigma": if self.dist_type == "normal": self.mean_norm_beta = nn.Parameter(torch.Tensor([0])) self.std_norm_beta = nn.Parameter(torch.Tensor([0]))
def make_vi_model_mmt(model_opt, fields, gpu, checkpoint=None):
    """
    Args:
        model_opt: the option loaded from checkpoint.
        fields: `Field` objects for the model.
        gpu (bool): whether to use gpu.
        checkpoint: the model generated by train phase, or a resumed
            snapshot model from a stopped training.
    Returns:
        the VI multimodal NMT model.

    - `vi-model1`: a model where there is one global latent variable Z used
      to predict the image features and to inform the decoder initialisation.
    """
    assert model_opt.model_type in ["text", "img", "audio"], \
        ("Unsupported model type %s" % (model_opt.model_type))

    # Infer dimensionality of global image features.
    if model_opt.use_posterior_image_features:
        feat_size = 1000
    else:
        if 'vgg' in model_opt.path_to_train_img_feats.lower():
            feat_size = 4096
        else:
            feat_size = 2048
    model_opt.global_image_features_dim = feat_size

    # Make encoder.
    if model_opt.model_type == "text":
        src_dict = fields["src"].vocab
        feature_dicts = onmt.io.collect_feature_vocabs(fields, 'src')
        src_embeddings = make_embeddings(model_opt, src_dict, feature_dicts)
        if model_opt.multimodal_model_type in MODEL_TYPES:
            encoder = make_encoder(model_opt, src_embeddings)
        else:
            raise Exception("Multi-modal model type not implemented: %s"
                            % model_opt.multimodal_model_type)
    elif model_opt.model_type == "img":
        encoder = ImageEncoder(model_opt.enc_layers, model_opt.brnn,
                               model_opt.rnn_size, model_opt.dropout)
    elif model_opt.model_type == "audio":
        encoder = AudioEncoder(model_opt.enc_layers, model_opt.brnn,
                               model_opt.rnn_size, model_opt.dropout,
                               model_opt.sample_rate, model_opt.window_size)

    # Make decoder.
    tgt_dict = fields["tgt"].vocab
    feature_dicts = onmt.io.collect_feature_vocabs(fields, 'tgt')
    tgt_embeddings = make_embeddings(model_opt, tgt_dict, feature_dicts,
                                     for_encoder=False)

    # Share the embedding matrix -- preprocess with share_vocab required.
    if model_opt.share_embeddings:
        # src/tgt vocab should be the same if `-share_vocab` is specified.
        if src_dict != tgt_dict:
            raise AssertionError('The `-share_vocab` should be set during '
                                 'preprocess if you use share_embeddings!')
        tgt_embeddings.word_lut.weight = src_embeddings.word_lut.weight

    # Image features encoder.
    if model_opt.multimodal_model_type in MODEL_TYPES:
        if model_opt.use_posterior_image_features:
            image_features_type = "posterior"
            image_features_projector = None
        elif "use_local_image_features" in vars(model_opt) \
                and model_opt.use_local_image_features:
            image_features_type = "local"
            image_features_projector = \
                make_encoder_image_local_features(model_opt)
        else:
            assert model_opt.use_global_image_features, \
                'Image features type not recognised. ' \
                'Choose from global, posterior, local.'
            image_features_type = "global"
            image_features_projector = None

        if "use_local_image_features" in vars(model_opt) \
                and model_opt.use_local_image_features:
            image_feats_dim = model_opt.local_image_features_dim
        else:
            image_feats_dim = model_opt.global_image_features_dim

    if model_opt.multimodal_model_type in MODEL_TYPES:
        word_dropout = model_opt.word_dropout
        decoder = StdRNNVIModel1Decoder(
            model_opt.rnn_type, model_opt.brnn,
            model_opt.dec_layers, model_opt.rnn_size,
            model_opt.global_attention,
            model_opt.coverage_attn,
            model_opt.context_gate,
            model_opt.copy_attn,
            model_opt.dropout,
            word_dropout,
            tgt_embeddings,
            model_opt.z_latent_dim,  # additional dimensionality is z_latent_dim
            model_opt.reuse_copy_attn)
    else:
        raise Exception('Model %s not implemented!'
                        % str(model_opt.multimodal_model_type))

    if model_opt.multimodal_model_type in MODEL_TYPES:
        # If we are using a conditional model, we train the variational
        # approximation q using all observations (x, y, v) and a generative
        # network to predict z from x only.
        if model_opt.conditional:
            if image_features_type == 'local':
                # 4 * rnn_size because we concatenate the mean src encoding,
                # the mean tgt encoding, and the results of an attention
                # between the source and image feats and between the target
                # and image feats.
                input_dims = 4 * model_opt.rnn_size
            else:
                input_dims = 2 * model_opt.rnn_size \
                    + model_opt.global_image_features_dim
            # this inference network uses x_1^m, y_1^n, v
            inf_net_global = GlobalFullInferenceNetwork(
                model_opt.z_latent_dim, input_dims, "normal",
                image_features_type=image_features_type)
            # use x_1^m to predict z
            gen_net_global = GlobalInferenceNetwork(
                model_opt.z_latent_dim, model_opt.rnn_size, "normal")
            # create bidirectional LSTM encoder to encode target sentences
            encoder_tgt = RNNEncoder(model_opt.rnn_type, True,
                                     model_opt.enc_layers,
                                     model_opt.rnn_size,
                                     model_opt.dropout, tgt_embeddings)
            # flow hidden dimension
            flow_h_dim = input_dims
        else:
            # use x_1^m to predict z
            inf_net_global = GlobalInferenceNetwork(
                model_opt.z_latent_dim, model_opt.rnn_size, "normal")
            gen_net_global = None
            # there is no target-language encoder
            encoder_tgt = None
            # flow hidden dimension
            flow_h_dim = model_opt.rnn_size

        # Create a separate source-language encoder for the inference
        # network if requested.
        encoder_inference = None
        if model_opt.non_shared_inference_network:
            #encoder_inference = make_encoder(model_opt, src_embeddings)
            src_embeddings_inference = make_embeddings(model_opt, src_dict,
                                                       feature_dicts)
            encoder_inference = MeanEncoder(model_opt.enc_layers,
                                            src_embeddings_inference)

        if "two_step_image_prediction" in vars(model_opt) \
                and model_opt.two_step_image_prediction:
            if model_opt.use_local_image_features:
                image_feats_dim = model_opt.local_image_features_dim
            else:
                image_feats_dim = model_opt.global_image_features_dim

            if model_opt.use_local_image_features:
                # TODO: move hard-coded parameters into `opts.py`
                n_channels = [500, 1000]
                layer_dims = [3, 5]
                image_size = 7
                # predicting feature activations (7x7), not pixels
                inf_net_image_features = ImageDeconvolutionLocalFeatures(
                    input_size=model_opt.z_latent_dim,
                )
                # predict image pixels using the output of the image features
                # prediction (inf_net_image_features)
                # TODO: move hard-coded parameters into `opts.py`
                n_channels = [image_feats_dim, image_feats_dim // 4]
                layer_dims = [7, 50]
                image_size = 100
                input_size = [2048, 7, 7]
                inf_net_image_pixels = ImageDeconvolution(
                    input_size=input_size,
                    image_size=image_size,
                    n_channels=n_channels,
                    n_classes=256,
                    apply_log_softmax=True,
                    layer_dims=layer_dims,
                )
            else:
                # using global or posterior image features
                inf_net_image_features = ImageGlobalInferenceNetwork(
                    model_opt.z_latent_dim, image_feats_dim,
                    model_opt.rnn_size, False, "normal")
                # predict image pixels
                # TODO: move hard-coded parameters into `opts.py`
                n_channels = 3 if model_opt.use_rgb_images else 1
                n_channels = [n_channels] * 2
                layer_dims = [25, 50]
                image_size = 100
                inf_net_image_pixels = ImageDeconvolution(
                    model_opt.z_latent_dim,
                    image_size=image_size,
                    n_channels=n_channels)
            # we are predicting both image features (image_loss == 'logprob')
            # and image pixels (image_loss == 'categorical')
            inf_net_image = (inf_net_image_features, inf_net_image_pixels)
        else:
            # we are only predicting either image features
            # (image_loss != 'categorical') or image pixels
            # (image_loss == 'categorical')
            if model_opt.image_loss != 'categorical':
                print("Creating image inference network")
                if model_opt.use_global_image_features \
                        or model_opt.use_posterior_image_features:
                    inf_net_image = ImageGlobalInferenceNetwork(
                        model_opt.z_latent_dim,
                        model_opt.global_image_features_dim,
                        model_opt.rnn_size, False, "normal")
                elif model_opt.use_local_image_features:
                    # TODO: move hard-coded parameters into `opts.py`
                    n_channels = [500, 1000]
                    layer_dims = [3, 5]
                    image_size = 7
                    # predicting feature activations (7x7), not pixels
                    inf_net_image = ImageDeconvolutionLocalFeatures(
                        input_size=model_opt.z_latent_dim,
                    )
                else:
                    raise Exception("Image features type not recognised.")
                print(inf_net_image)
            else:
                # TODO: move hard-coded parameters into `opts.py`
                n_channels = 3 if model_opt.use_rgb_images else 1
                n_channels = [n_channels] * 2
                image_size = 100
                inf_net_image = ImageDeconvolution(model_opt.z_latent_dim,
                                                   image_size=image_size,
                                                   n_channels=n_channels)
    else:
        raise Exception('Model %s not implemented!'
                        % str(model_opt.multimodal_model_type))

    # Make NMTModel (= encoder + decoder).
    model = NMTVIModel(
        encoder, decoder,
        encoder_inference=encoder_inference,
        inf_net_global=inf_net_global,
        gen_net_global=gen_net_global,
        inf_net_image=inf_net_image,
        multimodal_model_type='vi-model1',
        image_loss_type=model_opt.image_loss,
        image_features_type=image_features_type,
        image_features_projector=image_features_projector,
        two_step_image_prediction=model_opt.two_step_image_prediction
        if "two_step_image_prediction" in vars(model_opt) else False,
        conditional=model_opt.conditional,
        encoder_tgt=encoder_tgt)
    model.model_type = model_opt.model_type

    # Make generator.
    if not model_opt.copy_attn:
        generator = nn.Sequential(
            nn.Linear(model_opt.rnn_size, len(fields["tgt"].vocab)),
            nn.LogSoftmax())
        if model_opt.share_decoder_embeddings:
            generator[0].weight = decoder.embeddings.word_lut.weight
    else:
        generator = CopyGenerator(model_opt.rnn_size, fields["tgt"].vocab)

    # Load the model states from checkpoint or initialize them.
    if checkpoint is not None:
        print('Loading model parameters.')
        model.load_state_dict(checkpoint['model'])
        generator.load_state_dict(checkpoint['generator'])
    else:
        if model_opt.param_init != 0.0:
            print('Initializing model parameters.')
            for p in model.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
            for p in generator.parameters():
                p.data.uniform_(-model_opt.param_init, model_opt.param_init)
        if hasattr(model.encoder, 'embeddings'):
            model.encoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_enc, model_opt.fix_word_vecs_enc)
        if hasattr(model.decoder, 'embeddings'):
            model.decoder.embeddings.load_pretrained_vectors(
                model_opt.pre_word_vecs_dec, model_opt.fix_word_vecs_dec)

    # Add generator to model (this registers it as a parameter of model).
    model.generator = generator

    # Make the whole model leverage GPU if indicated to do so.
    if gpu:
        model.cuda()
    else:
        model.cpu()

    return model
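# Hedged usage sketch for the builder above, following old OpenNMT-py (0.x)
# conventions where checkpoints store 'opt' and 'vocab' and
# onmt.io.load_fields_from_vocab rebuilds the fields from the stored vocab.
# The checkpoint path is a placeholder.
import torch

def _example_load_vi_model(gpu=True):
    checkpoint = torch.load('vi_model_checkpoint.pt',
                            map_location=lambda storage, loc: storage)
    model_opt = checkpoint['opt']
    fields = onmt.io.load_fields_from_vocab(checkpoint['vocab'])
    model = make_vi_model_mmt(model_opt, fields, gpu, checkpoint)
    model.eval()
    return model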