def __init__(self, config, optimizer, loss_object):
    """Build the seq2seq training harness from `config`: the batched dataset,
    encoder, beam-search decoder, reranker, and checkpoint machinery.

    :param config: bundle of hyper-parameters, vocabularies and tensors.
    :param optimizer: optimizer instance tracked by the checkpoint.
    :param loss_object: loss callable used during training.
    """
    # General parameters
    self.config = config
    # mr_vocab_size = len(self.config.mr_lang.word_index)
    #self.config.mr_lang.word_index['UNK'] = mr_vocab_size + 1
    # NOTE(review): 'UNK' is mapped to None rather than a real index (the
    # commented-out lines above suggest the original intent) — confirm that
    # downstream index lookups tolerate a None id.
    self.config.mr_lang.word_index['UNK'] = None
    self.optimizer = optimizer
    self.loss_object = loss_object
    # Shuffled/batched dataset built from the (MR, NL) tensor pair.
    self.dataset = create_tensor(mr_tensor=config.mr_tensor,
                                 nl_tensor=config.nl_tensor,
                                 buffer_size=config.buffer_size,
                                 batch_size=config.batch_size)
    # Encoder
    self.encoder = Encoder(config.vocab_mr_size, config.embedding_dim,
                           config.units, config.batch_size)
    # Decoder
    self.decoder = DecoderBeam(config.vocab_nl_size, config.embedding_dim,
                               config.units, config.batch_size,
                               config.beam_size, config.nl_lang.word_index,
                               config.nl_lang.index_word,
                               config.pointer_generator)
    self.reranker = ReRankerBase(config.reranker_type,
                                 config.gazetteer_reranker)
    # Checkpoint
    self.checkpoint_prefix = os.path.join(config.checkpoint_dir, "ckpt")
    self.checkpoint = tf.train.Checkpoint(optimizer=self.optimizer,
                                          encoder=self.encoder,
                                          decoder=self.decoder)
def inference(self): """ this is for fine-tuning. main inference logic here: invoke transformer model to do inference,input is a sequence, output is also a sequence, get representation of masked token(s) and use a classifier to train the model. # idea of the hidden state of masked position(s): # 1) a batch of position index, 2) one hot it, multiply with total sequence represenation, 3)every where is 0 for the second dimension(sequence_length), 4) only one place is 1, 5) thus we can sum up without loss any information. :return: """ # 1. input representation(input embedding, positional encoding, segment encoding) token_embeddings = tf.nn.embedding_lookup(self.embedding,self.input_x) # [batch_size,sequence_length,embed_size] self.input_representation=tf.add(tf.add(token_embeddings,self.segment_embeddings_lm),self.position_embeddings) # [batch_size,sequence_length,embed_size] # 2. repeat Nx times of building block( multi-head attention followed by Add & Norm; feed forward followed by Add & Norm) encoder_class=Encoder(self.d_model,self.d_k,self.d_v,self.sequence_length,self.h,self.batch_size,self.num_layer,self.input_representation, self.input_representation,dropout_keep_prob=self.dropout_keep_prob,use_residual_conn=self.use_residual_conn) h= encoder_class.encoder_fn() # [batch_size,sequence_length,d_model] # 3. get hidden state of token of [cls], and project it to make a predict. h_cls=h[:,0,:] # [batch_size,d_model] # 4. project representation of masked token(s) to vocab size with tf.variable_scope("fine_tuning"): logits = tf.layers.dense(h_cls, self.num_classes) # shape:[None,self.vocab_size] logits = tf.nn.dropout(logits,keep_prob=self.dropout_keep_prob) # shape:[None,self.num_classes] return logits # shape:[None,self.num_classes]
def build_model(self):
    """Instantiate encoder, decoder and discriminator on the target device,
    then restore any previously saved weights via `load_model`."""
    device = self.device
    # The decoder mirrors the encoder: storing_channels back to in_channels.
    self.Encoder = Encoder(self.in_channels, self.storing_channels, self.nf).to(device)
    self.Decoder = Decoder(self.storing_channels, self.in_channels, self.nf).to(device)
    # NOTE: the attribute name 'Disciminator' (sic) is preserved because
    # external code may reference it.
    self.Disciminator = Discriminator().to(device)
    self.load_model()
def inference_lm(self):
    """Pre-training (masked language model) forward pass.

    Embeds the masked input sequence, runs the Transformer encoder stack,
    gathers the hidden state(s) at the masked position(s) via a one-hot
    select-and-sum, and projects that representation to vocabulary logits.

    # idea of the hidden state of masked position(s):
    # 1) a batch of position index, 2) one hot it, multiply with total
    # sequence representation, 3) every where is 0 for the second dimension
    # (sequence_length), 4) only one place is 1, 5) thus we can sum up
    # without losing any information.
    :return: logits tensor, shape [batch_size, vocab_size]
    """
    # 1. input representation (input embedding + segment encoding + positional encoding)
    token_embeddings = tf.nn.embedding_lookup(self.embedding, self.x_mask_lm)  # [batch_size,sequence_length,embed_size]
    self.input_representation_lm = tf.add(tf.add(token_embeddings, self.segment_embeddings_lm), self.position_embeddings_lm)  # [batch_size,sequence_length,embed_size]
    # 2. repeat Nx times of building block (multi-head attention followed by Add & Norm; feed forward followed by Add & Norm)
    encoder_class = Encoder(self.d_model, self.d_k, self.d_v, self.sequence_length_lm, self.h, self.batch_size, self.num_layer, self.input_representation_lm,
                            self.input_representation_lm, dropout_keep_prob=self.dropout_keep_prob, use_residual_conn=self.use_residual_conn)
    h_lm = encoder_class.encoder_fn()  # [batch_size,sequence_length,d_model]
    # 3. get last hidden state of the masked position(s), and project it to make a predict.
    p_mask_lm_onehot = tf.one_hot(self.p_mask_lm, self.sequence_length_lm)  # [batch_size, sequence_length_lm]
    p_mask_lm_expand = tf.expand_dims(p_mask_lm_onehot, axis=-1)  # [batch_size, sequence_length_lm, 1]
    # Zero out every position except the masked one, then sum over time:
    # this extracts the masked position's hidden state without a gather.
    h_lm_multiply = tf.multiply(h_lm, p_mask_lm_expand)  # [batch_size,sequence_length,d_model]
    h_lm_representation = tf.reduce_sum(h_lm_multiply, axis=1)  # [batch_size,d_model]
    # 4. project representation of masked token(s) to vocab size
    with tf.variable_scope("pre_training"):
        logits_lm = tf.layers.dense(h_lm_representation, self.vocab_size)  # shape:[None,self.vocab_size]
        logits_lm = tf.nn.dropout(logits_lm, keep_prob=self.dropout_keep_prob)  # shape:[None,self.vocab_size]
    return logits_lm  # shape:[None,self.vocab_size]
def __init__(self, vocab_size_src: int, vocab_size_trg: int, eos_int: int,
             sos_int: int, dim_embed_src: int = 512,
             src_map_i2c: Union[Dict[int, str], None] = None,
             trg_map_i2c: Union[Dict[int, str], None] = None,
             dim_embed_trg: int = 512, num_neurons_encoder: int = 512,
             num_neurons_decoder: int = 512,
             optim: object = GradientDescentMomentum):
    """Sequence-to-sequence model holding an encoder/decoder RNN pair.

    :param vocab_size_src: source vocabulary size.
    :param vocab_size_trg: target vocabulary size.
    :param eos_int: integer id of the end-of-sequence token.
    :param sos_int: integer id of the start-of-sequence token.
    :param src_map_i2c: optional source id -> token map (for display).
    :param trg_map_i2c: optional target id -> token map (for display).
    :param optim: optimizer class used by both encoder and decoder.
    :raises AssertionError: if the weight-tying or symmetry constraints
        on embedding/neuron dimensions are violated.
    """
    # Weight tying requires the decoder hidden size to equal its embedding dim.
    # FIX: the original message concatenated "has to be" + "the same" with no
    # separating space, producing "has to bethe same".
    assert dim_embed_trg == num_neurons_decoder, (
        "For weight tying, the number of neurons in the decoder has to be " +
        "the same as the number of dimensions in the embedding")
    # These don't have to be equal. If they aren't, you will need an
    # additional weight matrix after the encoder to project down or
    # up to the dimensionality of the decoder, adding extra complexity.
    # Kept symmetric for simplicities sake
    assert num_neurons_decoder == num_neurons_encoder, (
        "Currently this model only supports symmetric decoders and encoders")
    self.src_dim = vocab_size_src
    self.trg_map_i2c = trg_map_i2c
    self.src_map_i2c = src_map_i2c
    self.optim = optim
    self.encoder = Encoder(vocab_size_src, dim_embed_src,
                           num_neurons_encoder, optim)
    self.decoder = Decoder(vocab_size_trg, dim_embed_trg,
                           num_neurons_decoder, optim)
    self.eos_int = eos_int
    self.sos_int = sos_int
def setup_model(args):
    """Sets up the model.

    Builds the classifier, an SGD optimizer over its parameters, and a
    cross-entropy loss; model and loss are moved to DEVICE.

    Returns:
        (model, optimizer, loss_fn) tuple.
    """
    model = Encoder(args.num_classes).to(DEVICE)
    sgd = optim.SGD(
        model.parameters(),
        lr=args.learning_rate,
        momentum=args.momentum,
        weight_decay=args.weight_decay,
    )
    criterion = nn.CrossEntropyLoss(ignore_index=args.ignore_index).to(DEVICE)
    return model, sgd, criterion
def __init__(self):
    """Tacotron2: character embedding + encoder/decoder + postnet."""
    super(Tacotron2, self).__init__()
    self.embedding = nn.Embedding(hps.n_symbols, hps.character_embedding_dim)
    # Glorot-style uniform init: std = sqrt(2 / (fan_in + fan_out)),
    # uniform bound = sqrt(3) * std.
    std = sqrt(2.0 / (hps.n_symbols + hps.character_embedding_dim))
    bound = sqrt(3.0) * std
    self.embedding.weight.data.uniform_(-bound, bound)
    self.encoder = Encoder()
    self.decoder = Decoder()
    self.postnet = PostNet()
def _resolve_mode(self, config):
    """Select the operating mode from which encoders have non-empty output.

    Both encoders active -> HyperPocket; only the random (VAE) encoder ->
    HyperCloud; only the real encoder -> HyperRec; neither -> error.
    """
    random_cfg = config['random_encoder']
    real_cfg = config['real_encoder']
    self.random_encoder_output_size = random_cfg['output_size']
    use_random = random_cfg['output_size'] > 0
    use_real = real_cfg['output_size'] > 0
    if use_random and use_real:
        self.mode = HyperPocket()
        self.random_encoder = Encoder(random_cfg, is_vae=True)
        self.real_encoder = Encoder(real_cfg, is_vae=False)
    elif use_random:
        self.mode = HyperCloud()
        self.random_encoder = Encoder(random_cfg, is_vae=True)
    elif use_real:
        self.mode = HyperRec()
        self.real_encoder = Encoder(real_cfg, is_vae=False)
    else:
        raise ValueError("at least one encoder should have non zero output")
def __init__(self, embedding_input, input_mask, embedding_question,
             vocab_size, max_mask_length, params):
    """Store graph inputs, build the configured encoder, and run inference.

    The forward pass is executed eagerly at construction time; its result
    is exposed as `self.output`.
    """
    self.embedding_input = embedding_input
    self.input_mask = input_mask
    self.embedding_question = embedding_question
    self.vocab_size = vocab_size
    self.max_mask_length = max_mask_length
    self.params = params
    model_cfg = params.model
    self.encoder = Encoder(encoder_type=model_cfg.encoder_type,
                           num_layers=model_cfg.num_layers,
                           cell_type=model_cfg.cell_type,
                           num_units=model_cfg.num_units,
                           dropout=model_cfg.dropout)
    self.output = self.inference()
def __init__(self, config: HiDDenConfiguration, noiser: Noiser):
    """Compose the watermark pipeline: encoder -> noise layers -> decoder."""
    super(EncoderDecoder, self).__init__()
    # Encoder embeds the message; decoder recovers it after the noiser.
    self.encoder = Encoder(config)
    self.decoder = Decoder(config)
    self.noiser = noiser
def __init__(self, args):
    """Seq2Seq wrapper: keep the beam width and build encoder/decoder from args."""
    super(Seq2Seq, self).__init__()
    # Beam width used at decode time.
    self.beam = args.beam
    self.encoder = Encoder(args)
    self.decoder = Decoder(args)
def __init__(self, num_encoder, num_decoder, d_model, num_heads, dff,
             inp_max_seq_len, tar_max_seq_len, inp_vocab_size,
             tar_vocab_size, rate=0.1):
    """Encoder/decoder Transformer with a final projection to the target vocab.

    Encoder and decoder share d_model, num_heads, dff and dropout rate;
    they differ only in depth, max sequence length and vocabulary size.
    """
    super(Transformer, self).__init__()
    shared = {
        "num_heads": num_heads,
        "d_model": d_model,
        "dff": dff,
        "rate": rate,
    }
    self.encoder = Encoder(num_encoder=num_encoder,
                           max_seq_len=inp_max_seq_len,
                           vocab_size=inp_vocab_size,
                           **shared)
    self.decoder = Decoder(num_decoder=num_decoder,
                           max_seq_len=tar_max_seq_len,
                           vocab_size=tar_vocab_size,
                           **shared)
    self.final_dense = keras.layers.Dense(tar_vocab_size)
def __init__(self, args):
    """Pointer network for sentence ordering.

    Builds: a token embedding + BiLSTM sentence encoder, a self-attention
    (graph) encoder over sentence vectors, an LSTM decoder, the pointer
    MLP, and auxiliary pairwise "future"/"history" relation heads.
    """
    super(PointerNet, self).__init__()
    self.emb_dp = args.input_drop_ratio
    self.model_dp = args.drop_ratio
    self.d_emb = args.d_emb
    self.sen_enc_type = args.senenc
    self.src_embed = nn.Embedding(args.doc_vocab, self.d_emb)
    h_dim = args.d_rnn
    d_mlp = args.d_mlp
    # sentence encoder: BiLSTM with half-size directions so outputs are h_dim wide
    self.sen_enc = nn.LSTM(self.d_emb, args.d_rnn // 2,
                           bidirectional=True, batch_first=True)
    # self-attention encoder stacked args.gnnl times over sentence vectors
    selfatt_layer = EncoderLayer(h_dim, 4, 512, args.attdp)
    self.encoder = Encoder(selfatt_layer, args.gnnl)
    self.decoder = nn.LSTM(h_dim, h_dim, batch_first=True)
    # pointer net: two projections (query/key) then a scalar score
    self.linears = nn.ModuleList([
        nn.Linear(h_dim, d_mlp, False),
        nn.Linear(h_dim, d_mlp, False),
        nn.Linear(d_mlp, 1, False)
    ])
    self.critic = None
    labelemb_dim = args.d_label
    d_pair = args.d_pair
    # weight of the auxiliary relation loss
    self.lamb = args.lamb_rela
    # future ffn: pairwise features for the "future" relation head
    self.future = nn.Sequential(nn.Linear(h_dim * 2, h_dim * 2, False),
                                nn.ReLU(), nn.Dropout(0.1),
                                nn.Linear(h_dim * 2, d_pair, False),
                                nn.ReLU(), nn.Dropout(0.1))
    self.w3 = nn.Linear(d_pair, 2, False)
    self.hist_left1 = nn.Sequential(nn.Linear(h_dim * 2, h_dim * 2, False),
                                    nn.ReLU(), nn.Dropout(0.1),
                                    nn.Linear(h_dim * 2, d_pair, False),
                                    nn.ReLU(), nn.Dropout(0.1))
    # for sind, l2 half dim
    self.hist_left2 = nn.Sequential(nn.Linear(h_dim * 2, h_dim * 2, False),
                                    nn.ReLU(), nn.Dropout(0.1),
                                    nn.Linear(h_dim * 2, d_pair, False),
                                    nn.ReLU(), nn.Dropout(0.1))
    self.wleft1 = nn.Linear(d_pair, 2, False)
    self.wleft2 = nn.Linear(d_pair, 2, False)
    # new key: pairwise features + label embedding projected into the
    # decoder's key space for pointer attention
    d_pair_posi = d_pair + labelemb_dim
    self.pw_k = nn.Linear(d_pair_posi * 4, h_dim, False)
    self.pw_e = nn.Linear(h_dim, 1, False)
def __init__(self, embedding_info: Dict, encoder_info: Dict,
             decoder_info: Dict, hidden_states: Dict, token_to_id: Dict,
             type_to_id: Dict, label_to_id: Dict):
    """Wire embedding -> encoder -> decoder from their configuration dicts.

    `hidden_states` supplies the 'embedding', 'encoder' and 'decoder'
    hidden sizes shared across the three sub-modules.
    """
    super().__init__()
    self.embedding_info = embedding_info
    self.encoder_info = encoder_info
    self.decoder_info = decoder_info
    self.hidden_states = hidden_states
    self.token_to_id = token_to_id
    self.type_to_id = type_to_id
    self.label_to_id = label_to_id
    h_emb = self.hidden_states['embedding']
    h_enc = self.hidden_states['encoder']
    h_dec = self.hidden_states['decoder']
    self.embedding = Embedding(h_emb=h_emb,
                               token_to_id=self.token_to_id,
                               type_to_id=self.type_to_id,
                               **self.embedding_info)
    self.encoder = Encoder(h_emb=h_emb, h_enc=h_enc, **self.encoder_info)
    self.decoder = Decoder(h_enc=h_enc, h_dec=h_dec,
                           label_to_id=self.label_to_id,
                           **self.decoder_info)
def __init__(self, enc_chs=(1, 32, 64, 128, 256, 512), n_domains=2):
    """Encoder backbone followed by a conv/linear head producing domain logits."""
    super().__init__()
    self.encoder = Encoder(enc_chs)
    head_layers = [
        nn.Conv2d(1024, 512, kernel_size=3, padding=1, bias=False),
        nn.BatchNorm2d(512),
        nn.ReLU(),
        nn.MaxPool2d(2),
        View(32768),  # flatten feature map to a 32768-dim vector
        nn.Linear(32768, n_domains),
    ]
    self.head = nn.Sequential(*head_layers)
def build_transformer(source_vocab, target_vocab, trg_pad_idx, src_pad_idx,
                      num_layers=6, num_attention_layers=8, d_model=512,
                      d_ff=2048, dropout=0.1):
    """Assemble a full Transformer: embeddings + positional encoding,
    encoder/decoder stacks, and the output generator.

    :param source_vocab: source vocabulary size.
    :param target_vocab: target vocabulary size.
    :param trg_pad_idx: target padding token index.
    :param src_pad_idx: source padding token index.
    :param num_layers: encoder/decoder depth.
    :param num_attention_layers: attention heads per layer.
    :returns: initialized Transformer model.
    """
    # we can do a shared vocab here to share the weights for two embeddings
    # and the output generator projection
    positional_encoder = PositionalEncoder(d_model, dropout)
    encoder = Encoder(num_layers, num_attention_layers, d_model, d_ff,
                      dropout=dropout)
    decoder = Decoder(num_layers, num_attention_layers, d_model, d_ff,
                      dropout=dropout)
    source_embedding = nn.Sequential(
        Embedder(source_vocab, d_model, src_pad_idx),
        copy.deepcopy(positional_encoder))
    target_embedding = nn.Sequential(
        Embedder(target_vocab, d_model, trg_pad_idx),
        copy.deepcopy(positional_encoder))
    generator = OutputGenerator(d_model, target_vocab)
    model = Transformer(encoder, decoder, generator, source_embedding,
                        target_embedding, trg_pad_idx, src_pad_idx)
    # Glorot-initialize all weight matrices (dim > 1 skips biases).
    # FIX: nn.init.xavier_uniform is deprecated; use the in-place
    # nn.init.xavier_uniform_ instead.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    """Build an Annotated-Transformer-style model on `args.device`.

    :param src_vocab: source vocabulary size.
    :param tgt_vocab: target vocabulary size.
    :param N: number of encoder and decoder layers.
    :param d_model: model (embedding) dimension.
    :param d_ff: feed-forward inner dimension.
    :param h: number of attention heads.
    :param dropout: dropout probability.
    :return: the assembled Transformer, moved to `args.device`.
    """
    c = copy.deepcopy
    # Prototype sublayers; each Encoder/Decoder layer gets its own deepcopy
    # so no parameters are shared across layers.
    attn = MultiHeadedAttention(h, d_model).to(args.device)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout).to(args.device)
    position = PositionalEncoding(d_model, dropout).to(args.device)
    model = Transformer(
        Encoder(
            EncoderLayer(d_model, c(attn), c(ff), dropout).to(args.device),
            N).to(args.device),
        Decoder(
            DecoderLayer(d_model, c(attn), c(attn), c(ff),
                         dropout).to(args.device), N).to(args.device),
        nn.Sequential(
            Embeddings(d_model, src_vocab).to(args.device), c(position)),
        nn.Sequential(
            Embeddings(d_model, tgt_vocab).to(args.device), c(position)),
        Generator(d_model, tgt_vocab)).to(args.device)
    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model.to(args.device)
def _initialize_layers(self, input_data_size, layer_list=None):
    """Create a stacked autoencoder: one Encoder/Decoder pair per entry of
    `layer_list`, plus a checkpoint directory per layer.

    Args:
        input_data_size: width of the raw input fed to the first encoder.
        layer_list: iterable of encoder output sizes, outermost first.
            FIX: treated as empty when None — previously the default value
            crashed with `TypeError: 'NoneType' object is not iterable`.
    """
    if layer_list is None:
        layer_list = []
    prev_output_size = input_data_size
    for ind, layer_size in enumerate(layer_list):
        # Create an Encoder and Decoder for each element in the layer list.
        # For encoders, output size is the current element of layer list.
        # For decoders, output size is equal to the *input* size of the
        # Encoder at this layer, because Decoder(Encoder(data)) == data.
        self.encode_layers.append(
            Encoder(layer_size, name="Encoder_Layer_" + str(ind)))
        if ind == 0:
            self.decode_layers.append(
                Decoder(prev_output_size,
                        name="Decoder_Layer_" + str(ind),
                        end=True))
        else:
            self.decode_layers.append(
                Decoder(prev_output_size, name="Decoder_Layer_" + str(ind)))
        self.layer_sizes.append((prev_output_size, layer_size))
        # Build checkpoint directories for each layer.
        cpt_dirname = FLAGS.checkpoint_dir + self.get_layer_checkpoint_dirname(
            ind)
        if not path.isdir(cpt_dirname):
            makedirs(cpt_dirname)
        prev_output_size = layer_size
def __init__(self, src_pad_idx, trg_pad_idx, trg_sos_idx, enc_voc_size,
             dec_voc_size, d_model, n_head, max_len, ffn_hidden, n_layers,
             drop_prob, device):
    """Transformer wrapper: special-token bookkeeping plus the two stacks.

    Encoder and decoder share every hyper-parameter except vocabulary size.
    """
    super().__init__()
    # Indices used later when building padding / look-ahead masks.
    self.src_pad_idx = src_pad_idx
    self.trg_pad_idx = trg_pad_idx
    self.trg_sos_idx = trg_sos_idx
    self.device = device
    shared = dict(d_model=d_model,
                  n_head=n_head,
                  max_len=max_len,
                  ffn_hidden=ffn_hidden,
                  drop_prob=drop_prob,
                  n_layers=n_layers,
                  device=device)
    self.encoder = Encoder(enc_voc_size=enc_voc_size, **shared)
    self.decoder = Decoder(dec_voc_size=dec_voc_size, **shared)
def __init__(self, activate_encoder):
    """Tacotron3: full Encoder when `activate_encoder`, else SimpleEncoder."""
    super(Tacotron3, self).__init__()
    self.encoder = Encoder() if activate_encoder else SimpleEncoder()
    # The decoder adapts its input size to the encoder variant.
    self.decoder = Decoder(activate_encoder)
    self.postnet = Postnet()
def __init__(self, encoder_cfg=None, decoder_cfg=None, with_att=False):
    """Encoder/decoder model; `with_att` selects the attention decoder.

    NOTE(review): the None defaults are not actually usable —
    `Encoder(**encoder_cfg)` raises TypeError when encoder_cfg is None, so
    callers must pass both config dicts. Confirm whether the defaults
    should be empty dicts instead.
    """
    super(Model, self).__init__()
    self.encoder = Encoder(**encoder_cfg)
    # Decoder implementation chosen lazily so only the needed module loads.
    if with_att:
        from model.decoder_att import Decoder
    else:
        from model.decoder import Decoder
    self.decoder = Decoder(**decoder_cfg)
def __init__(self, latent_dim: int = 128, image_channels: int = 3):
    """Autoencoder over 72x72 images with a `latent_dim`-sized bottleneck."""
    super().__init__()
    self.latent_dim = latent_dim
    # Encoder compresses the image; decoder reconstructs it at 72x72.
    self.encoder = Encoder(in_channels=image_channels,
                           latent_dim=latent_dim)
    self.decoder = Decoder(latent_dims=latent_dim,
                           out_channels=image_channels,
                           image_size=(72, 72))
def __init__(self, embed_dims, num_chars, encoder_dims, decoder_dims, n_mels,
             fft_bins, postnet_dims, encoder_K, lstm_dims, postnet_K,
             num_highways, dropout, speaker_latent_dims, speaker_encoder_dims,
             n_speakers, noise_latent_dims, noise_encoder_dims):
    """Tacotron with VAE-style speaker/noise latents and domain-adversarial
    classifiers; standard Tacotron blocks plus latent encoders and priors."""
    super().__init__()
    self.n_mels = n_mels
    self.lstm_dims = lstm_dims
    self.decoder_dims = decoder_dims
    # Standard Tacotron #############################################################
    self.encoder = Encoder(embed_dims, num_chars, encoder_dims, encoder_K,
                           num_highways, dropout)
    self.encoder_proj = nn.Linear(decoder_dims, decoder_dims, bias=False)
    # Decoder additionally conditions on the speaker and noise latents.
    self.decoder = Decoder(n_mels, decoder_dims, lstm_dims,
                           speaker_latent_dims, noise_latent_dims)
    self.postnet = CBHG(postnet_K, n_mels + noise_latent_dims, postnet_dims,
                        [256, n_mels + noise_latent_dims], num_highways)
    self.post_proj = nn.Linear(postnet_dims * 2, fft_bins, bias=False)
    # VAE Domain Adversarial ########################################################
    # Latent-encoder architecture is selected by the hp.encoder_model flag.
    # NOTE(review): no else branch — any other hp.encoder_model value leaves
    # speaker_encoder/noise_encoder unset; confirm the flag is validated upstream.
    if hp.encoder_model == "CNN":
        self.speaker_encoder = CNNEncoder(n_mels, speaker_latent_dims,
                                          speaker_encoder_dims)
        self.noise_encoder = CNNEncoder(n_mels, noise_latent_dims,
                                        noise_encoder_dims)
    elif hp.encoder_model == "CNNRNN":
        self.speaker_encoder = CNNRNNEncoder(n_mels, speaker_latent_dims,
                                             speaker_encoder_dims)
        self.noise_encoder = CNNRNNEncoder(n_mels, noise_latent_dims,
                                           noise_encoder_dims)
    # Adversarial classifier heads: each latent predicts both speaker
    # identity (n_speakers classes) and a binary noise label.
    self.speaker_speaker = Classifier(speaker_latent_dims, n_speakers)
    self.speaker_noise = Classifier(speaker_latent_dims, 2)
    self.noise_speaker = Classifier(noise_latent_dims, n_speakers)
    self.noise_noise = Classifier(noise_latent_dims, 2)
    ## speaker encoder prior: fixed standard normal (non-trainable loc/scale)
    self.speaker_latent_loc = nn.Parameter(
        torch.zeros(speaker_latent_dims), requires_grad=False)
    self.speaker_latent_scale = nn.Parameter(
        torch.ones(speaker_latent_dims), requires_grad=False)
    self.speaker_latent_prior = dist.Independent(
        dist.Normal(self.speaker_latent_loc, self.speaker_latent_scale), 1)
    ## noise encoder prior: fixed standard normal (non-trainable loc/scale)
    self.noise_latent_loc = nn.Parameter(torch.zeros(noise_latent_dims),
                                         requires_grad=False)
    self.noise_latent_scale = nn.Parameter(torch.ones(noise_latent_dims),
                                           requires_grad=False)
    self.noise_latent_prior = dist.Independent(
        dist.Normal(self.noise_latent_loc, self.noise_latent_scale), 1)
    #################################################################################
    self.init_model()
    self.num_params()
    # Persistent counters saved with the checkpoint: training step and
    # the current reduction factor r.
    self.register_buffer("step", torch.zeros(1).long())
    self.register_buffer("r", torch.tensor(0).long())
def inference(self): """ main inference logic here: invoke transformer model to do inference. input is a sequence, output is also a sequence. input representation--> :return: """ # 1. input representation(input embedding, positional encoding, segment encoding) token_embeddings = tf.nn.embedding_lookup(self.embedding,self.input_x) # [batch_size,sequence_length,embed_size] self.input_representation=tf.add(tf.add(token_embeddings,self.segment_embeddings),self.position_embeddings) # [batch_size,sequence_length,embed_size] # 2. repeat Nx times of building block( multi-head attention followed by Add & Norm; feed forward followed by Add & Norm) encoder_class=Encoder(self.d_model,self.d_k,self.d_v,self.sequence_length,self.h,self.batch_size,self.num_layer,self.input_representation, self.input_representation,dropout_keep_prob=self.dropout_keep_prob,use_residual_conn=self.use_residual_conn) h = encoder_class.encoder_fn() # [batch_size,sequence_length,d_model] # 3. get logits for different tasks by applying projection layer logits=self.project_tasks(h) # shape:[None,self.num_classes] return logits # shape:[None,self.num_classes]
def _make_model(self):
    """Assemble the full Seq2Seq model from config: frozen pretrained
    embedding, RNN encoder, bridge, attentional RNN-cell decoder.

    :return: the assembled Seq2Seq model.
    :raises ValueError: if `attention_type` is not one of the supported names.
    """
    # embedding: loaded from a pretrained .npy file and frozen
    embedding = nn.Embedding(num_embeddings=self._config.vocab_size,
                             embedding_dim=self._config.embed_size)
    embedding.weight.data.copy_(
        torch.from_numpy(np.load(self._config.embedding_file_name)))
    embedding.weight.requires_grad = False
    # encoder
    encoder = Encoder(rnn_type=self._config.rnn_type,
                      embed_size=self._config.embed_size,
                      hidden_size=self._config.hidden_size,
                      num_layers=self._config.num_layers,
                      bidirectional=self._config.bidirectional,
                      dropout=self._config.dropout)
    # bridge: maps the encoder's final state to the decoder's initial state
    bridge = Bridge(rnn_type=self._config.rnn_type,
                    hidden_size=self._config.hidden_size,
                    bidirectional=self._config.bidirectional)
    # decoder rnn cell (input is embedding + context, hence 2 * embed_size)
    if self._config.rnn_type == 'LSTM':
        rnn_cell = MultiLayerLSTMCells(
            input_size=2 * self._config.embed_size,
            hidden_size=self._config.hidden_size,
            num_layers=self._config.num_layers,
            dropout=self._config.dropout)
    else:
        rnn_cell = MultiLayerGRUCells(input_size=2 * self._config.embed_size,
                                      hidden_size=self._config.hidden_size,
                                      num_layers=self._config.num_layers,
                                      dropout=self._config.dropout)
    # attention: variant selected by config
    if self._config.attention_type == 'Dot':
        attention = DotAttention()
    elif self._config.attention_type == 'ScaledDot':
        attention = ScaledDotAttention()
    elif self._config.attention_type == 'Additive':
        attention = AdditiveAttention(query_size=self._config.hidden_size,
                                      key_size=self._config.hidden_size)
    elif self._config.attention_type == 'Multiplicative':
        attention = MultiplicativeAttention(
            query_size=self._config.hidden_size,
            key_size=self._config.hidden_size)
    elif self._config.attention_type == 'MLP':
        attention = MultiLayerPerceptronAttention(
            query_size=self._config.hidden_size,
            key_size=self._config.hidden_size,
            out_size=1)
    else:
        raise ValueError('No Supporting.')
    # decoder
    decoder = Decoder(embedding, rnn_cell, attention,
                      self._config.hidden_size)
    # model
    model = Seq2Seq(embedding, encoder, bridge, decoder)
    return model
def __init__(self, config):
    """Renaming model: a decoder built with rename=True, plus an optional
    soft-memory encoder/decoder pair attached when mem_mask == 'soft'."""
    super().__init__()
    decoder_cfg = {**config["decoder"], "rename": True}
    self.decoder = Decoder.build(decoder_cfg)
    self.soft_mem_mask = config["decoder"]["mem_mask"] == "soft"
    if self.soft_mem_mask:
        # Attach the memory modules to the main decoder so it can use
        # them during decoding.
        self.mem_encoder = Encoder.build(config["mem_encoder"])
        self.mem_decoder = Decoder.build(config["mem_decoder"])
        self.decoder.mem_encoder = self.mem_encoder
        self.decoder.mem_decoder = self.mem_decoder
    self.beam_size = config["test"]["beam_size"]
def __init__(self):
    """Model: embedding + encoder + decoder, each moved via get_cuda."""
    super(Model, self).__init__()
    embeds = nn.Embedding(config.vocab_size, config.emb_dim)
    # Normal init of embedding weights before any device transfer.
    tools.init_wt_normal(embeds.weight)
    self.encoder = get_cuda(Encoder())
    self.decoder = get_cuda(Decoder())
    self.embeds = get_cuda(embeds)
def __init__(self, config):
    """Retyping model: decoder (subtype-aware when configured) plus an
    optional soft-memory encoder/decoder pair when mem_mask == 'soft'."""
    super().__init__()
    decoder_cfg = dict(config["decoder"])
    self.decoder = Decoder.build(decoder_cfg)
    self.subtype = config["decoder"]["type"] in ["XfmrSubtypeDecoder"]
    self.soft_mem_mask = config["decoder"]["mem_mask"] == "soft"
    if self.soft_mem_mask:
        # Attach the memory modules to the main decoder so it can use
        # them during decoding.
        self.mem_encoder = Encoder.build(config["mem_encoder"])
        self.mem_decoder = Decoder.build(config["mem_decoder"])
        self.decoder.mem_encoder = self.mem_encoder
        self.decoder.mem_decoder = self.mem_decoder
    self.beam_size = config["test"]["beam_size"]
def __init__(self, config: HiDDenConfiguration, noiser: Noiser,
             apply_quantization: bool = False):
    """Compose the watermark pipeline: encoder -> noise layers -> decoder.

    `apply_quantization` is currently accepted but unused.
    TODO: consider adding quantization after the noiser if the parameter
    value is true.
    TODO: consider making quantization part of noise configuration.
    """
    super(EncoderDecoder, self).__init__()
    self.encoder = Encoder(config)
    self.noiser = noiser  # TODO: we were passing device to Noiser
    self.decoder = Decoder(config)
def initialize(self, batch):
    '''Build the TF1 inference graph for one batch: encoder, (training or
    testing) decoder, and alignment extraction; results are stored on self.

    param: batch: Batch object
    '''
    with tf.variable_scope('inference') as scope:
        # Presence of linear targets distinguishes training from inference.
        linear_targets = batch._lin_targets
        is_training = linear_targets is not None
        batch_size = batch.get_size()
        # Encoder
        encoder = Encoder(is_training=is_training)
        encoder_outputs = encoder.encode(batch.get_embedds(),
                                         batch.get_input_lengths())
        # Decoder: teacher-forced helper in training, free-running otherwise.
        if is_training:
            helper = TrainingHelper(batch.get_inputs(),
                                    batch.get_mel_targets(),
                                    self._hparams.num_mels,
                                    self._hparams.outputs_per_step)
        else:
            helper = TestingHelper(batch_size, self._hparams.num_mels,
                                   self._hparams.outputs_per_step)
        decoder = Decoder(helper, is_training=is_training)
        mel_outputs, lin_outputs, final_decoder_state = decoder.decode(
            encoder_outputs, batch_size)
        # Alignments: stack attention history to [batch, enc_steps, dec_steps].
        alignments = tf.transpose(
            final_decoder_state[0].alignment_history.stack(), [1, 2, 0])
        self.inputs, self.input_lengths, self.mel_targets, self.linear_targets = batch.get_all(
        )
        self.mel_outputs = mel_outputs
        self.linear_outputs = lin_outputs
        self.alignments = alignments
        self.global_step = tf.Variable(0, name='global_step', trainable=False)