def __init__(self):
    self.data = PrepareData()
    self.dataset = Seq2SeqDataset()
    self.data_loader = DataLoader(dataset=self.dataset, batch_size=1,
                                  shuffle=True)
    self.lang_1 = self.data.lang_1
    self.lang_2 = self.data.lang_2
    self.char2index = self.data.char2index
    self.index2char = self.data.index2char

    self.input_size = 100
    self.hidden_size = 64
    self.output_size = 100
    self.learning_rate = 0.01
    self.num_epoch = 500
    self.teacher_forcing = True

    self.use_cuda = torch.cuda.is_available()
    self.device = 'cuda:0' if self.use_cuda else 'cpu'

    self.encoder = EncoderRNN(input_size=self.input_size,
                              hidden_size=self.hidden_size)
    self.decoder = DecoderRNN(output_size=self.output_size,
                              hidden_size=self.hidden_size)
    self.attn_decoder = AttnDecoder(self.hidden_size, self.output_size)
    if self.use_cuda:
        self.encoder = self.encoder.to(self.device)
        self.decoder = self.decoder.to(self.device)

    self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(),
                                              lr=self.learning_rate)
    self.decoder_optimizer = torch.optim.Adam(self.decoder.parameters(),
                                              lr=self.learning_rate)
    self.loss_function = nn.NLLLoss()
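# Hedged usage note: nn.NLLLoss expects log-probabilities, so the decoder
# above is assumed to end in a LogSoftmax layer. A minimal, self-contained
# sketch of one loss evaluation (vocab size 100 matches output_size above):
import torch
import torch.nn as nn

log_probs = nn.LogSoftmax(dim=1)(torch.randn(1, 100))  # (batch=1, |V|=100)
target = torch.tensor([42])                            # gold symbol index
step_loss = nn.NLLLoss()(log_probs, target)            # scalar tensor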
def parse_options():
    parser = argparse.ArgumentParser()

    Train.add_parse_options(parser)
    Encoder.add_parse_options(parser)
    AttnDecoder.add_parse_options(parser)
    Seq2SeqModel.add_parse_options(parser)
    LMModel.add_parse_options(parser)
    BeamSearch.add_parse_options(parser)

    parser.add_argument("-dev", default=False, action="store_true",
                        help="Get dev set results using the last saved model")
    parser.add_argument("-test", default=False, action="store_true",
                        help="Get test results using the last saved model")

    args = parser.parse_args()
    args = vars(args)
    return process_args(args)
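# Hedged sketch of the expected entry point; the train/evaluate branches
# are assumptions based on the -dev/-test flags registered above:
if __name__ == "__main__":
    options = parse_options()
    if options.dev or options.test:
        pass  # evaluate using the last saved model
    else:
        pass  # train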
@classmethod
def class_params(cls):
    params = Bunch()

    # Task specification
    params['tasks'] = ['char']
    params['num_layers'] = {'char': 4}
    params['max_output'] = {'char': 120}

    # Optimization params
    params['learning_rate'] = 1e-3
    params['learning_rate_decay_factor'] = 0.5
    params['max_gradient_norm'] = 5.0

    # Loss params
    params['avg'] = True

    params['encoder_params'] = Encoder.class_params()
    params['decoder_params'] = {'char': AttnDecoder.class_params()}
    return params
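# Hedged usage sketch: Bunch is assumed to be a dict with attribute access,
# so callers can override a single default before constructing the model.
# class_params is assumed to belong to Seq2SeqModel, as its use in the
# initializer below suggests; data_iter is a placeholder here.
params = Seq2SeqModel.class_params()
params['learning_rate'] = 5e-4   # override one default
data_iter = None                 # in practice, a real dataset iterator
model = Seq2SeqModel(data_iter, isTraining=True, params=params)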
def __init__(self, data_iter, isTraining=True, params=None):
    """Initializer of class that defines the computational graph.

    Args:
        data_iter: Dataset iterator that feeds the model.
        isTraining: Boolean that denotes training v/s evaluation.
        params: Bunch of model parameters; defaults to class_params().
    """
    if params is None:
        self.params = self.class_params()
    else:
        self.params = params
    params = self.params

    self.encoder = Encoder(isTraining=isTraining,
                           params=params.encoder_params)
    self.decoder = {}
    for task in params.tasks:
        self.decoder[task] = AttnDecoder(
            isTraining=isTraining, params=params.decoder_params[task],
            scope=task)

    self.data_iter = data_iter
    self.isTraining = isTraining

    self.learning_rate = tf.Variable(float(params.learning_rate),
                                     trainable=False)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * params.learning_rate_decay_factor)
    # Number of gradient updates performed
    self.global_step = tf.Variable(0, trainable=False)
    # Number of epochs done
    self.epoch = tf.Variable(0, trainable=False)
    self.epoch_incr = self.epoch.assign(self.epoch + 1)

    self.create_computational_graph()
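# Hedged illustration of the decay pattern above, assuming TensorFlow 1.x
# as in the surrounding code: the assign op rescales the variable in place,
# typically run when dev loss stops improving.
import tensorflow as tf

lr = tf.Variable(1e-3, trainable=False)
decay_op = lr.assign(lr * 0.5)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(decay_op)      # halve the learning rate
    print(sess.run(lr))     # -> 0.0005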
class Seq2SeqModel(object):
    """Implements the Encoder-Decoder model."""

    def __init__(self, buckets, isTraining, max_gradient_norm, batch_size,
                 learning_rate, learning_rate_decay_factor, encoder_attribs,
                 decoder_attribs):
        """Initializer of class that defines the computational graph.

        Args:
            buckets: List of input-output sizes that limit the amount of
                sequence padding (http://goo.gl/d8ybpl).
            isTraining: Boolean that denotes training v/s evaluation.
            max_gradient_norm: Maximum value of gradient norm.
            batch_size: Minibatch size used for doing SGD.
            learning_rate: Initial learning rate of optimizer.
            learning_rate_decay_factor: Multiplicative learning rate decay
                factor.
            {encoder, decoder}_attribs: Dictionary containing attributes for
                {encoder, decoder} RNN.
        """
        self.buckets = buckets
        self.isTraining = isTraining
        self.batch_size = batch_size

        self.learning_rate = tf.Variable(float(learning_rate),
                                         trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        # Number of gradient updates performed
        self.global_step = tf.Variable(0, trainable=False)
        # Number of epochs done
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_incr = self.epoch.assign(self.epoch + 1)

        # Placeholder for encoder input IDs - shape TxB
        self.encoder_inputs = tf.placeholder(tf.int32, shape=[None, None],
                                             name='encoder')
        _batch_size = self.encoder_inputs.get_shape()[1].value
        # Input sequence length placeholder
        self.seq_len = tf.placeholder(tf.int32, shape=[_batch_size],
                                      name="seq_len")
        # Output sequence length placeholder
        self.seq_len_target = tf.placeholder(tf.int32, shape=[_batch_size],
                                             name="seq_len_target")

        # Input to decoder RNN. This input has an initial extra symbol - GO -
        # that initiates the decoding process.
        self.decoder_inputs = tf.placeholder(tf.int32, shape=[None, None],
                                             name="decoder")
        # Targets are decoder inputs shifted by one, thus ignoring GO symbol
        self.targets = tf.slice(self.decoder_inputs, [1, 0], [-1, -1])

        # Initialize the encoder and decoder RNNs
        self.encoder = Encoder(isTraining, **encoder_attribs)
        if decoder_attribs['simp_decoder']:
            self.decoder = SimpleDecoder(isTraining, **decoder_attribs)
        else:
            self.decoder = AttnDecoder(isTraining, **decoder_attribs)

        # First encode input
        self.encoder_hidden_states, self.final_state = \
            self.encoder.encode_input(self.encoder_inputs, self.seq_len)
        # Then decode
        self.outputs = \
            self.decoder.decode(self.decoder_inputs, self.seq_len_target,
                                self.encoder_hidden_states, self.final_state,
                                self.seq_len)
        # Training outputs and losses
        self.losses = self.seq2seq_loss(self.outputs, self.targets,
                                        self.seq_len_target)

        if isTraining:
            # Gradients and parameter updates for training the model
            params = tf.trainable_variables()
            print("\nModel parameters:\n")
            for var in params:
                print("{0}: {1}".format(var.name, var.get_shape()))
            print("")
            # Initialize optimizer
            opt = tf.train.AdamOptimizer(self.learning_rate)
            # Get gradients from loss
            gradients = tf.gradients(self.losses, params)
            # Clip the gradients to avoid the problem of gradient explosion
            # possible early in training
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms = norm
            # Apply gradients
            self.updates = opt.apply_gradients(
                zip(clipped_gradients, params), global_step=self.global_step)

        # Model saver function
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=2)

    @staticmethod
    def seq2seq_loss(logits, targets, seq_len_target):
        """Calculate the cross entropy loss w.r.t. given target.
        Args:
            logits: A 2-d tensor of shape (TxB)x|V| containing the logit
                score per output symbol.
            targets: 2-d tensor of shape TxB that contains the ground truth
                output symbols.
            seq_len_target: Sequence length of output sequences. Required to
                mask padding symbols in output sequences.
        """
        with ops.name_scope("sequence_loss", [logits, targets]):
            flat_targets = tf.reshape(targets, [-1])
            cost = nn_ops.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=flat_targets)
            # Mask this cost since the output sequence is padded
            batch_major_mask = tf.sequence_mask(seq_len_target,
                                                dtype=tf.float32)
            time_major_mask = tf.transpose(batch_major_mask, [1, 0])
            weights = tf.reshape(time_major_mask, [-1])
            mask_cost = weights * cost

            loss = tf.reshape(mask_cost, tf.shape(targets))
            # Average the loss for each example by the # of timesteps
            cost_per_example = tf.reduce_sum(loss, axis=0) / \
                tf.cast(seq_len_target, tf.float32)
            # Return the average cost over all examples
            return tf.reduce_mean(cost_per_example)

    def step(self, sess, encoder_inputs, seq_len, decoder_inputs,
             seq_len_target):
        """Perform 1 minibatch update/evaluation.

        Args:
            sess: TensorFlow session where the computation graph is created.
            encoder_inputs: List of a minibatch of input IDs.
            seq_len: Input sequence length.
            decoder_inputs: List of a minibatch of output IDs.
            seq_len_target: Output sequence length.
        Returns:
            Output of a minibatch update. The exact output depends on
            whether the model is in training mode or evaluation mode.
        """
        # Pass inputs via feed dict method
        input_feed = {}
        input_feed[self.encoder_inputs.name] = encoder_inputs
        input_feed[self.decoder_inputs.name] = decoder_inputs
        input_feed[self.seq_len.name] = seq_len
        input_feed[self.seq_len_target.name] = seq_len_target

        if self.isTraining:
            # Important to have gradient updates as this operation is what
            # actually updates the parameters.
            output_feed = [self.updates, self.gradient_norms, self.losses]
        else:
            # Evaluation
            output_feed = [self.outputs]

        outputs = sess.run(output_feed, input_feed)
        if self.isTraining:
            return outputs[1], outputs[2]
        else:
            return outputs[0]

    def get_batch(self, data, bucket_id=None):
        """Prepare minibatch from given data.

        Args:
            data: A list of datapoints (all from the same bucket).
            bucket_id: Bucket ID of data. This is irrelevant for training,
                but for evaluation we can limit the padding by the bucket
                size.
        Returns:
            Batched input IDs, input sequence length, output IDs & output
            sequence length.
        """
        if not self.isTraining:
            # During evaluation the bucket size limits the amount of padding
            _, decoder_size = self.buckets[bucket_id]

        encoder_inputs, decoder_inputs = [], []
        batch_size = len(data)

        seq_len = np.zeros((batch_size), dtype=np.int64)
        seq_len_target = np.zeros((batch_size), dtype=np.int64)

        for i, sample in enumerate(data):
            encoder_input, decoder_input = sample
            seq_len[i] = len(encoder_input)
            if not self.isTraining:
                seq_len_target[i] = decoder_size
            else:
                # 1 is added to output sequence length because the EOS token
                # is crucial to "halt" the decoder. Consider it the
                # punctuation mark of an English sentence. Both are
                # necessary.
                seq_len_target[i] = len(decoder_input) + 1

        # Maximum input and output lengths, which limit the padding
        max_len_source = max(seq_len)
        max_len_target = max(seq_len_target)

        for i, sample in enumerate(data):
            encoder_input, decoder_input = sample
            # Encoder inputs are padded and then reversed.
            encoder_pad_size = max_len_source - len(encoder_input)
            encoder_pad = [data_utils.PAD_ID] * encoder_pad_size
            # Encoder input is reversed - https://arxiv.org/abs/1409.3215
            encoder_inputs.append(list(reversed(encoder_input)) +
                                  encoder_pad)

            # 1 is added to decoder_input because GO_ID is considered a part
            # of decoder input. While EOS_ID is also added, it's really used
            # by the target tensor (self.targets) defined above.
            decoder_pad_size = max_len_target - (len(decoder_input) + 1)
            decoder_inputs.append([data_utils.GO_ID] + decoder_input +
                                  [data_utils.EOS_ID] +
                                  [data_utils.PAD_ID] * decoder_pad_size)

        # Both the ID sequences are made time major via transpose
        encoder_inputs = np.asarray(encoder_inputs, dtype=np.int32).T
        decoder_inputs = np.asarray(decoder_inputs, dtype=np.int32).T

        return encoder_inputs, seq_len, decoder_inputs, seq_len_target
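# Hedged toy walk-through of get_batch's padding scheme. The special IDs
# below are assumed for illustration; the real values live in data_utils.
PAD_ID, GO_ID, EOS_ID = 0, 1, 2

encoder_input, max_len_source = [5, 6], 4
enc = list(reversed(encoder_input)) + \
    [PAD_ID] * (max_len_source - len(encoder_input))

decoder_input, max_len_target = [7], 3   # max target length includes +1 for EOS
dec = [GO_ID] + decoder_input + [EOS_ID] + \
    [PAD_ID] * (max_len_target - (len(decoder_input) + 1))

print(enc)  # [6, 5, 0, 0] - reversed, then padded
print(dec)  # [1, 7, 2, 0] - GO + symbols + EOS + padding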
def process_args(options):
    """Process arguments."""

    def get_train_dir(options):
        """Get train directory name given the options."""
        num_layer_string = ""
        for task in options['tasks']:
            if task == "char":
                continue
            num_layer_string += task + "_" + str(
                options['num_layers_' + task]) + "_"

        skip_string = ""
        if options['skip_step'] != 1:
            skip_string = "skip_" + str(options['skip_step']) + "_"

        train_dir = (skip_string + num_layer_string +
                     ('lstm_' if options['use_lstm'] else '') +
                     (('stack_' + str(options['stack_cons']) + "_")
                      if options['stack_cons'] > 1 else '') +
                     (('base_stride_' + str(options['initial_res_fac']) + "_")
                      if options['initial_res_fac'] > 1 else '') +
                     (('char_dec_dep_' + str(options['num_layers_dec']) + '_')
                      if options['num_layers_dec'] > 1 else '') +
                     ('lm_prob_' + str(options['lm_prob']) + '_') +
                     'run_id_' + str(options['run_id']) +
                     ('_avg_' if options['avg'] else ''))
        return train_dir

    def parse_tasks(task_string):
        tasks = ["char"]
        if "p" in task_string:
            tasks.append("phone")
        return tasks

    options['tasks'] = parse_tasks(options['tasks'])

    train_dir = get_train_dir(options)
    options['train_dir'] = os.path.join(options['train_base_dir'], train_dir)
    options['best_model_dir'] = os.path.join(
        options['train_base_dir'], "best_models", train_dir)

    for key_prefix in ['num_layers', 'max_output']:
        comb_dict = {}
        for task in options['tasks']:
            comb_dict[task] = options[key_prefix + "_" + task]
        options[key_prefix] = comb_dict

    options['vocab_size'] = {}
    for task in options['tasks']:
        target_vocab, _ = data_utils.initialize_vocabulary(
            os.path.join(options['vocab_dir'], task + ".vocab"))
        options['vocab_size'][task] = len(target_vocab)

    # Process training/eval params
    train_params = Train.get_updated_params(options)
    # Process beam search params
    beam_search_params = BeamSearch.get_updated_params(options)

    # Process model params
    encoder_params = Encoder.get_updated_params(options)
    decoder_params_base = AttnDecoder.get_updated_params(options)
    decoder_params = {}
    for task in options['tasks']:
        task_params = copy.deepcopy(decoder_params_base)
        task_params.vocab_size = options['vocab_size'][task]
        task_params.max_output = options['max_output'][task]
        if task != "char":
            # Only make the char model deep
            task_params.num_layers_dec = 1
        decoder_params[task] = task_params

    seq2seq_params = Seq2SeqModel.get_updated_params(options)
    seq2seq_params.encoder_params = encoder_params
    seq2seq_params.decoder_params = decoder_params

    lm_params = LMModel.get_updated_params(options)
    lm_enc_params = LMEncoder.get_updated_params(options)
    train_params.lm_params = lm_params
    train_params.lm_enc_params = lm_enc_params

    if not options['test'] and not options['dev']:
        if not os.path.exists(options['train_dir']):
            os.makedirs(options['train_dir'])
            os.makedirs(options['best_model_dir'])

        # Sort the options to create a parameter file
        parameter_file = 'parameters.txt'
        sorted_args = sorted(options.items(), key=operator.itemgetter(0))

        with open(os.path.join(options['train_dir'], parameter_file),
                  'w') as g:
            for arg, arg_val in sorted_args:
                sys.stdout.write(arg + "\t" + str(arg_val) + "\n")
                sys.stdout.flush()
                g.write(arg + "\t" + str(arg_val) + "\n")

    proc_options = Bunch()
    proc_options.train_params = train_params
    proc_options.beam_search_params = beam_search_params
    proc_options.seq2seq_params = seq2seq_params
    proc_options.dev = options['dev']
    proc_options.test = options['test']
    return proc_options
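# Hedged illustration of the per-task option regrouping performed above,
# with made-up flag values (key names follow the parser conventions):
options = {'tasks': ['char', 'phone'],
           'num_layers_char': 4, 'num_layers_phone': 2,
           'max_output_char': 120, 'max_output_phone': 60}
for key_prefix in ['num_layers', 'max_output']:
    options[key_prefix] = {t: options[key_prefix + "_" + t]
                           for t in options['tasks']}
print(options['num_layers'])  # {'char': 4, 'phone': 2}
print(options['max_output'])  # {'char': 120, 'phone': 60}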