def __init__(self):
        """Build data pipeline, seq2seq components, optimizers and loss.

        Creates the dataset/loader, pulls vocabulary mappings from the
        prepared data, instantiates the encoder/decoder (plus an attention
        decoder), moves modules to GPU when available, and sets up Adam
        optimizers with an NLL loss.
        """
        self.data = PrepareData()
        self.dataset = Seq2SeqDataset()
        self.data_loader = DataLoader(dataset=self.dataset,
                                      batch_size=1,
                                      shuffle=True)
        # BUG FIX: these previously read from an undefined global `data`;
        # they must come from the PrepareData instance created above.
        self.lang_1 = self.data.lang_1
        self.lang_2 = self.data.lang_2
        self.char2index = self.data.char2index
        self.index2char = self.data.index2char

        # Model / training hyper-parameters.
        self.input_size = 100
        self.hidden_size = 64
        self.output_size = 100
        self.learning_rate = 0.01
        self.num_epoch = 500
        self.teacher_forcing = True
        self.use_cuda = torch.cuda.is_available()
        self.device = 'cuda:0' if self.use_cuda else 'cpu'

        self.encoder = EncoderRNN(input_size=self.input_size, hidden_size=self.hidden_size)
        self.decoder = DecoderRNN(output_size=self.output_size, hidden_size=self.hidden_size)
        self.attn_decoder = AttnDecoder(self.hidden_size, self.output_size)

        # BUG FIX: `use_cuda` was an undefined bare name; use the attribute.
        # NOTE(review): attn_decoder is never moved to the device here —
        # confirm whether that is intentional.
        if self.use_cuda:
            self.encoder = self.encoder.to(self.device)
            self.decoder = self.decoder.to(self.device)

        self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(), lr=self.learning_rate)
        self.decoder_optimizer = torch.optim.Adam(self.decoder.parameters(), lr=self.learning_rate)

        # NLLLoss expects log-probabilities (e.g. a LogSoftmax decoder output).
        self.loss_function = nn.NLLLoss()
Example #2
0
def parse_options():
    """Assemble the CLI parser from all component option groups and parse.

    Returns:
        The processed options produced by process_args().
    """
    parser = argparse.ArgumentParser()

    # Every component contributes its own flags to the shared parser.
    for component in (Train, Encoder, AttnDecoder, Seq2SeqModel,
                      LMModel, BeamSearch):
        component.add_parse_options(parser)

    parser.add_argument("-dev",
                        default=False,
                        action="store_true",
                        help="Get dev set results using the last saved model")
    parser.add_argument("-test",
                        default=False,
                        action="store_true",
                        help="Get test results using the last saved model")

    parsed = vars(parser.parse_args())
    return process_args(parsed)
Example #3
0
    def class_params(cls):
        """Return the default hyper-parameters for this model as a Bunch."""
        defaults = (
            # Task specification
            ('tasks', ['char']),
            ('num_layers', {'char': 4}),
            ('max_output', {'char': 120}),
            # Optimization params
            ('learning_rate', 1e-3),
            ('learning_rate_decay_factor', 0.5),
            ('max_gradient_norm', 5.0),
            # Loss params
            ('avg', True),
            # Sub-module defaults
            ('encoder_params', Encoder.class_params()),
            ('decoder_params', {'char': AttnDecoder.class_params()}),
        )
        params = Bunch()
        for key, value in defaults:
            params[key] = value
        return params
Example #4
0
    def __init__(self, data_iter, isTraining=True, params=None):
        """Initializer of class that defines the computational graph.

        Builds the encoder, one attention decoder per task, the
        learning-rate and epoch bookkeeping variables, then constructs
        the full computational graph.

        Args:
            data_iter: Iterator supplying the model's input batches.
            isTraining: Boolean that denotes training v/s evaluation.
            params: Optional hyper-parameter container; defaults to
                class_params() when None.
        """
        if params is None:
            self.params = self.class_params()
        else:
            self.params = params
        # Local alias used for the remainder of the constructor.
        params = self.params

        self.encoder = Encoder(isTraining=isTraining,
                               params=params.encoder_params)
        # One attention decoder per task, each in its own variable scope.
        self.decoder = {}
        for task in params.tasks:
            self.decoder[task] = AttnDecoder(
                isTraining=isTraining,
                params=params.decoder_params[task],
                scope=task)
        self.data_iter = data_iter

        self.isTraining = isTraining

        # Learning rate is a TF variable so it can be decayed in-graph.
        self.learning_rate = tf.Variable(float(params.learning_rate),
                                         trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * params.learning_rate_decay_factor)

        # Number of gradient updates performed
        self.global_step = tf.Variable(0, trainable=False)
        # Number of epochs done
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_incr = self.epoch.assign(self.epoch + 1)

        self.create_computational_graph()
Example #5
0
    def __init__(self, buckets, isTraining, max_gradient_norm, batch_size,
                 learning_rate, learning_rate_decay_factor, encoder_attribs,
                 decoder_attribs):
        """Initializer of class that defines the computational graph.

        Args:
            buckets: List of input-output sizes that limit the amount of
                sequence padding (http://goo.gl/d8ybpl).
            isTraining: boolean that denotes training v/s evaluation.
            max_gradient_norm: Maximum value of gradient norm.
            batch_size: Minibatch size used for doing SGD.
            learning_rate: Initial learning rate of optimizer
            learning_rate_decay_factor: Multiplicative learning rate decay
                factor
            {encoder, decoder}_attribs: Dictionary containing attributes for
                {encoder, decoder} RNN.
        """
        self.buckets = buckets
        self.isTraining = isTraining
        self.batch_size = batch_size

        # Learning rate is a TF variable so it can be decayed in-graph.
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)

        # Number of gradient updates performed
        self.global_step = tf.Variable(0, trainable=False)
        # Number of epochs done
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_incr = self.epoch.assign(self.epoch + 1)

        # Placeholder for encoder input IDs - Shape TxB
        self.encoder_inputs = tf.placeholder(tf.int32,
                                             shape=[None, None],
                                             name='encoder')
        _batch_size = self.encoder_inputs.get_shape()[1].value
        # Input sequence length placeholder
        self.seq_len = tf.placeholder(tf.int32,
                                      shape=[_batch_size],
                                      name="seq_len")
        # Output sequence length placeholder
        self.seq_len_target = tf.placeholder(tf.int32,
                                             shape=[_batch_size],
                                             name="seq_len_target")

        # Input to decoder RNN. This input has an initial extra symbol - GO -
        # that initiates the decoding process.
        self.decoder_inputs = tf.placeholder(tf.int32,
                                             shape=[None, None],
                                             name="decoder")
        # Targets are decoder inputs shifted by one thus, ignoring GO symbol
        self.targets = tf.slice(self.decoder_inputs, [1, 0], [-1, -1])

        # Initialize the encoder and decoder RNNs
        self.encoder = Encoder(isTraining, **encoder_attribs)
        if decoder_attribs['simp_decoder']:
            self.decoder = SimpleDecoder(isTraining, **decoder_attribs)
        else:
            self.decoder = AttnDecoder(isTraining, **decoder_attribs)
        # First encode input
        self.encoder_hidden_states, self.final_state = \
            self.encoder.encode_input(self.encoder_inputs, self.seq_len)
        # Then decode
        self.outputs = \
            self.decoder.decode(self.decoder_inputs, self.seq_len_target,
                                self.encoder_hidden_states, self.final_state,
                                self.seq_len)
        # Training outputs and losses.
        self.losses = self.seq2seq_loss(self.outputs, self.targets,
                                        self.seq_len_target)

        if isTraining:
            # Gradients and parameter updation for training the model.
            params = tf.trainable_variables()
            print("\nModel parameters:\n")
            for var in params:
                print(("{0}: {1}").format(var.name, var.get_shape()))
            # BUG FIX: bare `print` is a no-op expression in Python 3;
            # call it to emit the intended blank line.
            print()
            # Initialize optimizer
            opt = tf.train.AdamOptimizer(self.learning_rate)
            # Get gradients from loss
            gradients = tf.gradients(self.losses, params)
            # Clip the gradients to avoid the problem of gradient explosion
            # possible early in training
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms = norm
            # Apply gradients
            self.updates = opt.apply_gradients(zip(clipped_gradients, params),
                                               global_step=self.global_step)

        # Model saver function
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=2)
Example #6
0
class Seq2SeqModel(object):
    """Implements the Encoder-Decoder model."""
    def __init__(self, buckets, isTraining, max_gradient_norm, batch_size,
                 learning_rate, learning_rate_decay_factor, encoder_attribs,
                 decoder_attribs):
        """Initializer of class that defines the computational graph.

        Args:
            buckets: List of input-output sizes that limit the amount of
                sequence padding (http://goo.gl/d8ybpl).
            isTraining: boolean that denotes training v/s evaluation.
            max_gradient_norm: Maximum value of gradient norm.
            batch_size: Minibatch size used for doing SGD.
            learning_rate: Initial learning rate of optimizer
            learning_rate_decay_factor: Multiplicative learning rate decay
                factor
            {encoder, decoder}_attribs: Dictionary containing attributes for
                {encoder, decoder} RNN.
        """
        self.buckets = buckets
        self.isTraining = isTraining
        self.batch_size = batch_size

        # Learning rate is a TF variable so it can be decayed in-graph.
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)

        # Number of gradient updates performed
        self.global_step = tf.Variable(0, trainable=False)
        # Number of epochs done
        self.epoch = tf.Variable(0, trainable=False)
        self.epoch_incr = self.epoch.assign(self.epoch + 1)

        # Placeholder for encoder input IDs - Shape TxB
        self.encoder_inputs = tf.placeholder(tf.int32,
                                             shape=[None, None],
                                             name='encoder')
        _batch_size = self.encoder_inputs.get_shape()[1].value
        # Input sequence length placeholder
        self.seq_len = tf.placeholder(tf.int32,
                                      shape=[_batch_size],
                                      name="seq_len")
        # Output sequence length placeholder
        self.seq_len_target = tf.placeholder(tf.int32,
                                             shape=[_batch_size],
                                             name="seq_len_target")

        # Input to decoder RNN. This input has an initial extra symbol - GO -
        # that initiates the decoding process.
        self.decoder_inputs = tf.placeholder(tf.int32,
                                             shape=[None, None],
                                             name="decoder")
        # Targets are decoder inputs shifted by one thus, ignoring GO symbol
        self.targets = tf.slice(self.decoder_inputs, [1, 0], [-1, -1])

        # Initialize the encoder and decoder RNNs
        self.encoder = Encoder(isTraining, **encoder_attribs)
        if decoder_attribs['simp_decoder']:
            self.decoder = SimpleDecoder(isTraining, **decoder_attribs)
        else:
            self.decoder = AttnDecoder(isTraining, **decoder_attribs)
        # First encode input
        self.encoder_hidden_states, self.final_state = \
            self.encoder.encode_input(self.encoder_inputs, self.seq_len)
        # Then decode
        self.outputs = \
            self.decoder.decode(self.decoder_inputs, self.seq_len_target,
                                self.encoder_hidden_states, self.final_state,
                                self.seq_len)
        # Training outputs and losses.
        self.losses = self.seq2seq_loss(self.outputs, self.targets,
                                        self.seq_len_target)

        if isTraining:
            # Gradients and parameter updation for training the model.
            params = tf.trainable_variables()
            print("\nModel parameters:\n")
            for var in params:
                print(("{0}: {1}").format(var.name, var.get_shape()))
            # BUG FIX: bare `print` is a no-op expression in Python 3;
            # call it to emit the intended blank line.
            print()
            # Initialize optimizer
            opt = tf.train.AdamOptimizer(self.learning_rate)
            # Get gradients from loss
            gradients = tf.gradients(self.losses, params)
            # Clip the gradients to avoid the problem of gradient explosion
            # possible early in training
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms = norm
            # Apply gradients
            self.updates = opt.apply_gradients(zip(clipped_gradients, params),
                                               global_step=self.global_step)

        # Model saver function
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=2)

    @staticmethod
    def seq2seq_loss(logits, targets, seq_len_target):
        """Calculate the cross entropy loss w.r.t. given target.

        Args:
            logits: A 2-d tensor of shape (TxB)x|V| containing the logit score
                per output symbol.
            targets: 2-d tensor of shape TxB that contains the ground truth
                output symbols.
            seq_len_target: Sequence length of output sequences. Required to
                mask padding symbols in output sequences.
        """
        with ops.name_scope("sequence_loss", [logits, targets]):
            flat_targets = tf.reshape(targets, [-1])
            cost = nn_ops.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=flat_targets)

            # Mask this cost since the output sequence is padded
            batch_major_mask = tf.sequence_mask(seq_len_target,
                                                dtype=tf.float32)
            time_major_mask = tf.transpose(batch_major_mask, [1, 0])
            weights = tf.reshape(time_major_mask, [-1])
            mask_cost = weights * cost

            loss = tf.reshape(mask_cost, tf.shape(targets))
            # Average the loss for each example by the # of timesteps
            cost_per_example = tf.reduce_sum(loss, reduction_indices=0) /\
                tf.cast(seq_len_target, tf.float32)
            # Return the average cost over all examples
            return tf.reduce_mean(cost_per_example)

    def step(self, sess, encoder_inputs, seq_len, decoder_inputs,
             seq_len_target):
        """Perform 1 minibatch update/evaluation.

        Args:
            sess: Tensorflow session where computation graph is created
            encoder_inputs: List of a minibatch of input IDs
            seq_len: Input sequence length
            decoder_inputs: List of a minibatch of output IDs
            seq_len_target: Output sequence length
        Returns:
            Output of a minibatch updated. The exact output depends on
            whether the model is in training mode or evaluation mode.
        """
        # Pass inputs via feed dict method
        input_feed = {}
        input_feed[self.encoder_inputs.name] = encoder_inputs
        input_feed[self.decoder_inputs.name] = decoder_inputs
        input_feed[self.seq_len.name] = seq_len
        input_feed[self.seq_len_target.name] = seq_len_target

        if self.isTraining:
            # Important to have gradient updates as this operation is what
            # actually updates the parameters.
            output_feed = [self.updates, self.gradient_norms, self.losses]
        else:
            # Evaluation
            output_feed = [self.outputs]

        outputs = sess.run(output_feed, input_feed)
        if self.isTraining:
            # Return (gradient norm, loss); the update op result is dropped.
            return outputs[1], outputs[2]
        else:
            return outputs[0]

    def get_batch(self, data, bucket_id=None):
        """Prepare minibatch from given data.

        Args:
            data: A list of datapoints (all from same bucket).
            bucket_id: Bucket ID of data. This is irrevelant for training but
                for evaluation we can limit the padding by the bucket size.
        Returns:
            Batched input IDs, input sequence length, output IDs & output
            sequence length
        """
        if not self.isTraining:
            # During evaluation the bucket size limits the amount of padding
            _, decoder_size = self.buckets[bucket_id]

        encoder_inputs, decoder_inputs = [], []
        batch_size = len(data)

        seq_len = np.zeros((batch_size), dtype=np.int64)
        seq_len_target = np.zeros((batch_size), dtype=np.int64)

        for i, sample in enumerate(data):
            encoder_input, decoder_input = sample
            seq_len[i] = len(encoder_input)
            if not self.isTraining:
                seq_len_target[i] = decoder_size
            else:
                # 1 is added to output sequence length because the EOS token is
                # crucial to "halt" the decoder. Consider it the punctuation
                # mark of a English sentence. Both are necessary.
                seq_len_target[i] = len(decoder_input) + 1

        # Maximum input and output length which limit the padding till them
        max_len_source = max(seq_len)
        max_len_target = max(seq_len_target)

        for i, sample in enumerate(data):
            encoder_input, decoder_input = sample
            # Encoder inputs are padded and then reversed.
            encoder_pad_size = max_len_source - len(encoder_input)
            encoder_pad = [data_utils.PAD_ID] * encoder_pad_size
            # Encoder input is reversed - https://arxiv.org/abs/1409.3215
            encoder_inputs.append(list(reversed(encoder_input)) + encoder_pad)

            # 1 is added to decoder_input because GO_ID is considered a part of
            # decoder input. While EOS_ID is also added, it's really used by
            # the target tensor (self.targets) in the core code above.
            decoder_pad_size = max_len_target - (len(decoder_input) + 1)
            decoder_inputs.append([data_utils.GO_ID] + decoder_input +
                                  [data_utils.EOS_ID] +
                                  [data_utils.PAD_ID] * decoder_pad_size)

        # Both the id sequences are made time major via transpose
        encoder_inputs = np.asarray(encoder_inputs, dtype=np.int32).T
        decoder_inputs = np.asarray(decoder_inputs, dtype=np.int32).T

        return encoder_inputs, seq_len, decoder_inputs, seq_len_target
Example #7
0
def process_args(options):
    """Process arguments.

    Expands the raw option dict into grouped parameter Bunches
    (train / beam-search / seq2seq / LM), resolves the training and
    best-model directories, loads per-task vocabulary sizes, and — for
    a fresh training run — writes a sorted parameter dump file.

    Args:
        options: dict of parsed command-line options.
    Returns:
        Bunch carrying train_params, beam_search_params, seq2seq_params
        and the dev/test flags.
    """
    def get_train_dir(options):
        """Get train directory name given the options."""
        num_layer_string = ""
        for task in options['tasks']:
            if task == "char":
                continue
            num_layer_string += task + "_" + str(
                options['num_layers_' + task]) + "_"

        skip_string = ""
        if options['skip_step'] != 1:
            skip_string = "skip_" + str(options['skip_step']) + "_"

        train_dir = (skip_string + num_layer_string +
                     ('lstm_' if options['use_lstm'] else '') +
                     (('stack_' + str(options['stack_cons']) +
                       "_") if options['stack_cons'] > 1 else '') +
                     (('base_stride_' + str(options['initial_res_fac']) +
                       "_") if options['initial_res_fac'] > 1 else '') +
                     (('char_dec_dep_' + str(options['num_layers_dec']) +
                       '_') if options['num_layers_dec'] > 1 else '') +
                     ('lm_prob_' + str(options['lm_prob']) + '_') + 'run_id_' +
                     str(options['run_id']) +
                     ('_avg_' if options['avg'] else ''))
        return train_dir

    def parse_tasks(task_string):
        """Map the task string to a task list; 'p' adds the phone task."""
        tasks = ["char"]
        if "p" in task_string:
            tasks.append("phone")
        return tasks

    options['tasks'] = parse_tasks(options['tasks'])

    train_dir = get_train_dir(options)
    options['train_dir'] = os.path.join(options['train_base_dir'], train_dir)
    # os.path.join accepts multiple components; no need to nest calls.
    options['best_model_dir'] = os.path.join(
        options['train_base_dir'], "best_models", train_dir)

    # Collapse the per-task flat options into per-key dicts keyed by task.
    for key_prefix in ['num_layers', 'max_output']:
        comb_dict = {}
        for task in options['tasks']:
            comb_dict[task] = options[key_prefix + "_" + task]
        options[key_prefix] = comb_dict

    options['vocab_size'] = {}
    for task in options['tasks']:
        target_vocab, _ = data_utils.initialize_vocabulary(
            os.path.join(options['vocab_dir'], task + ".vocab"))

        options['vocab_size'][task] = len(target_vocab)

    # Process training/eval params
    train_params = Train.get_updated_params(options)
    # Process beam search params
    beam_search_params = BeamSearch.get_updated_params(options)
    # Process model params
    encoder_params = Encoder.get_updated_params(options)
    decoder_params_base = AttnDecoder.get_updated_params(options)
    decoder_params = {}
    for task in options['tasks']:
        task_params = copy.deepcopy(decoder_params_base)
        task_params.vocab_size = options['vocab_size'][task]
        task_params.max_output = options['max_output'][task]
        # BUG FIX: `is not` compared string identity, which is
        # implementation-dependent (and a SyntaxWarning in Python >= 3.8);
        # use inequality to compare values.
        if task != "char":
            # Only make the char model deep
            task_params.num_layers_dec = 1

        decoder_params[task] = task_params

    seq2seq_params = Seq2SeqModel.get_updated_params(options)
    seq2seq_params.encoder_params = encoder_params
    seq2seq_params.decoder_params = decoder_params

    lm_params = LMModel.get_updated_params(options)
    lm_enc_params = LMEncoder.get_updated_params(options)
    train_params.lm_params = lm_params
    train_params.lm_enc_params = lm_enc_params

    if not options['test'] and not options['dev']:
        if not os.path.exists(options['train_dir']):
            os.makedirs(options['train_dir'])
            os.makedirs(options['best_model_dir'])

        # Sort the options to create a parameter file
        parameter_file = 'parameters.txt'
        sorted_args = sorted(options.items(), key=operator.itemgetter(0))

        # Echo each option to stdout while also writing the parameter file.
        with open(os.path.join(options['train_dir'], parameter_file),
                  'w') as g:
            for arg, arg_val in sorted_args:
                sys.stdout.write(arg + "\t" + str(arg_val) + "\n")
                sys.stdout.flush()
                g.write(arg + "\t" + str(arg_val) + "\n")

    proc_options = Bunch()
    proc_options.train_params = train_params
    proc_options.beam_search_params = beam_search_params
    proc_options.seq2seq_params = seq2seq_params
    proc_options.dev = options['dev']
    proc_options.test = options['test']

    return proc_options