def __init__(self, source_vocab_size, target_vocab_size, buckets,
             text_hidden_size, speech_hidden_size, parse_hidden_size,
             text_num_layers, speech_num_layers, parse_num_layers,
             filter_sizes, num_filters, feat_dim, fixed_word_length,
             embedding_size, max_gradient_norm, batch_size, attn_vec_size,
             spscale, learning_rate, learning_rate_decay_factor, optimizer,
             use_lstm=True, output_keep_prob=0.8, num_samples=512,
             forward_only=False):
  """Create the multi-encoder (text + speech) attention seq2seq model."""
  self.source_vocab_size = source_vocab_size
  self.target_vocab_size = target_vocab_size
  self.buckets = buckets
  self.batch_size = batch_size
  self.spscale = spscale
  self.epoch = 0
  self.feat_dim = feat_dim
  self.fixed_word_length = fixed_word_length
  self.filter_sizes = filter_sizes
  self.num_filters = num_filters
  self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
  self.learning_rate_decay_op = self.learning_rate.assign(
      self.learning_rate * learning_rate_decay_factor)
  self.global_step = tf.Variable(0, trainable=False)

  # If we use sampled softmax, we need an output projection.
  output_projection = None
  softmax_loss_function = None
  # Sampled softmax only makes sense if we sample less than vocabulary size.
  if num_samples > 0 and num_samples < self.target_vocab_size:
    # The projection maps decoder (parse) outputs to the target vocabulary,
    # so its input dimension is the parse cell's hidden size.
    w = tf.get_variable("proj_w", [parse_hidden_size, self.target_vocab_size])
    w_t = tf.transpose(w)
    b = tf.get_variable("proj_b", [self.target_vocab_size])
    output_projection = (w, b)

    def sampled_loss(inputs, labels):
      labels = tf.reshape(labels, [-1, 1])
      return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, num_samples,
                                        self.target_vocab_size)
    softmax_loss_function = sampled_loss

  # Create the internal multi-layer cell for our RNN.
  def create_cell(hidden_size, num_layers):
    single_cell = rnn_cell.GRUCell(hidden_size)
    if use_lstm:
      print("Using LSTM")
      single_cell = rnn_cell.BasicLSTMCell(hidden_size, state_is_tuple=True)
    if not forward_only:
      # Dropout is always wrapped at training time; pass output_keep_prob=1.0
      # to disable it.
      print("Training mode; dropout used!")
      single_cell = rnn_cell.DropoutWrapper(
          single_cell, output_keep_prob=output_keep_prob)
    cell = single_cell
    if num_layers > 1:
      cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers,
                                         state_is_tuple=True)
    return cell

  text_cell = create_cell(text_hidden_size, text_num_layers)
  speech_cell = create_cell(speech_hidden_size, speech_num_layers)
  parse_cell = create_cell(parse_hidden_size, parse_num_layers)

  # The seq2seq function: we use embedding for the input and attention.
  def seq2seq_f(encoder_inputs_list, decoder_inputs, text_len, speech_len,
                do_decode, attn_vec_size):
    return many2one_seq2seq.many2one_attention_seq2seq(
        encoder_inputs_list, decoder_inputs, text_len, speech_len, feat_dim,
        text_cell, speech_cell, parse_cell,
        num_encoder_symbols=source_vocab_size,
        num_decoder_symbols=target_vocab_size,
        embedding_size=embedding_size,
        attention_vec_size=attn_vec_size,
        fixed_word_length=fixed_word_length,
        filter_sizes=filter_sizes,
        num_filters=num_filters,
        output_projection=output_projection,
        feed_previous=do_decode)

  # Feeds for inputs.
  self.text_encoder_inputs = []
  self.speech_encoder_inputs = []
  self.speech_partitions = []
  self.decoder_inputs = []
  self.target_weights = []
  for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
    self.text_encoder_inputs.append(
        tf.placeholder(tf.int32, shape=[None],
                       name="text_encoder{0}".format(i)))
  for i in xrange(buckets[-1][0] * self.spscale):
    self.speech_encoder_inputs.append(
        tf.placeholder(tf.float32, shape=[None, fixed_word_length, feat_dim],
                       name="speech_encoder{0}".format(i)))
  for i in xrange(buckets[-1][1] + 1):
    self.decoder_inputs.append(
        tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))
    self.target_weights.append(
        tf.placeholder(tf.float32, shape=[None], name="weight{0}".format(i)))
  self.encoder_inputs_list = [self.text_encoder_inputs,
                              self.speech_encoder_inputs]

  # Sequence-length tensors: the constant 2 is only a placeholder value; the
  # actual per-example lengths are supplied when the graph is run.
  _batch_size = tf.shape(self.text_encoder_inputs[0])[0]
  self.text_seq_len = tf.fill(tf.expand_dims(_batch_size, 0),
                              tf.constant(2, dtype=tf.int64))
  self.speech_seq_len = tf.fill(tf.expand_dims(_batch_size, 0),
                                tf.constant(2, dtype=tf.int64))

  # Our targets are decoder inputs shifted by one.
  targets = [self.decoder_inputs[i + 1]
             for i in xrange(len(self.decoder_inputs) - 1)]

  # Training outputs and losses.
  if forward_only:
    self.outputs, self.losses = many2one_seq2seq.many2one_model_with_buckets(
        self.encoder_inputs_list, self.decoder_inputs, targets,
        self.target_weights, self.text_seq_len, self.speech_seq_len, buckets,
        lambda x, y, z, w: seq2seq_f(x, y, z, w, True, attn_vec_size),
        softmax_loss_function=softmax_loss_function, spscale=self.spscale)
    # If we use output projection, we need to project outputs for decoding.
    if output_projection is not None:
      for b in xrange(len(buckets)):
        self.outputs[b] = [
            tf.matmul(output, output_projection[0]) + output_projection[1]
            for output in self.outputs[b]
        ]
  else:
    self.outputs, self.losses = many2one_seq2seq.many2one_model_with_buckets(
        self.encoder_inputs_list, self.decoder_inputs, targets,
        self.target_weights, self.text_seq_len, self.speech_seq_len, buckets,
        lambda x, y, z, w: seq2seq_f(x, y, z, w, False, attn_vec_size),
        softmax_loss_function=softmax_loss_function, spscale=self.spscale)

  # Gradients and parameter update operation for training the model.
  params = tf.trainable_variables()
  if not forward_only:
    self.gradient_norms = []
    self.updates = []
    # The optimizer is a hyperparameter; Adam is the fallback default.
    if optimizer == "momentum":
      opt = tf.train.MomentumOptimizer(self.learning_rate, 0.9)
    elif optimizer == "grad_descent":
      opt = tf.train.GradientDescentOptimizer(self.learning_rate)
    elif optimizer == "adagrad":
      print("Using adagrad optimizer")
      opt = tf.train.AdagradOptimizer(self.learning_rate)
    else:
      print("Using Adam optimizer")
      opt = tf.train.AdamOptimizer(self.learning_rate)

    for b in xrange(len(buckets)):
      # Accumulate gradients as they are computed to reduce peak memory use.
      gradients = tf.gradients(
          self.losses[b], params,
          aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)
      clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                       max_gradient_norm)
      self.gradient_norms.append(norm)
      self.updates.append(opt.apply_gradients(
          zip(clipped_gradients, params), global_step=self.global_step))

  self.saver = tf.train.Saver(tf.global_variables())
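
# A minimal construction sketch (hedged, not from the original file): it
# illustrates how this constructor might be invoked, assuming the enclosing
# class is exported as `ManyToOneModel` (hypothetical name) and that buckets
# are (max_text_len, max_parse_len) pairs as used above. All values are
# illustrative only.
#
#   model = ManyToOneModel(
#       source_vocab_size=45000, target_vocab_size=128,
#       buckets=[(10, 40), (25, 85)],
#       text_hidden_size=256, speech_hidden_size=128, parse_hidden_size=256,
#       text_num_layers=2, speech_num_layers=1, parse_num_layers=2,
#       filter_sizes=[10, 25, 50], num_filters=32,
#       feat_dim=13, fixed_word_length=50, embedding_size=300,
#       max_gradient_norm=5.0, batch_size=32, attn_vec_size=64,
#       spscale=4, learning_rate=0.1, learning_rate_decay_factor=0.99,
#       optimizer="adam", use_lstm=True, output_keep_prob=0.8,
#       num_samples=512, forward_only=False)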
def __init__(self, source_vocab_size, target_vocab_size, buckets, hidden_size,
             num_layers, embedding_size, max_gradient_norm, batch_size,
             learning_rate, learning_rate_decay_factor, spscale, mfcc_num,
             attn_vec_size, use_lstm=False, output_keep_prob=0.8,
             num_samples=512, forward_only=False, dropout=True):
  """Create the model.

  Args:
    source_vocab_size: size of the source vocabulary.
    target_vocab_size: size of the target vocabulary.
    buckets: a list of pairs (I, O), where I specifies the maximum input
      length that will be processed in that bucket, and O specifies the
      maximum output length. Training instances with inputs longer than I or
      outputs longer than O will be pushed to the next bucket and padded
      accordingly. We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
    hidden_size: number of units in each layer of the model.
    num_layers: number of layers in the model.
    embedding_size: size of the input embedding vectors.
    max_gradient_norm: gradients will be clipped to maximally this norm.
    batch_size: the size of the batches used during training; the model
      construction is independent of batch_size, so it can be changed after
      initialization if this is convenient, e.g., for decoding.
    learning_rate: learning rate to start with.
    learning_rate_decay_factor: decay learning rate by this much when needed.
    spscale: number of speech frames per input token.
    mfcc_num: dimensionality of each acoustic (MFCC) feature frame.
    attn_vec_size: size of the attention vectors.
    use_lstm: if true, we use LSTM cells instead of GRU cells.
    output_keep_prob: keep probability for the dropout wrapper.
    num_samples: number of samples for sampled softmax.
    forward_only: if set, we do not construct the backward pass in the model.
    dropout: if true, wrap cells with dropout during training.
  """
  self.source_vocab_size = source_vocab_size
  self.target_vocab_size = target_vocab_size
  self.buckets = buckets
  self.batch_size = batch_size
  self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
  self.learning_rate_decay_op = self.learning_rate.assign(
      self.learning_rate * learning_rate_decay_factor)
  self.global_step = tf.Variable(0, trainable=False)

  # If we use sampled softmax, we need an output projection.
  output_projection = None
  softmax_loss_function = None
  # Sampled softmax only makes sense if we sample less than vocabulary size.
  if num_samples > 0 and num_samples < self.target_vocab_size:
    w = tf.get_variable("proj_w", [hidden_size, self.target_vocab_size])
    w_t = tf.transpose(w)
    b = tf.get_variable("proj_b", [self.target_vocab_size])
    output_projection = (w, b)

    def sampled_loss(inputs, labels):
      labels = tf.reshape(labels, [-1, 1])
      return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, num_samples,
                                        self.target_vocab_size)
    softmax_loss_function = sampled_loss

  # Create the internal multi-layer cell for our RNN.
  single_cell = tf.nn.rnn_cell.GRUCell(hidden_size)
  if use_lstm:
    single_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
  if dropout and not forward_only:
    print("Training mode; dropout used!")
    single_cell = tf.nn.rnn_cell.DropoutWrapper(
        single_cell, output_keep_prob=output_keep_prob)
  cell = single_cell
  if num_layers > 1:
    cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers)

  # The seq2seq function: we use embedding for the input and attention.
  def seq2seq_f(encoder_inputs_list, decoder_inputs, text_len, do_decode,
                attn_vec_size):
    return many2one_seq2seq.many2one_attention_seq2seq(
        encoder_inputs_list, decoder_inputs, text_len, cell,
        num_encoder_symbols=source_vocab_size,
        num_decoder_symbols=target_vocab_size,
        embedding_size=embedding_size,
        output_projection=output_projection,
        feed_previous=do_decode,
        attention_vec_size=attn_vec_size)

  # Feeds for inputs.
  self.text_encoder_inputs = []
  self.speech_encoder_inputs = []
  self.decoder_inputs = []
  self.target_weights = []
  for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
    self.text_encoder_inputs.append(
        tf.placeholder(tf.int32, shape=[None],
                       name="text_encoder{0}".format(i)))
  for i in xrange(buckets[-1][0] * spscale):
    self.speech_encoder_inputs.append(
        tf.placeholder(tf.float32, shape=[None, mfcc_num],
                       name="speech_encoder{0}".format(i)))
  for i in xrange(buckets[-1][1] + 1):
    self.decoder_inputs.append(
        tf.placeholder(tf.int32, shape=[None], name="decoder{0}".format(i)))
    self.target_weights.append(
        tf.placeholder(tf.float32, shape=[None], name="weight{0}".format(i)))
  self.encoder_inputs_list = [self.text_encoder_inputs,
                              self.speech_encoder_inputs]

  # Sequence lengths: the constant 2 is only a placeholder value; the actual
  # per-example lengths are supplied when the graph is run.
  _batch_size = tf.shape(self.text_encoder_inputs[0])[0]
  self.seq_len = tf.fill(tf.expand_dims(_batch_size, 0),
                         tf.constant(2, dtype=tf.int64))

  # Our targets are decoder inputs shifted by one.
  targets = [self.decoder_inputs[i + 1]
             for i in xrange(len(self.decoder_inputs) - 1)]

  # Training outputs and losses.
  if forward_only:
    self.outputs, self.losses = many2one_seq2seq.many2one_model_with_buckets(
        self.encoder_inputs_list, self.decoder_inputs, targets,
        self.target_weights, self.seq_len, buckets,
        lambda x, y, z: seq2seq_f(x, y, z, True, attn_vec_size),
        softmax_loss_function=softmax_loss_function, spscale=spscale)
    # If we use output projection, we need to project outputs for decoding.
    if output_projection is not None:
      for b in xrange(len(buckets)):
        self.outputs[b] = [
            tf.matmul(output, output_projection[0]) + output_projection[1]
            for output in self.outputs[b]
        ]
  else:
    self.outputs, self.losses = many2one_seq2seq.many2one_model_with_buckets(
        self.encoder_inputs_list, self.decoder_inputs, targets,
        self.target_weights, self.seq_len, buckets,
        lambda x, y, z: seq2seq_f(x, y, z, False, attn_vec_size),
        softmax_loss_function=softmax_loss_function, spscale=spscale)

  # Gradients and parameter update operation for training the model.
  params = tf.trainable_variables()
  if not forward_only:
    self.gradient_norms = []
    self.updates = []
    opt = tf.train.AdagradOptimizer(self.learning_rate)
    for b in xrange(len(buckets)):
      gradients = tf.gradients(self.losses[b], params)
      clipped_gradients, norm = tf.clip_by_global_norm(gradients,
                                                       max_gradient_norm)
      self.gradient_norms.append(norm)
      self.updates.append(opt.apply_gradients(
          zip(clipped_gradients, params), global_step=self.global_step))

  self.saver = tf.train.Saver(tf.all_variables())
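
# A minimal construction sketch (hedged, not from the original file),
# assuming the enclosing class is exported as `Seq2SeqParseModel`
# (hypothetical name) and the constructor signature shown above, including
# spscale, mfcc_num and attn_vec_size, which the body requires. All values
# are illustrative only.
#
#   model = Seq2SeqParseModel(
#       source_vocab_size=45000, target_vocab_size=128,
#       buckets=[(10, 40), (25, 85)],
#       hidden_size=256, num_layers=2, embedding_size=300,
#       max_gradient_norm=5.0, batch_size=32,
#       learning_rate=0.1, learning_rate_decay_factor=0.99,
#       spscale=4, mfcc_num=13, attn_vec_size=64,
#       use_lstm=False, output_keep_prob=0.8, num_samples=512,
#       forward_only=False, dropout=True)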