def _apply_transposed(self, is_train, x):
    w_init = get_keras_initialization(self.w_init)
    r_init = None if self.recurrent_init is None else get_keras_initialization(self.recurrent_init)
    x_size = x.shape.as_list()[-1]
    if x_size is None:
        raise ValueError("Last dimension must be defined (have shape %s)" % str(x.shape))

    if self._kind == "GRU":
        cell = cudnn_rnn_ops.CudnnGRU(self.n_layers, self.n_units, x_size, input_mode="linear_input")
    elif self._kind == "LSTM":
        cell = cudnn_rnn_ops.CudnnLSTM(self.n_layers, self.n_units, x_size, input_mode="linear_input")
    else:
        raise ValueError()

    n_params = cell.params_size().eval()
    weights, biases = cell.params_to_canonical(tf.zeros([n_params]))

    def init(shape, dtype=None, partition_info=None):
        # This is a bit hacky, since the API for these models is awkward. We have to compute the shape
        # of the weights / biases by calling `cell.params_to_canonical` with an unused tensor, and then
        # use .eval() to actually get the shape. Then we can apply the user-requested initializers.
        if self._kind == "LSTM":
            is_recurrent = [False, False, False, False, True, True, True, True]
            is_forget_bias = [False, True, False, False, False, True, False, False]
        else:
            is_recurrent = [False, False, False, True, True, True]
            is_forget_bias = [False] * 6

        init_biases = [
            tf.constant(self.lstm_bias / 2.0, tf.float32, (self.n_units,))
            if z else tf.zeros(self.n_units)
            for z in is_forget_bias
        ]
        init_weights = []
        for w, r in zip(weights, is_recurrent):
            if r and r_init is not None:
                init_weights.append(
                    tf.reshape(r_init((self.n_units, self.n_units), w.dtype), tf.shape(w)))
            else:
                init_weights.append(w_init(tf.shape(w).eval(), w.dtype))
        out = cell.canonical_to_params(init_weights, init_biases)
        out.set_shape((n_params,))
        return out

    parameters = tf.get_variable("gru_parameters", n_params, tf.float32, initializer=init)

    if self.keep_recurrent < 1:
        # Not super well tested: figure out which indices in `parameters` are recurrent weights
        # and drop them. This implements drop-connect for the recurrent weights.
        is_recurrent = weights[:len(weights) // 2] + [tf.ones_like(w) for w in weights[len(weights) // 2:]]
        recurrent_mask = cell.canonical_to_params(is_recurrent, biases)  # ones at recurrent weights
        recurrent_mask = 1 - recurrent_mask * (1 - self.keep_recurrent)  # ones at non-recurrent params, keep_prob at recurrent ones
        parameters = tf.cond(
            is_train,
            lambda: tf.floor(tf.random_uniform((n_params,)) + recurrent_mask) * parameters,
            lambda: parameters)

    if self._kind == "LSTM":
        if self.learn_initial_states:
            raise NotImplementedError()
        else:
            initial_state_h = tf.zeros((self.n_layers, tf.shape(x)[1], self.n_units), tf.float32)
            initial_state_c = tf.zeros((self.n_layers, tf.shape(x)[1], self.n_units), tf.float32)
            out = cell(x, initial_state_h, initial_state_c, parameters, True)
    else:
        if self.learn_initial_states:
            initial_state = tf.get_variable("initial_state", self.n_units, tf.float32,
                                            tf.zeros_initializer())
            initial_state = tf.tile(tf.expand_dims(tf.expand_dims(initial_state, 0), 0),
                                    [self.n_layers, tf.shape(x)[1], 1])
        else:
            initial_state = tf.zeros((self.n_layers, tf.shape(x)[1], self.n_units), tf.float32)
        out = cell(x, initial_state, parameters, True)
    return out
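
# Example: the keep_recurrent branch above builds a Bernoulli mask with
# tf.floor(tf.random_uniform(...) + keep_prob), which is 1 with probability
# keep_prob and 0 otherwise, i.e. drop-connect on the recurrent weights.
# A minimal, self-contained sketch of that masking trick; `drop_connect` is a
# hypothetical helper name, not part of the module above, and (like the
# original) it does not rescale the surviving weights by 1 / keep_prob.
import tensorflow as tf

def drop_connect(weights, keep_prob, is_train):
    """Zero out each element of `weights` with probability 1 - keep_prob at train time."""
    def masked():
        # uniform in [0, 1): adding keep_prob and flooring yields a 0/1 Bernoulli(keep_prob) mask
        mask = tf.floor(tf.random_uniform(tf.shape(weights)) + keep_prob)
        return mask * weights
    return tf.cond(is_train, masked, lambda: weights)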
def _add_encoder(self, inputs, sent_lens, doc_lens, transpose_output=False):
    hps = self._hps

    # Masking the word embeddings
    sent_lens_rsp = tf.reshape(sent_lens, [-1])  # [batch_size * num_sentences]
    word_masks = tf.expand_dims(
        tf.sequence_mask(sent_lens_rsp, maxlen=hps.num_words_sent, dtype=tf.float32),
        2)  # [batch_size * num_sentences, num_words_sent, 1]

    inputs_rsp = tf.reshape(inputs, [-1, hps.num_words_sent])
    emb_inputs = tf.nn.embedding_lookup(
        self._input_embed, inputs_rsp)  # [batch_size * num_sentences, num_words_sent, emb_size]
    emb_inputs = emb_inputs * word_masks

    # Level 1: Add the word-level convolutional neural network
    word_conv_outputs = []
    for k_size in hps.word_conv_k_sizes:
        # Create CNNs with different kernel widths
        word_conv_k = tf.layers.conv1d(
            emb_inputs,
            hps.word_conv_filter, (k_size,),
            padding="same",
            kernel_initializer=tf.random_uniform_initializer(-0.1, 0.1))
        mean_pool_sent = tf.reduce_mean(
            word_conv_k, axis=1)  # [batch_size * num_sentences, word_conv_filter]
        word_conv_outputs.append(mean_pool_sent)

    word_conv_output = tf.concat(
        word_conv_outputs, axis=1)  # concat the sentence representations

    # Reshape the representations of sentences
    sentence_size = len(hps.word_conv_k_sizes) * hps.word_conv_filter
    sentence_repr = tf.reshape(
        word_conv_output,
        [-1, hps.num_sentences, sentence_size])  # [batch_size, num_sentences, sentence_size]

    # Level 2: Add the sentence-level RNN
    enc_model = cudnn_rnn_ops.CudnnGRU(
        hps.enc_layers,
        hps.enc_num_hidden,
        sentence_size,
        direction="bidirectional",
        dropout=hps.dropout)
    # Compute the total size of RNN params (Tensor)
    params_size_ts = enc_model.params_size()
    params = tf.Variable(
        tf.random_uniform([params_size_ts], minval=-0.1, maxval=0.1),
        validate_shape=False,
        name="encoder_cudnn_gru_var")
    batch_size_ts = tf.shape(inputs)[0]  # batch size Tensor
    init_state = tf.zeros(tf.stack([2, batch_size_ts, hps.enc_num_hidden]))
    # init_c = tf.zeros(tf.stack([2, batch_size_ts, hps.enc_num_hidden]))

    # Call the CudnnGRU
    sentence_vecs_t = tf.transpose(sentence_repr, [1, 0, 2])
    sent_rnn_output, _ = enc_model(
        input_data=sentence_vecs_t,
        input_h=init_state,
        params=params)  # [num_sentences, batch_size, enc_num_hidden*2]

    # Masking the paddings
    sent_out_masks = tf.sequence_mask(
        doc_lens, hps.num_sentences, tf.float32)  # [batch_size, num_sentences]
    sent_out_masks = tf.expand_dims(
        tf.transpose(sent_out_masks), 2)  # [num_sentences, batch_size, 1]
    sent_rnn_output = sent_rnn_output * sent_out_masks  # [num_sentences, batch_size, enc_num_hidden*2]

    if transpose_output:
        sent_rnn_output = tf.transpose(
            sent_rnn_output, [1, 0, 2])  # [batch_size, num_sentences, enc_num_hidden*2]

    return sent_rnn_output
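
# Example: the length-masking pattern used in _add_encoder above, where
# tf.sequence_mask zeroes out padded positions before pooling and after the
# RNN. A minimal sketch with illustrative shapes (not taken from the model):
import tensorflow as tf

lengths = tf.constant([3, 1])                                 # true lengths of two sequences
embeddings = tf.ones([2, 4, 5])                               # [batch, max_len, emb_size]
mask = tf.sequence_mask(lengths, maxlen=4, dtype=tf.float32)  # [batch, max_len]
masked = embeddings * tf.expand_dims(mask, 2)                 # zero out padded time steps

with tf.Session() as sess:
    print(sess.run(masked)[1])  # positions 1..3 of the second example are all zeros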
def convert(model_dir, output_dir, best_weights=False):
    print("Load model")
    md = ModelDir(model_dir)
    model = md.get_model()
    dim = model.embed_mapper.layers[1].n_units
    global_step = tf.get_variable('global_step', shape=[], dtype='int32',
                                  initializer=tf.constant_initializer(0), trainable=False)

    print("Setting up cudnn version")
    # global_step = tf.get_variable('global_step', shape=[], dtype='int32', trainable=False)
    sess = tf.Session()
    with sess.as_default():
        model.set_input_spec(
            ParagraphAndQuestionSpec(1, None, None, 14), {"the"},
            ResourceLoader(lambda a, b: {"the": np.zeros(300, np.float32)}))

        print("Building graph")
        pred = model.get_prediction()

    test_questions = ParagraphAndQuestion(
        ["Harry", "Potter", "was", "written", "by", "JK"],
        ["Who", "wrote", "Harry", "Potter", "?"], None, "test_questions")

    print("Load vars")
    md.restore_checkpoint(sess)

    feed = model.encode([test_questions], False)
    cudnn_out = sess.run([pred.start_logits, pred.end_logits], feed_dict=feed)

    print("Done, copying files...")
    if not exists(output_dir):
        mkdir(output_dir)
    for file in listdir(model_dir):
        if isfile(file) and file != "model.npy":
            copyfile(join(model_dir, file), join(output_dir, file))

    print("Done, mapping tensors...")
    to_save = []
    to_init = []
    for x in tf.trainable_variables():
        if x.name.endswith("/gru_parameters:0"):
            key = x.name[:-len("/gru_parameters:0")]
            fw_params = x
            if "map_embed" in x.name:
                c = cudnn_rnn_ops.CudnnGRU(1, dim, 400)
            elif "chained-out" in x.name:
                c = cudnn_rnn_ops.CudnnGRU(1, dim, dim * 4)
            else:
                c = cudnn_rnn_ops.CudnnGRU(1, dim, dim * 2)
            params_saveable = cudnn_rnn_ops.RNNParamsSaveable(
                c, c.params_to_canonical, c.canonical_to_params, [fw_params], key)

            for spec in params_saveable.specs:
                if spec.name.endswith("bias_cudnn 0") or \
                        spec.name.endswith("bias_cudnn 1"):
                    # ??? What do these even do?
                    continue
                name = spec.name.split("/")
                name.remove("cell_0")
                if "forward" in name:
                    ix = name.index("forward")
                    name.insert(ix + 2, "fw")
                else:
                    ix = name.index("backward")
                    name.insert(ix + 2, "bw")
                del name[ix]
                ix = name.index("multi_rnn_cell")
                name[ix] = "bidirectional_rnn"
                name = "/".join(name)
                v = tf.Variable(sess.run(spec.tensor), name=name)
                to_init.append(v)
                to_save.append(v)
        else:
            to_save.append(x)

    other = [
        x for x in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        if x not in tf.trainable_variables()
    ]
    print(other)
    sess.run(tf.initialize_variables(to_init))
    saver = tf.train.Saver(to_save + other)
    save_dir = join(output_dir, "save")
    if not exists(save_dir):
        mkdir(save_dir)

    saver.save(sess, join(save_dir, "checkpoint"), sess.run(global_step))

    sess.close()
    tf.reset_default_graph()

    print("Updating model...")
    model.embed_mapper.layers = [
        model.embed_mapper.layers[0],
        BiRecurrentMapper(CompatGruCellSpec(dim))
    ]
    model.match_encoder.layers = list(model.match_encoder.layers)
    other = model.match_encoder.layers[1].other
    other.layers = list(other.layers)
    other.layers[1] = BiRecurrentMapper(CompatGruCellSpec(dim))
    pred = model.predictor.predictor
    pred.first_layer = BiRecurrentMapper(CompatGruCellSpec(dim))
    pred.second_layer = BiRecurrentMapper(CompatGruCellSpec(dim))
    with open(join(output_dir, "model.pkl"), "wb") as f:
        pickle.dump(model, f)

    print("Testing...")
    with open(join(output_dir, "model.pkl"), "rb") as f:
        model = pickle.load(f)

    sess = tf.Session()
    model.set_input_spec(
        ParagraphAndQuestionSpec(1, None, None, 14), {"the"},
        ResourceLoader(lambda a, b: {"the": np.zeros(300, np.float32)}))
    pred = model.get_prediction()

    print("Rebuilding")
    saver = tf.train.Saver()
    saver.restore(sess, tf.train.latest_checkpoint(save_dir))

    feed = model.encode([test_questions], False)
    cpu_out = sess.run([pred.start_logits, pred.end_logits], feed_dict=feed)
    print("These should be close:")
    print([np.allclose(a, b) for a, b in zip(cpu_out, cudnn_out)])
    print(cpu_out)
    print(cudnn_out)
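
# Example: the conversion above works because CudnnGRU keeps all of its weights
# in a single packed parameter buffer; params_to_canonical() splits that buffer
# into per-layer, per-gate weight/bias tensors that a CPU GRU cell can consume,
# and canonical_to_params() packs them back. A minimal round-trip sketch using
# the same TF 1.x contrib API as the code above; the import path assumes the
# contrib layout this file relies on, the sizes are illustrative, and the ops
# only run on a build where the cudnn_rnn kernels are actually available (GPU).
import tensorflow as tf
from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops

cell = cudnn_rnn_ops.CudnnGRU(1, 8, 4)  # (num_layers, num_units, input_size)
with tf.Session() as sess:
    n_params = sess.run(cell.params_size())
    packed = tf.zeros([n_params])                         # stand-in for a trained buffer
    weights, biases = cell.params_to_canonical(packed)    # per-layer, per-gate tensors
    repacked = cell.canonical_to_params(weights, biases)  # inverse mapping
    print(len(weights), len(biases), sess.run(tf.shape(repacked)))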
def __init__(self, is_training, batch_size, num_unrollings, vocab_size,
             hidden_size, max_grad_norm, embedding_size, num_layers,
             learning_rate, model, dropout=0.0, input_dropout=0.0, use_batch=True):
    self.batch_size = batch_size
    self.num_unrollings = num_unrollings
    if not use_batch:
        self.batch_size = 1
        self.num_unrollings = 1
    self.hidden_size = hidden_size
    self.vocab_size = vocab_size
    self.max_grad_norm = max_grad_norm
    self.num_layers = num_layers
    self.embedding_size = embedding_size
    self.model = model
    self.dropout = dropout
    self.input_dropout = input_dropout
    if embedding_size <= 0:
        self.input_size = vocab_size
        # Don't do dropout on one hot representation.
        self.input_dropout = 0.0
    else:
        self.input_size = embedding_size
    self.model_size = (
        embedding_size * vocab_size +  # embedding parameters
        # lstm parameters
        4 * hidden_size * (hidden_size + self.input_size + 1) +
        # softmax parameters
        vocab_size * (hidden_size + 1) +
        # multilayer lstm parameters for extra layers.
        (num_layers - 1) * 4 * hidden_size * (hidden_size + hidden_size + 1))
    # self.decay_rate = decay_rate

    # Placeholder to feed in input and targets/labels data.
    self.input_data = tf.placeholder(
        tf.int64, [self.batch_size, self.num_unrollings], name='inputs')
    self.targets = tf.placeholder(
        tf.int64, [self.batch_size, self.num_unrollings], name='targets')

    #################################################
    # NEED TO REPLACE ALL CELL CODE
    # if self.model == 'rnn':
    #     cell_fn = tf.contrib.rnn.BasicRNNCell
    # elif self.model == 'lstm':
    #     cell_fn = tf.contrib.rnn.BasicLSTMCell
    # elif self.model == 'gru':
    #     cell_fn = tf.contrib.rnn.GRUCell
    # # params = {'input_size': self.input_size}
    # params = {}
    # if self.model == 'lstm':
    #     # add bias to forget gate in lstm.
    #     params['forget_bias'] = 0.0
    #     params['state_is_tuple'] = True
    # # Create multilayer cell.
    # cell = cell_fn(
    #     self.hidden_size, reuse=tf.get_variable_scope().reuse,
    #     **params)
    # cells = [cell]
    # # params['input_size'] = self.hidden_size
    # # more explicit way to create cells for MultiRNNCell than
    # # [higher_layer_cell] * (self.num_layers - 1)
    # for i in range(self.num_layers - 1):
    #     higher_layer_cell = cell_fn(
    #         self.hidden_size, reuse=tf.get_variable_scope().reuse,
    #         **params)
    #     cells.append(higher_layer_cell)
    # if is_training and self.dropout > 0:
    #     cells = [tf.contrib.rnn.DropoutWrapper(
    #         cell,
    #         output_keep_prob=1.0 - self.dropout)
    #         for cell in cells]
    # multi_cell = tf.contrib.rnn.MultiRNNCell(cells)
    # with tf.name_scope('initial_state'):
    #     # zero_state is used to compute the initial state for cell.
    #     self.zero_state = multi_cell.zero_state(self.batch_size, tf.float32)
    #     # Placeholder to feed in initial state.
    #     # self.initial_state = tf.placeholder(
    #     #     tf.float32,
    #     #     [self.batch_size, multi_cell.state_size],
    #     #     'initial_state')
    #     self.initial_state = create_tuple_placeholders_with_default(
    #         multi_cell.zero_state(batch_size, tf.float32),
    #         extra_dims=(None,),
    #         shape=multi_cell.state_size)
    ######## MIGHT NEED THIS STUFF ##################

    # Embeddings layers.
    with tf.name_scope('embedding_layer'):
        if embedding_size > 0:
            self.embedding = tf.get_variable(
                'embedding', [self.vocab_size, self.embedding_size])
        else:
            self.embedding = tf.constant(np.eye(self.vocab_size), dtype=tf.float32)

        inputs = tf.nn.embedding_lookup(self.embedding, self.input_data)
        if is_training and self.input_dropout > 0:
            inputs = tf.nn.dropout(inputs, 1 - self.input_dropout)

    with tf.name_scope('slice_inputs'):
        # Slice inputs into a list of shape [batch_size, 1] data columns.
        sliced_inputs = [
            tf.squeeze(input_, [1]) for input_ in tf.split(
                axis=1, num_or_size_splits=self.num_unrollings, value=inputs)
        ]

    # Copy cell to do unrolling and collect outputs.
    # outputs, final_state = tf.contrib.rnn.static_rnn(
    #     multi_cell, sliced_inputs,
    #     initial_state=self.initial_state)

    ########################
    # Insert MIOpen
    if self.model == 'lstm':
        model = cudnn_rnn_ops.CudnnLSTM(self.num_layers, self.hidden_size,
                                        self.embedding_size, dropout=self.dropout)
    elif self.model == 'gru':
        model = cudnn_rnn_ops.CudnnGRU(self.num_layers, self.hidden_size,
                                       self.embedding_size, dropout=self.dropout)
    elif self.model == 'rnn':
        model = cudnn_rnn_ops.CudnnRNNTanh(self.num_layers, self.hidden_size,
                                           self.embedding_size, dropout=self.dropout)
    else:
        raise ValueError("Invalid model: %s" % self.model)

    # Set zero init input states
    input_h = constant_op.constant(
        np.zeros([self.num_layers, self.num_unrollings, self.hidden_size]),
        dtype=tf.float32)
    has_input_c = (self.model == 'lstm')
    if has_input_c:
        input_c = constant_op.constant(
            np.zeros([self.num_layers, self.num_unrollings, self.hidden_size]),
            dtype=tf.float32)

    # Set rnn params
    params_size_t = model.params_size()
    rand_params = random_ops.random_uniform(params_size_t.shape)
    print("PARAMS size")
    print(params_size_t)
    print(rand_params.shape)
    print("Input sizes")
    print(input_h)
    if has_input_c:
        print(input_c)
    print("Batch size")
    print(batch_size)
    print("Hidden size")
    print(self.hidden_size)
    # rand_params.set_shape(params_size_t.shape)
    params = variables.Variable(rand_params, validate_shape=True)

    args = {
        "input_data": inputs,
        "input_h": input_h,
        "params": params,
        "is_training": is_training
    }
    if has_input_c:
        args["input_c"] = input_c

    # Build cell: CudnnLSTM returns (output, output_h, output_c), the other
    # cudnn cells return only (output, output_h).
    if self.model == 'lstm':
        outputs, final_state, final_cell = model(
            input_data=inputs, input_h=input_h, input_c=input_c, params=params)
    else:
        outputs, final_state = model(
            input_data=inputs, input_h=input_h, params=params)
    # model(**args)

    self.zero_state = state_ops.assign(
        params, array_ops.zeros(params_size_t.shape))
    self.initial_state = create_tuple_placeholders_with_default(
        self.zero_state, extra_dims=(None,), shape=params_size_t.shape)
    print("Initial State")
    print(self.initial_state)
    ########################

    self.final_state = final_state

    with tf.name_scope('flatten_outputs'):
        # Flatten the outputs into one dimension.
        flat_outputs = tf.reshape(tf.concat(axis=1, values=outputs),
                                  [-1, hidden_size])

    with tf.name_scope('flatten_targets'):
        # Flatten the targets too.
        flat_targets = tf.reshape(tf.concat(axis=1, values=self.targets), [-1])

    # Create softmax parameters, weights and bias.
    with tf.variable_scope('softmax') as sm_vs:
        softmax_w = tf.get_variable("softmax_w", [hidden_size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])
        self.logits = tf.matmul(flat_outputs, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)

    with tf.name_scope('loss'):
        # Compute mean cross entropy loss for each output.
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.logits, labels=flat_targets)
        self.mean_loss = tf.reduce_mean(loss)

    with tf.name_scope('loss_monitor'):
        # Count the number of elements and the sum of mean_loss
        # from each batch to compute the average loss.
        count = tf.Variable(1.0, name='count')
        sum_mean_loss = tf.Variable(1.0, name='sum_mean_loss')

        self.reset_loss_monitor = tf.group(sum_mean_loss.assign(0.0),
                                           count.assign(0.0),
                                           name='reset_loss_monitor')
        self.update_loss_monitor = tf.group(
            sum_mean_loss.assign(sum_mean_loss + self.mean_loss),
            count.assign(count + 1),
            name='update_loss_monitor')
        with tf.control_dependencies([self.update_loss_monitor]):
            self.average_loss = sum_mean_loss / count
            self.ppl = tf.exp(self.average_loss)

        # Monitor the loss.
        loss_summary_name = "average loss"
        ppl_summary_name = "perplexity"

        average_loss_summary = tf.summary.scalar(loss_summary_name, self.average_loss)
        ppl_summary = tf.summary.scalar(ppl_summary_name, self.ppl)

    # Monitor the loss.
    self.summaries = tf.summary.merge([average_loss_summary, ppl_summary],
                                      name='loss_monitor')

    self.global_step = tf.get_variable(
        'global_step', [], initializer=tf.constant_initializer(0.0))

    self.learning_rate = tf.constant(learning_rate)
    if is_training:
        # learning_rate = tf.train.exponential_decay(1.0, self.global_step,
        #                                            5000, 0.1, staircase=True)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(
            tf.gradients(self.mean_loss, tvars), self.max_grad_norm)
        # optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        # optimizer = tf.train.RMSPropOptimizer(learning_rate, decay_rate)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.train_op = optimizer.apply_gradients(
            zip(grads, tvars), global_step=self.global_step)
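
# Example: the loss monitor above accumulates per-batch mean losses and reports
# perplexity as exp(average cross-entropy). The same bookkeeping in plain
# Python, with purely illustrative loss values:
import math

batch_losses = [4.2, 3.9, 3.7]        # hypothetical per-batch mean losses
sum_mean_loss = sum(batch_losses)     # what update_loss_monitor accumulates
count = len(batch_losses)             # what `count` tracks
average_loss = sum_mean_loss / count  # -> 3.933...
perplexity = math.exp(average_loss)   # -> ~51.1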