def __init__(self, cost, generation_length, dataset,
             initial_text_length, softmax_sampling,
             updates, ploting_path=None,
             interactive_mode=False, **kwargs):
    """Store the generation settings and compile the presoft function.

    Parameters
    ----------
    cost : variable whose computation graph contains the variable(s)
        named "presoft".
    generation_length : stored as ``self.generation_length``.
    dataset : dataset identifier, passed to ``get_output_size`` and
        ``has_indices``.
    initial_text_length : stored as ``self.init_length``.
    softmax_sampling : stored as ``self.softmax_sampling``.
    updates : forwarded to ``carry_hidden_state``.
    ploting_path : stored as ``self.ploting_path`` (default ``None``).
    interactive_mode : stored as ``self.interactive_mode``.
    **kwargs : forwarded to the parent class constructor.
    """
    # Generation settings
    self.generation_length = generation_length
    self.init_length = initial_text_length

    # Dataset and the size derived from it
    self.dataset = dataset
    self.output_size = get_output_size(dataset)

    self.ploting_path = ploting_path
    self.softmax_sampling = softmax_sampling
    self.interactive_mode = interactive_mode
    self.has_indices = has_indices(dataset)

    super(TextGenerationExtension, self).__init__(**kwargs)

    # Get presoft and its computation graph.
    # NOTE(review): the filter result is used as returned (not indexed),
    # so `self.generate` is compiled with it as `outputs` -- confirm the
    # callers of `self.generate` expect that.
    cost_variables = ComputationGraph(cost).variables
    presoft = VariableFilter(theano_name="presoft")(cost_variables)
    presoft_graph = ComputationGraph(presoft)

    # Handle the theano shared variables that allow carrying the hidden
    # state
    givens, f_updates = carry_hidden_state(
        updates, 1, reset=not self.has_indices)

    # Compile the theano function
    self.generate = theano.function(
        inputs=presoft_graph.inputs,
        outputs=presoft,
        givens=givens,
        updates=f_updates)
def fine_tuning(cost, args):
    """Load saved parameters, enlarge the output weights and set them.

    The parameter values are read from ``args.load_path``.  The matrix
    stored under "/output_layer.W" is extended along its first axis with
    ``args.state_dim`` new rows of width ``get_output_size(args.dataset)``,
    drawn from a standard normal, cast to float32 and scaled by 0.1.  The
    resulting values are set on a ``Model`` built from `cost`.

    Returns
    -------
    The `cost` argument, unchanged.
    """
    weight_key = "/output_layer.W"

    param_values = load_parameter_values(args.load_path)
    output_size = get_output_size(args.dataset)

    saved_weights = param_values[weight_key]
    # Freshly initialised rows appended below the saved ones
    new_rows = 0.1 * np.random.randn(
        args.state_dim, output_size).astype(np.float32)
    param_values[weight_key] = np.concatenate((saved_weights, new_rows))

    Model(cost).set_parameter_values(param_values)

    return cost
def get_presoft(h, args):
    """Apply the linear output layer to `h` and return the presoft variable.

    The input width of the layer is ``args.layers * args.state_dim`` when
    ``args.skip_connections`` or ``args.skip_output`` is set, or when
    ``args.rnn_type`` is "clockwork" or "soft"; otherwise it is
    ``args.state_dim``.  The output width is
    ``get_output_size(args.dataset)``.

    When ``has_indices(args.dataset)`` is false, ``Tanh`` is applied on top
    of the linear layer.  The returned variable is named 'presoft'.
    """
    output_size = get_output_size(args.dataset)

    # Normalise to a real boolean: an `or` chain returns its first truthy
    # operand, which is not guaranteed to be usable as a 0/1 multiplier.
    use_all_states = bool(
        args.skip_connections or
        args.skip_output or
        args.rnn_type in ("clockwork", "soft"))

    if use_all_states:
        input_dim = args.layers * args.state_dim
    else:
        input_dim = args.state_dim

    output_layer = Linear(
        input_dim=input_dim,
        output_dim=output_size, name="output_layer")
    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    presoft = output_layer.apply(h)
    if not has_indices(args.dataset):
        presoft = Tanh().apply(presoft)
    presoft.name = 'presoft'
    return presoft
def get_presoft(h, args):
    """Build the output layer on top of `h` and return its 'presoft' output.

    Layer dimensions:

    * input: ``args.layers * args.state_dim`` if any of
      ``args.skip_connections``, ``args.skip_output`` or
      ``args.rnn_type in ("clockwork", "soft")`` holds, else
      ``args.state_dim``;
    * output: ``get_output_size(args.dataset)``.

    For datasets where ``has_indices`` is false the linear output is passed
    through ``Tanh``.  The variable that is returned carries the name
    'presoft'.
    """
    output_size = get_output_size(args.dataset)

    # An `or` chain yields its first truthy operand rather than a bool, so
    # convert explicitly instead of multiplying by the raw result.
    use_all_states = bool(
        args.skip_connections or
        args.skip_output or
        args.rnn_type in ("clockwork", "soft"))

    input_dim = args.state_dim
    if use_all_states:
        input_dim = args.layers * args.state_dim

    output_layer = Linear(
        input_dim=input_dim,
        output_dim=output_size, name="output_layer")
    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    presoft = output_layer.apply(h)
    if not has_indices(args.dataset):
        presoft = Tanh().apply(presoft)
    presoft.name = 'presoft'
    return presoft
def __init__(self, cost, generation_length, dataset,
             initial_text_length, softmax_sampling,
             updates, ploting_path=None,
             interactive_mode=False, **kwargs):
    """Record the generation options and compile ``self.generate``.

    Parameters
    ----------
    cost : variable whose computation graph is searched for variables
        named "presoft".
    generation_length : kept in ``self.generation_length``.
    dataset : dataset identifier; also used to derive
        ``self.output_size`` and ``self.has_indices``.
    initial_text_length : kept in ``self.init_length``.
    softmax_sampling : kept in ``self.softmax_sampling``.
    updates : handed to ``carry_hidden_state``.
    ploting_path : kept in ``self.ploting_path``; defaults to ``None``.
    interactive_mode : kept in ``self.interactive_mode``.
    **kwargs : passed on to the parent constructor.
    """
    self.generation_length = generation_length
    self.init_length = initial_text_length
    self.dataset = dataset
    self.output_size = get_output_size(dataset)
    self.ploting_path = ploting_path
    self.softmax_sampling = softmax_sampling
    self.interactive_mode = interactive_mode
    self.has_indices = has_indices(dataset)

    super(TextGenerationExtension, self).__init__(**kwargs)

    # Get presoft and its computation graph.
    # NOTE(review): the filtered result is passed on without indexing, so
    # it is what `theano.function` receives as `outputs` -- verify that
    # the users of `self.generate` rely on this.
    select_presoft = VariableFilter(theano_name="presoft")
    presoft = select_presoft(ComputationGraph(cost).variables)
    graph = ComputationGraph(presoft)

    # Handle the theano shared variables that allow carrying the hidden
    # state
    reset_state = not self.has_indices
    givens, f_updates = carry_hidden_state(updates, 1, reset=reset_state)

    # Compile the theano function
    self.generate = theano.function(inputs=graph.inputs,
                                    outputs=presoft,
                                    givens=givens,
                                    updates=f_updates)
def visualize_generate(cost, hidden_states, updates,
                       train_stream, valid_stream,
                       args):
    """Run the compiled presoft function on training batches and show it.

    Up to 10 batches are taken from ``train_stream``; only the first
    sequence of each batch (``[:, 0:1]``) is used.

    * If ``has_indices(args.dataset)`` is true, the first
      ``args.initial_text_length`` steps seed a generation loop of
      ``args.generated_text_lenght`` steps: the last presoft row is turned
      into probabilities with ``softmax``, a value is drawn with
      ``sample`` and appended to the text.  The initial and the complete
      sentence are then logged and, when ``args.save_path`` is set, the
      probabilities are plotted to ``<save_path>/prob_plot.png``.
    * Otherwise the presoft output for the whole sequence is plotted
      against the targets with matplotlib.

    Parameters
    ----------
    cost : variable whose graph contains the variable named "presoft".
    hidden_states : not used by this function.
    updates : forwarded to ``carry_hidden_state``.
    train_stream : stream providing ``get_epoch_iterator()``.
    valid_stream : not used by this function.
    args : namespace with the options read above.
    """
    # Local import: only this function needs it.
    from itertools import islice

    use_indices = has_indices(args.dataset)
    output_size = get_output_size(args.dataset)

    # Get presoft and its computation graph
    filter_presoft = VariableFilter(theano_name="presoft")
    presoft_var = filter_presoft(ComputationGraph(cost).variables)[0]
    cg = ComputationGraph(presoft_var)

    # Handle the theano shared variables that allow carrying the hidden
    # state
    givens, f_updates = carry_hidden_state(updates, 1,
                                           reset=not use_indices)

    # Compile the theano function
    compiled = theano.function(inputs=cg.inputs, outputs=presoft_var,
                               givens=givens, updates=f_updates)

    # islice stops quietly when the epoch holds fewer than 10 batches
    # instead of letting StopIteration escape.
    for batch in islice(train_stream.get_epoch_iterator(), 10):
        all_sequence = batch[0][:, 0:1]
        targets = batch[1][:, 0:1]

        # In the case of characters and text
        if use_indices:
            init_ = all_sequence[:args.initial_text_length]

            # These do not change during the generation loop
            argmax = (args.softmax_sampling == 'argmax')
            ploting_path = None
            if args.save_path is not None:
                ploting_path = os.path.join(args.save_path, 'prob_plot.png')

            # Time X Features
            probability_array = np.zeros((0, output_size))
            generated_text = init_

            for _ in range(args.generated_text_lenght):
                presoft_values = compiled(generated_text)
                # Get the last value of presoft
                last_presoft = presoft_values[-1:, 0, :]

                # Compute the probability distribution
                probabilities = softmax(last_presoft)
                # Store it in the list
                probability_array = np.vstack([probability_array,
                                               probabilities])

                # Sample a character out of the probability distribution
                last_output_sample = sample(
                    probabilities, argmax)[:, None, :]

                # Concatenate the new value to the text
                generated_text = np.vstack(
                    [generated_text, last_output_sample])

            # Convert with real characters
            whole_sentence = conv_into_char(
                generated_text[:, 0], args.dataset)
            initial_sentence = whole_sentence[:init_.shape[0]]
            selected_sentence = whole_sentence[init_.shape[0]:]

            logger.info(''.join(initial_sentence) + '...')
            logger.info(''.join(whole_sentence))

            if ploting_path is not None:
                probability_plot(probability_array, selected_sentence,
                                 args.dataset, ploting_path)

        # In the case of sine wave dataset for example
        else:
            presoft_values = compiled(all_sequence)

            time_plot = presoft_values.shape[0] - 1
            steps = np.arange(time_plot)

            plt.plot(steps, targets[:time_plot, 0, 0], label="target")
            plt.plot(steps, presoft_values[:time_plot, 0, 0],
                     label="predicted")
            plt.legend()
            plt.grid(True)
            plt.show()
def visualize_generate(cost, hidden_states, updates,
                       train_stream, valid_stream,
                       args):
    """Visualize what the model produces on batches of ``train_stream``.

    At most 10 batches are consumed, and from each batch only the first
    sequence (``[:, 0:1]``) is kept.

    * When ``has_indices(args.dataset)`` is true: starting from the first
      ``args.initial_text_length`` steps, ``args.generated_text_lenght``
      new steps are generated.  At each step the last presoft row goes
      through ``softmax`` and ``sample``, and the sample is appended to
      the text.  The initial and full sentences are logged, and if
      ``args.save_path`` is set the collected probabilities are plotted to
      ``<save_path>/prob_plot.png``.
    * Otherwise: presoft is computed for the whole sequence and plotted
      together with the targets using matplotlib.

    Parameters
    ----------
    cost : variable whose graph contains the variable named "presoft".
    hidden_states : not used by this function.
    updates : forwarded to ``carry_hidden_state``.
    train_stream : stream providing ``get_epoch_iterator()``.
    valid_stream : not used by this function.
    args : namespace with the options read above.
    """
    # Local import: only this function needs it.
    from itertools import islice

    use_indices = has_indices(args.dataset)
    output_size = get_output_size(args.dataset)

    # Get presoft and its computation graph
    cost_variables = ComputationGraph(cost).variables
    presoft_var = VariableFilter(theano_name="presoft")(cost_variables)[0]
    cg = ComputationGraph(presoft_var)

    # Handle the theano shared variables that allow carrying the hidden
    # state
    givens, f_updates = carry_hidden_state(updates, 1,
                                           reset=not use_indices)

    # Compile the theano function
    compiled = theano.function(inputs=cg.inputs, outputs=presoft_var,
                               givens=givens, updates=f_updates)

    # Bounded iteration: a short epoch ends the loop instead of raising
    # StopIteration from a bare next().
    for batch in islice(train_stream.get_epoch_iterator(), 10):
        all_sequence = batch[0][:, 0:1]
        targets = batch[1][:, 0:1]

        # In the case of sine wave dataset for example
        if not use_indices:
            presoft_values = compiled(all_sequence)

            time_plot = presoft_values.shape[0] - 1
            steps = np.arange(time_plot)

            plt.plot(steps, targets[:time_plot, 0, 0], label="target")
            plt.plot(steps, presoft_values[:time_plot, 0, 0],
                     label="predicted")
            plt.legend()
            plt.grid(True)
            plt.show()
            continue

        # In the case of characters and text
        init_ = all_sequence[:args.initial_text_length]

        # Constant for the whole generation loop
        argmax = (args.softmax_sampling == 'argmax')
        ploting_path = None
        if args.save_path is not None:
            ploting_path = os.path.join(args.save_path, 'prob_plot.png')

        # Time X Features
        probability_array = np.zeros((0, output_size))
        generated_text = init_

        for _ in range(args.generated_text_lenght):
            presoft_values = compiled(generated_text)
            # Get the last value of presoft
            last_presoft = presoft_values[-1:, 0, :]

            # Compute the probability distribution
            probabilities = softmax(last_presoft)
            # Store it in the list
            probability_array = np.vstack([probability_array,
                                           probabilities])

            # Sample a character out of the probability distribution
            last_output_sample = sample(probabilities, argmax)[:, None, :]

            # Concatenate the new value to the text
            generated_text = np.vstack(
                [generated_text, last_output_sample])

        # Convert with real characters
        whole_sentence = conv_into_char(
            generated_text[:, 0], args.dataset)
        initial_sentence = whole_sentence[:init_.shape[0]]
        selected_sentence = whole_sentence[init_.shape[0]:]

        logger.info(''.join(initial_sentence) + '...')
        logger.info(''.join(whole_sentence))

        if ploting_path is not None:
            probability_plot(probability_array, selected_sentence,
                             args.dataset, ploting_path)
def get_prernn(args):
    """Build the input fork and return ``(prernn, x_mask)``.

    The fork has one output for the first layer, plus one per additional
    layer when ``args.skip_connections`` is set.  Each output has width
    ``args.state_dim`` (``4 * args.state_dim`` when ``args.rnn_type`` is
    'lstm').

    The forked brick is a ``LookupTable`` when
    ``has_indices(args.dataset)`` is true and a ``Linear`` brick otherwise.
    If the dataset has no mask (``has_mask`` is false) the symbolic mask is
    replaced by a tensor of ones.

    Returns
    -------
    prernn : result of ``fork.apply(x)``; the outputs are named
        "pre_rnn_<t>" with skip connections, "pre_rnn" otherwise.
    x_mask : the mask variable.
    """
    # time x batch
    x_mask = tensor.fmatrix('mask')

    # Compute the state dim
    state_dim = args.state_dim
    if args.rnn_type == 'lstm':
        state_dim = 4 * args.state_dim

    # Prepare the arguments for the fork: layer 0 always gets an input,
    # the other layers only with skip connections
    fed_layers = [d for d in range(args.layers)
                  if d == 0 or args.skip_connections]
    output_names = [
        "inputs" + (RECURRENTSTACK_SEPARATOR + str(d) if d > 0 else '')
        for d in fed_layers]
    output_dims = [state_dim] * len(fed_layers)

    # Prepare the brick to be forked (LookupTable or Linear)
    # Check if the dataset provides indices (in the case of a
    # fixed vocabulary, x is 2D tensor) or if it gives raw values
    # (x is 3D tensor)
    if has_indices(args.dataset):
        features = args.mini_batch_size
        x = tensor.lmatrix('features')
        vocab_size = get_output_size(args.dataset)

        lookup = LookupTable(length=vocab_size, dim=state_dim)
        lookup.weights_init = initialization.IsotropicGaussian(0.1)
        lookup.biases_init = initialization.Constant(0)
        forked = FeedforwardSequence([lookup.apply])

        if not has_mask(args.dataset):
            x_mask = tensor.ones_like(x, dtype=floatX)
    else:
        x = tensor.tensor3('features', dtype=floatX)
        if args.used_inputs is not None:
            # Zero out every time step from `used_inputs` onwards
            tail = x[args.used_inputs:, :, :]
            x = tensor.set_subtensor(
                tail, tensor.zeros_like(tail, dtype=floatX))
        features = get_output_size(args.dataset)

        forked = Linear(input_dim=features, output_dim=state_dim)
        forked.weights_init = initialization.IsotropicGaussian(0.1)
        forked.biases_init = initialization.Constant(0)

        if not has_mask(args.dataset):
            x_mask = tensor.ones_like(x[:, :, 0], dtype=floatX)

    # Define the fork
    fork = Fork(output_names=output_names, input_dim=features,
                output_dims=output_dims,
                prototype=forked)
    fork.initialize()

    # Apply the fork
    prernn = fork.apply(x)

    # Give a name to the input of each layer
    if args.skip_connections:
        for t, layer_input in enumerate(prernn):
            layer_input.name = "pre_rnn_" + str(t)
    else:
        prernn.name = "pre_rnn"

    return prernn, x_mask
def get_prernn(args):
    """Create the fork feeding the recurrent layers; return it with a mask.

    One fork output is created for layer 0 and, when
    ``args.skip_connections`` is set, one for each further layer.  Every
    output is ``args.state_dim`` wide, or ``4 * args.state_dim`` when
    ``args.rnn_type`` is 'lstm'.

    With ``has_indices(args.dataset)`` the forked brick is a
    ``LookupTable``; otherwise it is a ``Linear`` brick.  When
    ``has_mask(args.dataset)`` is false, the mask becomes a tensor of ones.

    Returns
    -------
    prernn : what ``fork.apply(x)`` returns, named "pre_rnn_<t>" per output
        with skip connections and "pre_rnn" otherwise.
    x_mask : the mask variable.
    """
    # time x batch
    x_mask = tensor.fmatrix('mask')

    # Compute the state dim
    state_dim = 4 * args.state_dim if args.rnn_type == 'lstm' \
        else args.state_dim

    # Prepare the arguments for the fork
    output_names = [
        "inputs" + ('' if d == 0 else RECURRENTSTACK_SEPARATOR + str(d))
        for d in range(args.layers)
        if d == 0 or args.skip_connections]
    output_dims = [state_dim for _ in output_names]

    # Prepare the brick to be forked (LookupTable or Linear)
    # Check if the dataset provides indices (in the case of a
    # fixed vocabulary, x is 2D tensor) or if it gives raw values
    # (x is 3D tensor)
    if has_indices(args.dataset):
        features = args.mini_batch_size
        x = tensor.lmatrix('features')
        vocab_size = get_output_size(args.dataset)

        lookup = LookupTable(length=vocab_size, dim=state_dim)
        lookup.weights_init = initialization.IsotropicGaussian(0.1)
        lookup.biases_init = initialization.Constant(0)
        forked = FeedforwardSequence([lookup.apply])

        if not has_mask(args.dataset):
            x_mask = tensor.ones_like(x, dtype=floatX)
    else:
        x = tensor.tensor3('features', dtype=floatX)
        if args.used_inputs is not None:
            # Replace the steps from `used_inputs` onwards with zeros
            hidden_part = x[args.used_inputs:, :, :]
            x = tensor.set_subtensor(
                hidden_part,
                tensor.zeros_like(hidden_part, dtype=floatX))
        features = get_output_size(args.dataset)

        forked = Linear(input_dim=features, output_dim=state_dim)
        forked.weights_init = initialization.IsotropicGaussian(0.1)
        forked.biases_init = initialization.Constant(0)

        if not has_mask(args.dataset):
            x_mask = tensor.ones_like(x[:, :, 0], dtype=floatX)

    # Define the fork
    fork = Fork(output_names=output_names, input_dim=features,
                output_dims=output_dims,
                prototype=forked)
    fork.initialize()

    # Apply the fork
    prernn = fork.apply(x)

    # Give a name to the input of each layer
    if not args.skip_connections:
        prernn.name = "pre_rnn"
    else:
        for t, pre in enumerate(prernn):
            pre.name = "pre_rnn_" + str(t)

    return prernn, x_mask
def run_visualizations(cost, updates,
                       train_stream, valid_stream,
                       args,
                       hidden_states=None, gate_values=None):
    """Load a dumped model and run the visualization named by the options.

    The parameter values are loaded from ``args.load_path`` and set on a
    ``Model`` built from `cost`.  When ``args.hide_all_except`` is not
    ``None``, "/output_layer.W" is first replaced by a zero matrix in which
    only the rows ``[i * state_dim, (i + 1) * state_dim)`` of the loaded
    weights are kept, with ``i = args.hide_all_except``.

    ``args.visualize`` then selects the routine: "generate", "gates"
    (needs `gate_values` and ``args.rnn_type`` "lstm" or "soft"),
    "states", "gradients", "jacobian", "presoft", "matrices",
    "trained_singular_values" or "gradients_flow_pie".

    Raises
    ------
    AssertionError
        If ``args.load_path`` is ``None``, if gate visualization is
        requested for another ``args.rnn_type``, or if ``args.visualize``
        cannot be served (unknown name, or "gates" without `gate_values`).
    """
    # Explicit raises rather than `assert`, so the checks are still
    # performed when Python runs with -O.
    if args.load_path is None:
        raise AssertionError(
            "args.load_path must be set to run a visualization")

    # Load the parameters from a dumped model
    param_values = load_parameter_values(args.load_path)

    if args.hide_all_except is not None:
        i = args.hide_all_except
        sdim = args.state_dim
        output_size = get_output_size(args.dataset)

        # Keep only the rows of the selected slab, zero everything else
        kept_rows = slice(i * sdim, (i + 1) * sdim)
        hidden = np.zeros((args.layers * sdim, output_size),
                          dtype=np.float32)
        output_w = param_values["/output_layer.W"]
        hidden[kept_rows, :] = output_w[kept_rows, :]
        param_values["/output_layer.W"] = hidden

    model = Model(cost)
    model.set_parameter_values(param_values)

    # Run a visualization
    kind = args.visualize
    if kind == "generate":
        visualize_generate(cost,
                           hidden_states, updates,
                           train_stream, valid_stream,
                           args)

    elif kind == "gates" and (gate_values is not None):
        if args.rnn_type == "lstm":
            visualize_gates_lstm(gate_values, hidden_states, updates,
                                 train_stream, valid_stream,
                                 args)
        elif args.rnn_type == "soft":
            visualize_gates_soft(gate_values, hidden_states, updates,
                                 train_stream, valid_stream,
                                 args)
        else:
            raise AssertionError(
                "gate visualization is not available for rnn_type %r"
                % (args.rnn_type,))

    elif kind == "states":
        visualize_states(hidden_states, updates,
                         train_stream, valid_stream,
                         args)

    elif kind == "gradients":
        visualize_gradients(hidden_states, updates,
                            train_stream, valid_stream,
                            args)

    elif kind == "jacobian":
        visualize_jacobian(hidden_states, updates,
                           train_stream, valid_stream,
                           args)

    elif kind == "presoft":
        visualize_presoft(cost,
                          hidden_states, updates,
                          train_stream, valid_stream,
                          args)

    elif kind == "matrices":
        visualize_matrices(args)

    elif kind == "trained_singular_values":
        visualize_singular_values(args)

    elif kind == "gradients_flow_pie":
        visualize_gradients_flow_pie(hidden_states, updates,
                                     args)

    else:
        raise AssertionError(
            "cannot run visualization %r (unknown name, or 'gates' "
            "requested without gate_values)" % (kind,))