def simple_assertions(self, updates, num_bricks=2, num_updates=4):
    """Shared assertions for simple tests."""
    assert len(updates) == num_updates
    assert all(is_shared_variable(u[0]) for u in updates)
    # This order is somewhat arbitrary and implementation-dependent
    means = set(u[0] for u in updates
                if has_roles(u[0], [BATCH_NORM_POPULATION_MEAN]))
    stdevs = set(u[0] for u in updates
                 if has_roles(u[0], [BATCH_NORM_POPULATION_STDEV]))
    assert means.isdisjoint(stdevs)
    assert len(set(get_brick(v) for v in means)) == num_bricks
    assert len(set(get_brick(v) for v in stdevs)) == num_bricks
def get_updates(variables):
    # This is fugly because we must get the batch stats from the
    # graph, so we get the ones that are *actually being used in
    # the computation* after graph transforms have been applied.
    from blocks.roles import has_roles
    updates = []
    variables = graph.deep_ancestors(variables)
    for stat, role in BatchNormalization.roles.items():
        batch_stats = [var for var in variables if has_roles(var, [role])]
        batch_stats = util.dedup(batch_stats, equal=util.equal_computations)

        batch_stats_by_brick = OrderedDict()
        for batch_stat in batch_stats:
            brick = batch_stat.tag.batch_normalization_brick
            batch_stats_by_brick.setdefault(brick, []).append(batch_stat)

        for brick, batch_stats in batch_stats_by_brick.items():
            population_stat = brick.population_stats[stat]
            if len(batch_stats) > 1:
                # makes sense for recurrent structures
                logger.warning(
                    "averaging multiple population statistic estimates "
                    "to update %s: %s" % (util.get_path(population_stat),
                                          batch_stats))
                batch_stat = T.stack(batch_stats).mean(axis=0)
            updates.append(
                (population_stat,
                 (1 - brick.alpha) * population_stat
                 + brick.alpha * batch_stat))
    return updates
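# The update appended above is an exponential moving average of the batch
# statistic. A minimal numpy sketch of that rule (an illustration, not
# part of the original code; alpha and the data are made up):
import numpy as np

alpha = 0.05  # hypothetical smoothing coefficient, cf. brick.alpha
population_mean = np.zeros(4)
for batch_mean in np.random.RandomState(0).randn(100, 4):
    # population <- (1 - alpha) * population + alpha * batch
    population_mean = (1 - alpha) * population_mean + alpha * batch_mean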
def tag_recurrent_dropout(self, variables, recurrent_dropout, rng=None,
                          **hyperparameters):
    from blocks.roles import OUTPUT, has_roles
    ancestors = graph.deep_ancestors(variables)
    for lstm in self.rnn.transitions:
        variables = [
            var for var in ancestors
            if (has_roles(var, [OUTPUT]) and
                lstm in var.tag.annotations and
                var.name.endswith("states"))]
        # Get one dropout mask for all time steps.  Use the very
        # first state to get the hidden state shape, else we get
        # graph cycles.
        initial_state = util.the(
            [var for var in variables if "initial_state" in var.name])
        mask = util.get_dropout_mask(initial_state.shape,
                                     recurrent_dropout, rng=rng)
        subsequent_states = [
            var for var in variables if "initial_state" not in var.name]
        graph.add_transform(
            subsequent_states,
            graph.DropoutTransform("recurrent_dropout", mask=mask),
            reason="regularization")
def __call__(self, variables):
    """Filter the given variables.

    Parameters
    ----------
    variables : list of :class:`~tensor.TensorVariable`

    """
    if self.roles:
        variables = [var for var in variables
                     if has_roles(var, self.roles, self.each_role)]
    if self.bricks is not None:
        filtered_variables = []
        for var in variables:
            var_brick = get_brick(var)
            if var_brick is None:
                continue
            for brick in self.bricks:
                if isclass(brick) and isinstance(var_brick, brick):
                    filtered_variables.append(var)
                    break
                elif isinstance(brick, Brick) and var_brick is brick:
                    filtered_variables.append(var)
                    break
        variables = filtered_variables
    if self.name:
        variables = [var for var in variables
                     if hasattr(var.tag, 'name') and
                     self.name == var.tag.name]
    if self.name_regex:
        variables = [var for var in variables
                     if hasattr(var.tag, 'name') and
                     re.match(self.name_regex, var.tag.name)]
    if self.theano_name:
        variables = [var for var in variables
                     if var.name is not None and
                     self.theano_name == var.name]
    if self.theano_name_regex:
        variables = [var for var in variables
                     if var.name is not None and
                     re.match(self.theano_name_regex, var.name)]
    if self.applications:
        filtered_variables = []
        for var in variables:
            var_application = get_application_call(var)
            if var_application is None:
                continue
            if (var_application.application in self.applications or
                    var_application.application.application in
                    self.applications):
                filtered_variables.append(var)
        variables = filtered_variables
    if self.call_id:
        variables = [
            var for var in variables
            if get_application_call(var) and
            get_application_call(var).metadata['call_id'] == self.call_id]
    return variables
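# A hedged usage sketch for the filter above: selecting the WEIGHT
# parameters of a small MLP with Blocks' VariableFilter, which is what
# this __call__ implements. Assumes a standard Blocks/Theano
# installation; the dims and variable names are illustrative.
from theano import tensor
from blocks.bricks import MLP, Tanh
from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph
from blocks.roles import WEIGHT

x = tensor.matrix('x')
mlp = MLP(activations=[Tanh(), Tanh()], dims=[10, 20, 5])
cg = ComputationGraph(mlp.apply(x))
weights = VariableFilter(roles=[WEIGHT])(cg.variables)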
def get_gradients(self, features, n_samples):
    """Perform inference and calculate gradients.

    Returns
    -------
    log_px : T.fvector
    log_psx : T.fvector
    gradients : OrderedDict
    """
    p_layers = self.p_layers
    q_layers = self.q_layers
    n_layers = len(p_layers)

    batch_size = features.shape[0]

    x = replicate_batch(features, n_samples)

    # Get Q-samples
    samples, log_p, log_q = self.sample_q(x)

    # Reshape and sum
    samples = unflatten_values(samples, batch_size, n_samples)
    log_p = unflatten_values(log_p, batch_size, n_samples)
    log_q = unflatten_values(log_q, batch_size, n_samples)

    log_p_all = sum(log_p)
    log_q_all = sum(log_q)

    # Approximate log p(x)
    log_px = logsumexp(log_p_all - log_q_all, axis=-1) \
        - tensor.log(n_samples)
    log_psx = (logsumexp((log_p_all - log_q_all) / 2, axis=-1)
               - tensor.log(n_samples)) * 2.

    # Calculate importance sampling weights
    w = self.importance_weights(log_p, log_q)

    wp = w.reshape((batch_size * n_samples,))
    wq = w.reshape((batch_size * n_samples,))
    wq = wq - (1. / n_samples)

    samples = flatten_values(samples, batch_size * n_samples)

    gradients = OrderedDict()
    for l in xrange(n_layers - 1):
        gradients = merge_gradients(
            gradients,
            p_layers[l].get_gradients(samples[l], samples[l + 1],
                                      weights=wp))
        gradients = merge_gradients(
            gradients,
            q_layers[l].get_gradients(samples[l + 1], samples[l],
                                      weights=wq))
    gradients = merge_gradients(
        gradients, p_layers[-1].get_gradients(samples[-1], weights=wp))

    if (self.l1reg > 0.) or (self.l2reg > 0.):
        reg_gradients = OrderedDict()
        params = Selector(self).get_parameters()
        for pname, param in params.iteritems():
            if has_roles(param, (WEIGHT,)):
                reg_cost = (self.l1reg * tensor.sum(abs(param)) +
                            self.l2reg * tensor.sum(param ** 2))
                reg_gradients[param] = tensor.grad(reg_cost, param)
        gradients = merge_gradients(gradients, reg_gradients)

    return log_px, log_psx, gradients
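# A minimal numpy sketch (illustrative, not from the original code) of
# the estimator computed symbolically above:
#     log p(x) ~= logsumexp(log_p - log_q, axis=-1) - log(n_samples)
# where log_p - log_q are log importance weights for samples from q.
# Uses scipy's logsumexp in place of the project's symbolic one.
import numpy as np
from scipy.special import logsumexp

rng = np.random.RandomState(0)
n_samples = 16
log_p = rng.randn(n_samples)  # hypothetical model log-probabilities
log_q = rng.randn(n_samples)  # hypothetical proposal log-probabilities
log_px = logsumexp(log_p - log_q) - np.log(n_samples)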
def __call__(self, variables):
    """Filter the given variables.

    Parameters
    ----------
    variables : list of :class:`~tensor.TensorVariable`

    """
    if self.roles:
        variables = [var for var in variables
                     if has_roles(var, self.roles, self.each_role)]
    if self.bricks is not None:
        filtered_variables = []
        for var in variables:
            var_brick = get_brick(var)
            if var_brick is None:
                continue
            for brick in self.bricks:
                if isclass(brick) and isinstance(var_brick, brick):
                    filtered_variables.append(var)
                    break
                elif isinstance(brick, Brick) and var_brick is brick:
                    filtered_variables.append(var)
                    break
        variables = filtered_variables
    if self.name:
        variables = [var for var in variables
                     if hasattr(var.tag, 'name') and
                     self.name == var.tag.name]
    if self.name_regex:
        variables = [var for var in variables
                     if hasattr(var.tag, 'name') and
                     re.match(self.name_regex, var.tag.name)]
    if self.theano_name:
        variables = [var for var in variables
                     if var.name is not None and
                     self.theano_name == var.name]
    if self.theano_name_regex:
        variables = [var for var in variables
                     if var.name is not None and
                     re.match(self.theano_name_regex, var.name)]
    if self.applications:
        variables = [var for var in variables
                     if get_application_call(var) and
                     get_application_call(var).application in
                     self.applications]
    return variables
def tag_convnet_dropout(outputs, rng=None, **kwargs):
    from blocks.roles import has_roles, OUTPUT
    cnn_outputs = OrderedDict()
    for var in theano.gof.graph.ancestors(outputs):
        if (has_roles(var, [OUTPUT]) and
                util.annotated_by_a(util.get_convolution_classes(), var)):
            cnn_outputs.setdefault(util.get_path(var), []).append(var)
    unique_outputs = []
    for path, vars in cnn_outputs.items():
        vars = util.dedup(vars, equal=util.equal_computations)
        unique_outputs.extend(vars)
    graph.add_transform(
        unique_outputs,
        graph.DropoutTransform("convnet_dropout", rng=rng),
        reason="regularization")
def tag_convnet_dropout(outputs, rng=None, **kwargs):
    from blocks.roles import has_roles, OUTPUT
    cnn_outputs = OrderedDict()
    for var in theano.gof.graph.ancestors(outputs):
        if (has_roles(var, [OUTPUT]) and
                util.annotated_by_a(util.get_convolution_classes(), var)):
            cnn_outputs.setdefault(util.get_path(var), []).append(var)
    unique_outputs = []
    for path, vars in cnn_outputs.items():
        vars = util.dedup(vars, equal=util.equal_computations)
        unique_outputs.append(util.the(vars))
    graph.add_transform(
        unique_outputs,
        graph.DropoutTransform("convnet_dropout", rng=rng),
        reason="regularization")
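# Both variants above rely on util.dedup to collapse variables that are
# equal as computations. A hedged sketch of what such a helper might
# look like (util.dedup itself is project code not shown here, so this
# is an assumption; quadratic for clarity):
def dedup(items, equal):
    unique = []
    for item in items:
        if not any(equal(item, seen) for seen in unique):
            unique.append(item)
    return unique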
def tag_attention_dropout(self, variables, rng=None, **hyperparameters):
    from blocks.roles import INPUT, has_roles
    bricks_ = [
        brick for brick in util.all_bricks([self.patch_transform])
        if isinstance(brick, (bricks.Linear,
                              conv2d.Convolutional,
                              conv3d.Convolutional))]
    variables = [
        var for var in graph.deep_ancestors(variables)
        if (has_roles(var, [INPUT]) and
            any(brick in var.tag.annotations for brick in bricks_))]
    graph.add_transform(
        variables,
        graph.DropoutTransform("attention_dropout", rng=rng),
        reason="regularization")
def get_gradients(self, features, n_samples):
    """Perform inference and calculate gradients.

    Returns
    -------
    log_px : T.fvector
    log_psx : T.fvector
    gradients : OrderedDict
    """
    p_layers = self.p_layers
    q_layers = self.q_layers
    n_layers = len(p_layers)

    batch_size = features.shape[0]

    x = replicate_batch(features, n_samples)

    # Get Q-samples
    samples, log_p, log_q = self.sample_q(x)

    # Reshape and sum
    samples = unflatten_values(samples, batch_size, n_samples)
    log_p = unflatten_values(log_p, batch_size, n_samples)
    log_q = unflatten_values(log_q, batch_size, n_samples)

    log_p_all = sum(log_p)
    log_q_all = sum(log_q)

    # Approximate log p(x)
    log_px_bound = log_p_all[:, 0] - log_q_all[:, 0]
    log_px = logsumexp(log_p_all - log_q_all, axis=-1) \
        - tensor.log(n_samples)
    log_psx = (logsumexp((log_p_all - log_q_all) / 2, axis=-1)
               - tensor.log(n_samples)) * 2.

    # Calculate IS weights
    w = self.importance_weights(log_p, log_q)

    wp = w.reshape((batch_size * n_samples,))
    wq = w.reshape((batch_size * n_samples,))
    wq = wq - (1. / n_samples)

    samples = flatten_values(samples, batch_size * n_samples)

    gradients = OrderedDict()
    for l in xrange(n_layers - 1):
        gradients = merge_gradients(
            gradients,
            p_layers[l].get_gradients(samples[l], samples[l + 1],
                                      weights=wp))
        gradients = merge_gradients(
            gradients,
            q_layers[l].get_gradients(samples[l + 1], samples[l],
                                      weights=wq))
    gradients = merge_gradients(
        gradients, p_layers[-1].get_gradients(samples[-1], weights=wp))

    if (self.l1reg > 0.) or (self.l2reg > 0.):
        reg_gradients = OrderedDict()
        params = Selector(self).get_parameters()
        for pname, param in params.iteritems():
            if has_roles(param, (WEIGHT,)):
                reg_cost = (self.l1reg * tensor.sum(abs(param)) +
                            self.l2reg * tensor.sum(param ** 2))
                reg_gradients[param] = tensor.grad(reg_cost, param)
        gradients = merge_gradients(gradients, reg_gradients)

    self.log_p_bound = log_px_bound
    self.log_p = log_px
    self.log_ph = log_psx

    return log_px, log_psx, gradients
def parameters(self):
    return [var for var in self.shared_variables
            if has_roles(var, [PARAMETER])]
def replace(self, replacements):
    """Replace certain variables in the computation graph.

    Parameters
    ----------
    replacements : dict
        The mapping from variables to be replaced to the corresponding
        substitutes.

    Examples
    --------
    >>> import theano
    >>> from theano import tensor, function
    >>> x = tensor.scalar('x')
    >>> y = x + 2
    >>> z = y + 3
    >>> a = z + 5

    Let's suppose we have dependent replacements like

    >>> replacements = {y: x * 2, z: y * 3}
    >>> cg = ComputationGraph([a])
    >>> theano.pprint(a)  # doctest: +NORMALIZE_WHITESPACE
    '(((x + TensorConstant{2}) + TensorConstant{3}) + TensorConstant{5})'
    >>> cg_new = cg.replace(replacements)
    >>> theano.pprint(
    ...     cg_new.outputs[0])  # doctest: +NORMALIZE_WHITESPACE
    '(((x * TensorConstant{2}) * TensorConstant{3}) + TensorConstant{5})'

    First two sums turned into multiplications

    >>> float(function(cg_new.inputs, cg_new.outputs)(3.)[0])
    23.0

    """
    # Due to theano specifics we have to make one replacement at a time
    replacements = OrderedDict(replacements)

    outputs_cur = self.outputs

    # `replacements` with previous replacements applied. We have to
    # track variables in the new graph corresponding to original
    # replacements.
    replacement_keys_cur = []
    replacement_vals_cur = []

    # Sort `replacements` in topological order;
    # variables in self.variables are in topological order.
    remaining_replacements = replacements.copy()
    for variable in self.variables:
        if variable in replacements:
            if has_roles(variable, [AUXILIARY]):
                warnings.warn(
                    "replace method was asked to replace a variable ({}) "
                    "that is an auxiliary variable.".format(variable))
            replacement_keys_cur.append(variable)
            # self.variables should not contain duplicates,
            # otherwise pop() may fail.
            replacement_vals_cur.append(
                remaining_replacements.pop(variable))

    # if remaining_replacements is not empty
    if remaining_replacements:
        warnings.warn(
            "replace method was asked to replace a variable(s) ({}) "
            "that is not a part of the computational "
            "graph.".format(str(remaining_replacements.keys())))

    # Replace step-by-step in topological order
    while replacement_keys_cur:
        replace_what = replacement_keys_cur[0]
        replace_by = replacement_vals_cur[0]
        # We also want to make changes in future replacements
        outputs_new = theano.clone(
            outputs_cur + replacement_keys_cur[1:] +
            replacement_vals_cur[1:],
            replace={replace_what: replace_by})
        # Reconstruct outputs, keys, and values
        outputs_cur = outputs_new[:len(outputs_cur)]
        replacement_keys_cur = outputs_new[
            len(outputs_cur):len(outputs_cur) +
            len(replacement_keys_cur) - 1]
        replacement_vals_cur = outputs_new[
            len(outputs_cur) + len(replacement_keys_cur):]

    return ComputationGraph(outputs_cur)
def auxiliary_variables(self):
    return [var for var in self.variables if has_roles(var, [AUXILIARY])]
def construct_monitors(algorithm, task, model, graphs, outputs, updates,
                       monitor_options, n_spatial_dims, hyperparameters,
                       **kwargs):
    from blocks.extensions.monitoring import (
        TrainingDataMonitoring, DataStreamMonitoring)

    extensions = []

    if "steps" in monitor_options:
        step_channels = []
        step_channels.extend([
            algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name)
            for name, param in model.get_parameter_dict().items()])
        step_channels.append(
            algorithm.total_step_norm.copy(name="total_step_norm"))
        step_channels.append(
            algorithm.total_gradient_norm.copy(name="total_gradient_norm"))
        logger.warning("constructing training data monitor")
        extensions.append(TrainingDataMonitoring(
            step_channels, prefix="train", after_epoch=True))

    if "parameters" in monitor_options:
        data_independent_channels = []
        for parameter in graphs["train"].parameters:
            if parameter.name in "gamma beta W b".split():
                quantity = parameter.norm(2)
                quantity.name = "parameter.norm:%s" % util.get_path(parameter)
                data_independent_channels.append(quantity)
        for key in "location_std scale_std".split():
            data_independent_channels.append(
                hyperparameters[key].copy(name="parameter:%s" % key))
        extensions.append(DataStreamMonitoring(
            data_independent_channels, data_stream=None, after_epoch=True))

    for which_set in "train test".split():
        channels = []
        channels.extend(outputs[which_set][key] for key in
                        "cost emitter_cost excursion_cost".split())
        channels.extend(outputs[which_set][key]
                        for key in task.monitor_outputs())
        channels.append(outputs[which_set]["savings"]
                        .mean().copy(name="mean_savings"))

        if "theta" in monitor_options:
            for key in "raw_location raw_scale".split():
                for stat in "mean var".split():
                    channels.append(
                        getattr(outputs[which_set][key], stat)(axis=1)
                        .copy(name="%s.%s" % (key, stat)))

        if which_set == "train":
            if "activations" in monitor_options:
                from blocks.roles import has_roles, OUTPUT
                cnn_outputs = OrderedDict()
                for var in theano.gof.graph.ancestors(
                        graphs[which_set].outputs):
                    if (has_roles(var, [OUTPUT]) and util.annotated_by_a(
                            util.get_convolution_classes(), var)):
                        cnn_outputs.setdefault(
                            util.get_path(var), []).append(var)
                for path, vars in cnn_outputs.items():
                    vars = util.dedup(vars, equal=util.equal_computations)
                    for i, var in enumerate(vars):
                        channels.append(var.mean().copy(
                            name="activation[%i].mean:%s" % (i, path)))

        if "batch_normalization" in monitor_options:
            errors = []
            for population_stat, update in updates[which_set]:
                if population_stat.name.startswith("population"):
                    # this is a super robust way to get the
                    # corresponding batch statistic from the
                    # exponential moving average expression
                    batch_stat = update.owner.inputs[1].owner.inputs[1]
                    errors.append(((population_stat - batch_stat) ** 2)
                                  .mean())
            if errors:
                channels.append(T.stack(errors).mean().copy(
                    name="population_statistic_mse"))

        logger.warning("constructing %s monitor" % which_set)
        extensions.append(DataStreamMonitoring(
            channels, prefix=which_set, after_epoch=True,
            data_stream=task.get_stream(which_set, monitor=True)))

    return extensions
def construct_monitors(algorithm, task, model, graphs, outputs, plot_url,
                       hyperparameters, **kwargs):
    from blocks.extensions.monitoring import (
        TrainingDataMonitoring, DataStreamMonitoring)
    from patchmonitor import PatchMonitoring, VideoPatchMonitoring

    extensions = []

    if True:
        extensions.append(TrainingDataMonitoring(
            [algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name)
             for name, param in model.get_parameter_dict().items()],
            prefix="train", after_epoch=True))

    if True:
        data_independent_channels = []
        for parameter in graphs["train"].parameters:
            if parameter.name in "gamma beta W b".split():
                quantity = parameter.norm(2)
                quantity.name = "parameter.norm:%s" % util.get_path(parameter)
                data_independent_channels.append(quantity)
        extensions.append(DataStreamMonitoring(
            data_independent_channels, data_stream=None, after_epoch=True))

    for which_set in "train valid test".split():
        channels = []
        channels.extend(outputs[which_set][key] for key in "cost".split())
        channels.extend(outputs[which_set][key]
                        for key in task.monitor_outputs())

        if which_set == "train":
            if True:
                from blocks.roles import has_roles, OUTPUT
                cnn_outputs = OrderedDict()
                for var in theano.gof.graph.ancestors(
                        graphs[which_set].outputs):
                    if (has_roles(var, [OUTPUT]) and util.annotated_by_a(
                            util.get_convolution_classes(), var)):
                        cnn_outputs.setdefault(
                            util.get_path(var), []).append(var)
                for path, vars in cnn_outputs.items():
                    vars = util.dedup(vars, equal=util.equal_computations)
                    for i, var in enumerate(vars):
                        channels.append(var.mean().copy(
                            name="activation[%i].mean:%s" % (i, path)))
            channels.append(
                algorithm.total_gradient_norm.copy(
                    name="total_gradient_norm"))

        extensions.append(DataStreamMonitoring(
            channels, prefix=which_set, after_epoch=True,
            data_stream=task.get_stream(which_set, monitor=True)))

    if plot_url:
        plot_channels = []
        plot_channels.extend(task.plot_channels())
        plot_channels.append(["train_cost"])
        #plot_channels.append(["train_%s" % step_channel.name for step_channel in step_channels])

        from blocks.extras.extensions.plot import Plot
        extensions.append(Plot(hyperparameters["name"],
                               channels=plot_channels,
                               after_epoch=True, server_url=plot_url))

    return extensions
def parameters(self):
    all_parameters = list(chain(
        *[i.parameters for i in self.ops if hasattr(i, 'parameters')]))
    return [i for i in all_parameters if has_roles(i, PARAMETER)]
def parameters(self):
    return [i for i in self._configuration.parameters
            if has_roles(i, PARAMETER)]
def _initialize(self):
    for param in self.params:
        if has_roles(param, [WEIGHT]):
            self.weights_init.initialize(param, self.rng)
        elif has_roles(param, [BIAS]):
            self.biases_init.initialize(param, self.rng)
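# A hedged sketch of how a parameter acquires the roles this method
# dispatches on, using Blocks' public add_role helper; the shared
# variable here is illustrative, not from the original code.
import numpy
import theano
from blocks.roles import add_role, has_roles, WEIGHT

W = theano.shared(numpy.zeros((5, 3)), name='W')
add_role(W, WEIGHT)
assert has_roles(W, [WEIGHT])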
def construct_monitors(algorithm, task, model, graphs, outputs, updates,
                       monitor_options, n_spatial_dims, plot_url,
                       hyperparameters, patchmonitor_interval, **kwargs):
    from blocks.extensions.monitoring import (
        TrainingDataMonitoring, DataStreamMonitoring)

    extensions = []

    if "steps" in monitor_options:
        step_channels = []
        step_channels.extend([
            algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name)
            for name, param in model.get_parameter_dict().items()])
        step_channels.append(
            algorithm.total_step_norm.copy(name="total_step_norm"))
        step_channels.append(
            algorithm.total_gradient_norm.copy(name="total_gradient_norm"))

        from extensions import Compressor
        for step_rule in algorithm.step_rule.components:
            if isinstance(step_rule, Compressor):
                step_channels.append(
                    step_rule.norm.copy(name="compressor.norm"))
                step_channels.append(
                    step_rule.newnorm.copy(name="compressor.newnorm"))
                step_channels.append(
                    step_rule.median.copy(name="compressor.median"))
                step_channels.append(
                    step_rule.ratio.copy(name="compressor.ratio"))

        step_channels.extend(
            outputs["train"][key] for key in
            "cost emitter_cost excursion_cost cross_entropy error_rate"
            .split())
        step_channels.extend(util.uniqueify_names_last_resort(util.dedup(
            (var.mean().copy(name="bn_stat:%s" % util.get_path(var))
             for var in graph.deep_ancestors([outputs["train"]["cost"]])
             if hasattr(var.tag, "batch_normalization_brick")),
            equal=util.equal_computations)))
        logger.warning("constructing training data monitor")
        extensions.append(TrainingDataMonitoring(
            step_channels, prefix="iteration", after_batch=True))

    if "parameters" in monitor_options:
        data_independent_channels = []
        for parameter in graphs["train"].parameters:
            if parameter.name in "gamma beta W b".split():
                quantity = parameter.norm(2)
                quantity.name = "parameter.norm:%s" % util.get_path(parameter)
                data_independent_channels.append(quantity)
        for key in "location_std scale_std".split():
            data_independent_channels.append(
                hyperparameters[key].copy(name="parameter:%s" % key))
        extensions.append(DataStreamMonitoring(
            data_independent_channels, data_stream=None, after_epoch=True))

    for which_set in "train valid test".split():
        channels = []
        channels.extend(outputs[which_set][key] for key in
                        "cost emitter_cost excursion_cost".split())
        channels.extend(outputs[which_set][key]
                        for key in task.monitor_outputs())
        channels.append(outputs[which_set]["savings"]
                        .mean().copy(name="mean_savings"))

        if "theta" in monitor_options:
            for key in "true_scale raw_location raw_scale".split():
                for stat in "mean var".split():
                    channels.append(
                        getattr(outputs[which_set][key], stat)(axis=1)
                        .copy(name="%s.%s" % (key, stat)))

        if which_set == "train":
            if "activations" in monitor_options:
                from blocks.roles import has_roles, OUTPUT
                cnn_outputs = OrderedDict()
                for var in theano.gof.graph.ancestors(
                        graphs[which_set].outputs):
                    if (has_roles(var, [OUTPUT]) and util.annotated_by_a(
                            util.get_convolution_classes(), var)):
                        cnn_outputs.setdefault(
                            util.get_path(var), []).append(var)
                for path, vars in cnn_outputs.items():
                    vars = util.dedup(vars, equal=util.equal_computations)
                    for i, var in enumerate(vars):
                        channels.append(var.mean().copy(
                            name="activation[%i].mean:%s" % (i, path)))

        if "batch_normalization" in monitor_options:
            errors = []
            for population_stat, update in updates[which_set]:
                if population_stat.name.startswith("population"):
                    # this is a super robust way to get the
                    # corresponding batch statistic from the
                    # exponential moving average expression
                    batch_stat = update.owner.inputs[1].owner.inputs[1]
                    errors.append(((population_stat - batch_stat) ** 2)
                                  .mean())
            if errors:
                channels.append(T.stack(errors).mean().copy(
                    name="population_statistic_mse"))

        logger.warning("constructing %s monitor" % which_set)
        extensions.append(DataStreamMonitoring(
            channels, prefix=which_set, after_epoch=True,
            data_stream=task.get_stream(which_set, monitor=True)))

    if "patches" in monitor_options:
        from patchmonitor import PatchMonitoring, VideoPatchMonitoring

        patchmonitor = None
        if n_spatial_dims == 2:
            patchmonitor_klass = PatchMonitoring
        elif n_spatial_dims == 3:
            patchmonitor_klass = VideoPatchMonitoring

        if patchmonitor_klass:
            for which in "train valid".split():
                patch = outputs[which]["patch"]
                patch = patch.dimshuffle(1, 0, *range(2, patch.ndim))
                patch_extractor = theano.function(
                    [outputs[which][key] for key in "x x_shape".split()],
                    [outputs[which][key] for key in
                     "raw_location raw_scale".split()] + [patch])
                patchmonitor = patchmonitor_klass(
                    save_to="%s_patches_%s" % (hyperparameters["name"],
                                               which),
                    data_stream=task.get_stream(which, shuffle=False,
                                                num_examples=10),
                    every_n_batches=patchmonitor_interval,
                    extractor=patch_extractor,
                    map_to_input_space=attention.static_map_to_input_space)
                patchmonitor.save_patches("patchmonitor_test.png")
                extensions.append(patchmonitor)

    if plot_url:
        plot_channels = []
        plot_channels.extend(task.plot_channels())
        plot_channels.append(["train_cost"])
        #plot_channels.append(["train_%s" % step_channel.name for step_channel in step_channels])

        from blocks.extras.extensions.plot import Plot
        extensions.append(Plot(hyperparameters["name"],
                               channels=plot_channels,
                               after_epoch=True, server_url=plot_url))

    return extensions