def __init__(self, step_rule=None, gradients=None, known_grads=None,
             consider_constant=None, on_unused_sources='raise',
             theano_func_kwargs=None, **kwargs):
    """Build the gradient, step and norm expressions for the algorithm.

    Parameters
    ----------
    step_rule : object, optional
        Object providing ``compute_steps``; a fresh ``Scale()`` is used
        when the argument is falsy.
    gradients : dict, optional
        Precomputed gradients keyed by parameter. When given, its keys
        become the default ``parameters`` keyword for the parent class.
    known_grads, consider_constant : optional
        Forwarded to ``tensor.grad``; only allowed when `gradients` is
        not supplied.
    on_unused_sources : str
        Stored as-is on the instance.
    theano_func_kwargs : dict, optional
        Stored on the instance; an empty dict replaces ``None``.

    Raises
    ------
    ValueError
        If `gradients` is given together with `known_grads` or
        `consider_constant`.

    """
    if gradients:
        kwargs.setdefault("parameters", gradients.keys())
    super(GradientDescent, self).__init__(**kwargs)

    self.gradients = gradients
    if self.gradients:
        # Explicit gradients make the tensor.grad options meaningless.
        if known_grads:
            raise ValueError("known_grads has no effect when gradients "
                             "are passed in")
        if consider_constant is not None:
            raise ValueError("consider_constant has no effect when "
                             "gradients are passed in")
    else:
        logger.info("Taking the cost gradient")
        cost_gradients = tensor.grad(
            self.cost, self.parameters,
            known_grads=known_grads,
            consider_constant=consider_constant)
        self.gradients = dict(equizip(self.parameters, cost_gradients))
        logger.info("The cost gradient computation graph is built")

    self.step_rule = step_rule or Scale()

    gradient_norm = l2_norm(self.gradients.values())
    self.total_gradient_norm = gradient_norm.copy(name="total_gradient_norm")

    computed = self.step_rule.compute_steps(self.gradients)
    self.steps, self.step_rule_updates = computed

    step_norm = l2_norm(self.steps.values())
    self.total_step_norm = step_norm.copy(name="total_step_norm")

    self.on_unused_sources = on_unused_sources
    if theano_func_kwargs is None:
        theano_func_kwargs = dict()
    self.theano_func_kwargs = theano_func_kwargs
def __init__(self, step_rule=None, gradients=None, known_grads=None,
             consider_constant=None, on_unused_sources='raise',
             theano_func_kwargs=None, **kwargs):
    """Set up gradients, parameter steps and their monitored norms.

    Parameters
    ----------
    step_rule : object, optional
        Supplies ``compute_steps``; defaults to ``Scale()`` when falsy.
    gradients : dict, optional
        Parameter -> gradient mapping. If omitted (or empty) the
        gradients are derived from ``self.cost`` with ``tensor.grad``.
    known_grads, consider_constant : optional
        Passed through to ``tensor.grad``; rejected when `gradients`
        is supplied.
    on_unused_sources : str
        Kept on the instance unchanged.
    theano_func_kwargs : dict, optional
        Kept on the instance; ``None`` becomes an empty dict.

    Raises
    ------
    ValueError
        When `known_grads` or `consider_constant` accompany `gradients`.

    """
    if gradients:
        kwargs.setdefault("parameters", gradients.keys())
    super(GradientDescent, self).__init__(**kwargs)

    self.gradients = gradients
    have_gradients = bool(self.gradients)
    if have_gradients and known_grads:
        raise ValueError("known_grads has no effect when gradients "
                         "are passed in")
    if have_gradients and consider_constant is not None:
        raise ValueError("consider_constant has no effect when "
                         "gradients are passed in")
    if not have_gradients:
        logger.info("Taking the cost gradient")
        grads = tensor.grad(self.cost, self.parameters,
                            known_grads=known_grads,
                            consider_constant=consider_constant)
        self.gradients = dict(equizip(self.parameters, grads))
        logger.info("The cost gradient computation graph is built")

    if step_rule:
        self.step_rule = step_rule
    else:
        self.step_rule = Scale()

    self.total_gradient_norm = l2_norm(
        self.gradients.values()).copy(name="total_gradient_norm")
    steps_and_updates = self.step_rule.compute_steps(self.gradients)
    self.steps, self.step_rule_updates = steps_and_updates
    self.total_step_norm = l2_norm(
        self.steps.values()).copy(name="total_step_norm")

    self.on_unused_sources = on_unused_sources
    self.theano_func_kwargs = (
        dict() if theano_func_kwargs is None else theano_func_kwargs)
def __init__(self, step_rule=None, gradients=None, known_grads=None,
             **kwargs):
    """Build the gradient and step expressions for the algorithm.

    Parameters
    ----------
    step_rule : object, optional
        Supplies ``compute_steps``; ``Scale()`` is used when falsy.
    gradients : dict, optional
        Parameter -> gradient mapping. Its keys become the default
        ``params`` keyword for the parent class. When missing or empty,
        gradients of ``self.cost`` are taken with ``tensor.grad``.
    known_grads : optional
        Forwarded to ``tensor.grad``; rejected if `gradients` is given.

    Raises
    ------
    ValueError
        If both `gradients` and `known_grads` are supplied.

    """
    if gradients:
        kwargs.setdefault("params", gradients.keys())
    super(GradientDescent, self).__init__(**kwargs)

    self.gradients = gradients
    if self.gradients:
        if known_grads:
            raise ValueError("known_grads has no effect when gradients "
                             "are passed in")
    else:
        logger.info("Taking the cost gradient")
        cost_gradients = tensor.grad(self.cost, self.params,
                                     known_grads=known_grads)
        self.gradients = dict(equizip(self.params, cost_gradients))
        logger.info("The cost gradient computation graph is built")

    self.step_rule = step_rule or Scale()

    gradient_norm = l2_norm(self.gradients.values())
    self.total_gradient_norm = named_copy(gradient_norm,
                                          "total_gradient_norm")
    computed = self.step_rule.compute_steps(self.gradients)
    self.steps, self.step_rule_updates = computed
    step_norm = l2_norm(self.steps.values())
    self.total_step_norm = named_copy(step_norm, "total_step_norm")
def __init__(self, step_rule=None, gradients=None, known_grads=None,
             **kwargs):
    """Prepare gradients, steps and the two monitored norms.

    Parameters
    ----------
    step_rule : object, optional
        Provides ``compute_steps``; defaults to ``Scale()`` when falsy.
    gradients : dict, optional
        Precomputed gradients keyed by parameter; when absent they are
        computed from ``self.cost`` and ``self.params``.
    known_grads : optional
        Passed to ``tensor.grad``; not allowed together with `gradients`.

    Raises
    ------
    ValueError
        If `known_grads` is supplied alongside `gradients`.

    """
    if gradients:
        kwargs.setdefault("params", gradients.keys())
    super(GradientDescent, self).__init__(**kwargs)

    self.gradients = gradients
    gradients_given = bool(self.gradients)
    if gradients_given and known_grads:
        raise ValueError("known_grads has no effect when gradients "
                         "are passed in")
    if not gradients_given:
        logger.info("Taking the cost gradient")
        self.gradients = dict(equizip(
            self.params,
            tensor.grad(self.cost, self.params, known_grads=known_grads)))
        logger.info("The cost gradient computation graph is built")

    if step_rule:
        self.step_rule = step_rule
    else:
        self.step_rule = Scale()

    self.total_gradient_norm = named_copy(
        l2_norm(self.gradients.values()), "total_gradient_norm")
    steps_and_updates = self.step_rule.compute_steps(self.gradients)
    self.steps, self.step_rule_updates = steps_and_updates
    self.total_step_norm = named_copy(
        l2_norm(self.steps.values()), "total_step_norm")
def test_l2_norm():
    """Check ``l2_norm`` on flat and nested inputs, plain and squared."""
    # (input tensors, expected non-squared norm), checked in order.
    root_cases = [
        ([2], 2.0),
        ([3, 4], 5.0),
        ([3, [1, 2]], 14.0**0.5),
        ([3, [1, 2], [[1, 2], [3, 4]]], 44.0**0.5),
    ]
    for tensors, expected in root_cases:
        assert_allclose(l2_norm(tensors).eval(), expected)

    # With squared=True the square root is not taken.
    squared_norm = l2_norm([3, [1, 2], [[1, 2], [3, 4]]], squared=True)
    assert_allclose(squared_norm.eval(), 44.0)
def test_l2_norm():
    """Verify ``l2_norm`` for scalars, vectors and mixed nesting."""
    def norm_of(tensors, **options):
        # Evaluate the symbolic norm to a concrete value.
        return l2_norm(tensors, **options).eval()

    assert_allclose(norm_of([2]), 2.0)
    assert_allclose(norm_of([3, 4]), 5.0)
    assert_allclose(norm_of([3, [1, 2]]), 14.0 ** 0.5)
    assert_allclose(norm_of([3, [1, 2], [[1, 2], [3, 4]]]), 44.0 ** 0.5)
    # squared=True yields the sum of squares itself.
    assert_allclose(
        norm_of([3, [1, 2], [[1, 2], [3, 4]]], squared=True), 44.0)
def __init__(self, cost, params, subtensor_params=None, step_rule=None,
             *args, **kwargs):
    """Build gradients where some parameters are replaced by subtensors.

    Parameters
    ----------
    cost : variable
        The cost to differentiate.
    params : list
        The full parameters of the model.
    subtensor_params : dict, optional
        Maps a full parameter to a tuple
        ``(subparam, canonized_indices, outputs, indices)``. The
        gradient is taken with respect to each entry of ``outputs``
        instead of the full parameter and then re-assembled into a
        single gradient stored under ``subparam``. Defaults to an
        empty mapping.
    step_rule : object, optional
        Provides ``compute_steps``; ``Scale()`` is used when falsy.

    Notes
    -----
    Extra positional ``*args`` are accepted but not forwarded to the
    parent class.

    """
    # A fresh dict per call: the mapping is stored on the instance, so
    # a shared ``{}`` default would be aliased by every instance.
    if subtensor_params is None:
        subtensor_params = {}

    full_params = params
    self.subtensor_params = subtensor_params

    # For each LookupTable, we replace it by its subtensors appearing in
    # the graph.
    params = [param for param in full_params
              if param not in subtensor_params]
    for _, _, outputs, _ in subtensor_params.values():
        params.extend(outputs)

    super(GradientDescent, self).__init__(cost=cost, params=params,
                                          **kwargs)
    # self.params contains the list of outputs of the lookup tables.

    logger.info("Taking the cost gradient")
    self.gradients = dict(
        equizip(self.params, tensor.grad(self.cost, self.params)))

    # We combine the gradients extracted from the same parameter.
    for param, (subparam, canonized_indices,
                outputs, indices) in subtensor_params.items():
        # This is necessary if we want to compute the l2 norm correctly
        # (e.g. for StepClipping): accumulate every partial gradient into
        # one zero buffer shaped like the full parameter.
        tmp = shared_floatx(param.get_value() * 0.)
        for (output, indice) in zip(outputs, indices):
            tmp = tensor.inc_subtensor(tmp[indice], self.gradients[output])
            del self.gradients[output]
        self.gradients[subparam] = tmp[canonized_indices]

    # We remove the subtensors from the list of parameters.
    self.params = full_params

    logger.info("The cost gradient computation graph is built")

    self.step_rule = step_rule if step_rule else Scale()

    self.total_gradient_norm = named_copy(l2_norm(self.gradients.values()),
                                          "total_gradient_norm")
    self.steps, self.step_rule_updates = (
        self.step_rule.compute_steps(self.gradients))
    self.total_step_norm = named_copy(l2_norm(self.steps.values()),
                                      "total_step_norm")
def compute_steps(self, previous_steps):
    """Rescale steps using a threshold adapted from log-norm statistics.

    Running averages of ``log(norm)`` and ``log(norm) ** 2`` are kept.
    The threshold is ``exp(mean + stdevs * std)`` of the log-norm. Steps
    whose total norm reaches the threshold are rescaled so that their
    norm equals the clip level, which is ``exp(mean)`` when
    ``self.clip_to_mean`` is set and the threshold itself otherwise.

    Parameters
    ----------
    previous_steps : OrderedDict
        Parameter -> step mapping.

    Returns
    -------
    steps : OrderedDict
        The steps multiplied by the clipping factor.
    updates : list
        ``(shared_variable, new_value)`` pairs for the step counter, the
        two running averages, the threshold and the clip level.

    """
    adapt_steps_up = self.adapt_steps + 1.0

    # This will quickly converge the estimate for the mean: the decay
    # used is min(decay, t / (t + 1)), which starts at 0.
    cut_rho_mean = tensor.minimum(self.decay,
                                  self.adapt_steps / adapt_steps_up)
    if self.quick_variance_convergence:
        cut_rho_mean2 = cut_rho_mean
    else:
        cut_rho_mean2 = self.decay

    # Build the norm once and reuse it for the logarithm.
    gnorm = l2_norm(previous_steps.values())
    gnorm_log = tensor.log(gnorm)

    # Here we quickly converge the mean.
    gnorm_log_ave_up = (cut_rho_mean * self.gnorm_log_ave +
                        (1. - cut_rho_mean) * gnorm_log)
    # This can wait as it starts from 0 anyways!
    gnorm_log2_ave_up = (cut_rho_mean2 * self.gnorm_log2_ave +
                         (1. - cut_rho_mean2) * (gnorm_log ** 2))

    # The variance estimate is floored at 0 before the square root.
    clip_threshold_up = tensor.exp(
        gnorm_log_ave_up +
        tensor.sqrt(tensor.maximum(0.0,
                                   gnorm_log2_ave_up -
                                   gnorm_log_ave_up ** 2)) * self.stdevs)

    if self.clip_to_mean:
        clip_level_up = tensor.exp(gnorm_log_ave_up)
    else:
        clip_level_up = clip_threshold_up

    multiplier = tensor.switch(gnorm < clip_threshold_up,
                               1, clip_level_up / gnorm)
    steps = OrderedDict(
        (parameter, step * multiplier)
        for parameter, step in previous_steps.items())

    return steps, [(self.adapt_steps, adapt_steps_up),
                   (self.gnorm_log_ave, gnorm_log_ave_up),
                   (self.gnorm_log2_ave, gnorm_log2_ave_up),
                   (self.clip_threshold, clip_threshold_up),
                   (self.clip_level, clip_level_up)]
def compute_steps(self, previous_steps):
    """Rescale the steps so their total L2 norm stays below a threshold.

    Parameters
    ----------
    previous_steps : OrderedDict
        Parameter -> step mapping.

    Returns
    -------
    steps : OrderedDict
        The rescaled steps, or `previous_steps` itself when the instance
        has no ``threshold`` attribute.
    updates : list
        Always empty.

    """
    if not hasattr(self, "threshold"):
        # Keep the (steps, updates) contract on the pass-through path
        # too: callers unpack the result into two values.
        return previous_steps, []
    norm = l2_norm(previous_steps.values())
    multiplier = tensor.switch(norm < self.threshold,
                               1, self.threshold / norm)
    steps = OrderedDict((parameter, step * multiplier)
                        for parameter, step in previous_steps.items())
    return steps, []
def __init__(self, step_rule=None, gradients=None, **kwargs):
    """Set up gradients, steps and the monitored norms.

    Parameters
    ----------
    step_rule : object, optional
        Provides ``compute_steps``; ``SteepestDescent()`` is used when
        the argument is falsy.
    gradients : dict, optional
        Parameter -> gradient mapping; when missing or empty it is
        computed from ``self.cost`` and ``self.params``.

    """
    super(GradientDescent, self).__init__(**kwargs)

    self.gradients = gradients
    if not self.gradients:
        logger.info("Taking the cost gradient")
        cost_gradients = tensor.grad(self.cost, self.params)
        self.gradients = dict(zip(self.params, cost_gradients))
        logger.info("The cost gradient computation graph is built")

    self.step_rule = step_rule or SteepestDescent()

    gradient_norm = l2_norm(self.gradients.values())
    self.total_gradient_norm = named_copy(gradient_norm,
                                          "total_gradient_norm")
    computed = self.step_rule.compute_steps(self.gradients)
    self.steps, self.step_rule_updates = computed
    step_norm = l2_norm(self.steps.values())
    self.total_step_norm = named_copy(step_norm, "total_step_norm")
def compute_step(self, param, previous_step):
    """Replace a non-finite step by a scaled copy of the parameter.

    Parameters
    ----------
    param : variable
        The parameter the step belongs to.
    previous_step : variable
        The step proposed so far.

    Returns
    -------
    step : variable
        ``self.scaler * param`` where the L2 norm of `previous_step` is
        NaN or infinite, `previous_step` otherwise.
    updates : list
        Always empty.

    """
    norm = l2_norm([previous_step])
    norm_is_nan = tensor.isnan(norm)
    norm_is_inf = tensor.isinf(norm)
    not_finite = tensor.or_(norm_is_nan, norm_is_inf)
    fallback = self.scaler * param
    return tensor.switch(not_finite, fallback, previous_step), []
def compute_steps(self, gradients):
    """Rescale the gradients so their total L2 norm stays below a threshold.

    Parameters
    ----------
    gradients : OrderedDict
        Parameter -> gradient mapping.

    Returns
    -------
    steps : OrderedDict
        The rescaled gradients, or `gradients` itself when the instance
        has no ``threshold`` attribute.
    updates : list
        Always empty.

    """
    if not hasattr(self, 'threshold'):
        # Keep the (steps, updates) contract on the pass-through path
        # too: callers unpack the result into two values.
        return gradients, []
    norm = l2_norm(gradients.values())
    multiplier = tensor.switch(norm < self.threshold,
                               1, self.threshold / norm)
    steps = OrderedDict((param, gradient * multiplier)
                        for param, gradient in gradients.items())
    return steps, []
def compute_steps(self, steps):
    """Pass the steps through while tracking their direction change.

    A zero-initialised shared buffer is created per parameter and
    scheduled to receive the current step, and ``self.cosine`` is set to
    the cosine between the current steps and the buffered ones.

    Parameters
    ----------
    steps : OrderedDict
        Parameter -> step mapping.

    Returns
    -------
    steps : OrderedDict
        The input, unchanged.
    updates : list
        ``(buffer, step)`` pairs storing each step for the next call.

    """
    # memorize steps for one time step
    self.last_steps = OrderedDict()
    updates = []
    for parameter, step in steps.items():
        buffer_name = "last_step_%s" % parameter.name
        zeros = parameter.get_value() * 0.
        last_step = shared_floatx(zeros, buffer_name)
        add_role(last_step, ALGORITHM_BUFFER)
        updates.append((last_step, step))
        self.last_steps[parameter] = last_step

    # compare last and current step directions
    dot_product = sum((step * self.last_steps[parameter]).sum()
                      for parameter, step in steps.items())
    current_norm = l2_norm(steps.values())
    previous_norm = l2_norm(self.last_steps.values())
    self.cosine = dot_product / current_norm / previous_norm

    return steps, updates
def compute_steps(self, previous_steps):
    """Rescale the steps so their total L2 norm stays below a threshold.

    Parameters
    ----------
    previous_steps : OrderedDict
        Parameter -> step mapping.

    Returns
    -------
    steps : OrderedDict
        The rescaled steps, or `previous_steps` itself when the instance
        has no ``threshold`` attribute.
    updates : list
        Always empty.

    """
    if not hasattr(self, 'threshold'):
        # Keep the (steps, updates) contract on the pass-through path
        # too: callers unpack the result into two values.
        return previous_steps, []
    norm = l2_norm(previous_steps.values())
    multiplier = tensor.switch(norm < self.threshold,
                               1, self.threshold / norm)
    steps = OrderedDict((param, step * multiplier)
                        for param, step in previous_steps.items())
    return steps, []
def compute_steps(self, previous_steps):
    """Rescale steps using a threshold adapted from log-norm statistics.

    Running averages of ``log(norm)`` and ``log(norm) ** 2`` are kept.
    The threshold is ``exp(mean + stdevs * std)`` of the log-norm. Steps
    whose total norm reaches the threshold are rescaled so that their
    norm equals the clip level, which is ``exp(mean)`` when
    ``self.clip_to_mean`` is set and the threshold itself otherwise.

    Parameters
    ----------
    previous_steps : OrderedDict
        Parameter -> step mapping.

    Returns
    -------
    steps : OrderedDict
        The steps multiplied by the clipping factor.
    updates : list
        ``(shared_variable, new_value)`` pairs for the step counter, the
        two running averages, the threshold and the clip level.

    """
    adapt_steps_up = self.adapt_steps + 1.0

    # This will quickly converge the estimate for the mean: the decay
    # used is min(decay, t / (t + 1)), which starts at 0.
    cut_rho_mean = tensor.minimum(self.decay, self.adapt_steps / adapt_steps_up)
    if self.quick_variance_convergence:
        cut_rho_mean2 = cut_rho_mean
    else:
        cut_rho_mean2 = self.decay

    # Build the norm once and reuse it for the logarithm.
    gnorm = l2_norm(previous_steps.values())
    gnorm_log = tensor.log(gnorm)

    # Here we quickly converge the mean.
    gnorm_log_ave_up = cut_rho_mean * self.gnorm_log_ave + (1.0 - cut_rho_mean) * gnorm_log
    # This can wait as it starts from 0 anyways!
    gnorm_log2_ave_up = cut_rho_mean2 * self.gnorm_log2_ave + (1.0 - cut_rho_mean2) * (gnorm_log ** 2)

    # The variance estimate is floored at 0 before the square root.
    clip_threshold_up = tensor.exp(
        gnorm_log_ave_up
        + tensor.sqrt(tensor.maximum(0.0, gnorm_log2_ave_up - gnorm_log_ave_up ** 2)) * self.stdevs
    )

    if self.clip_to_mean:
        clip_level_up = tensor.exp(gnorm_log_ave_up)
    else:
        clip_level_up = clip_threshold_up

    multiplier = tensor.switch(gnorm < clip_threshold_up, 1, clip_level_up / gnorm)
    steps = OrderedDict((parameter, step * multiplier) for parameter, step in previous_steps.items())

    return (
        steps,
        [
            (self.adapt_steps, adapt_steps_up),
            (self.gnorm_log_ave, gnorm_log_ave_up),
            (self.gnorm_log2_ave, gnorm_log2_ave_up),
            (self.clip_threshold, clip_threshold_up),
            (self.clip_level, clip_level_up),
        ],
    )
def compute_steps(self, steps):
    """Return the steps unchanged and expose their directional cosine.

    For every parameter a zero-valued shared buffer is allocated and an
    update is emitted that stores the current step in it.
    ``self.cosine`` is the dot product of current and buffered steps
    divided by both L2 norms.

    Parameters
    ----------
    steps : OrderedDict
        Parameter -> step mapping.

    Returns
    -------
    steps : OrderedDict
        The input, unchanged.
    updates : list
        ``(buffer, step)`` pairs.

    """
    # memorize steps for one time step
    self.last_steps = OrderedDict()
    updates = []
    for parameter, step in steps.items():
        last_step = shared_floatx(
            parameter.get_value() * 0.,
            "last_step_%s" % parameter.name)
        add_role(last_step, ALGORITHM_BUFFER)
        updates.append((last_step, step))
        self.last_steps[parameter] = last_step

    # compare last and current step directions
    overlap = sum(
        (step * self.last_steps[parameter]).sum()
        for parameter, step in steps.items())
    norm_now = l2_norm(steps.values())
    norm_before = l2_norm(self.last_steps.values())
    self.cosine = (overlap / norm_now) / norm_before

    return steps, updates
def compute_steps(self, previous_steps):
    """Rescale the steps so their total L2 norm stays below a threshold.

    Parameters
    ----------
    previous_steps : OrderedDict
        Parameter -> step mapping.

    Returns
    -------
    steps : OrderedDict
        `previous_steps` itself when ``self.threshold`` is ``None``,
        otherwise every step multiplied by ``threshold / norm`` whenever
        the norm is not below the threshold.
    updates : list
        Always empty.

    """
    # Guard clause: without a threshold there is nothing to clip.
    if self.threshold is None:
        return previous_steps, []

    total_norm = l2_norm(previous_steps.values())
    scale = tensor.switch(total_norm < self.threshold,
                          1, self.threshold / total_norm)
    clipped = OrderedDict(
        (parameter, step * scale)
        for parameter, step in previous_steps.items())
    return clipped, []
def compute_steps(self, previous_steps):
    """Clip the steps jointly by their total L2 norm.

    Parameters
    ----------
    previous_steps : OrderedDict
        Parameter -> step mapping.

    Returns
    -------
    steps : OrderedDict
        The input unchanged if ``self.threshold`` is ``None``; otherwise
        the steps scaled by ``threshold / norm`` when the norm is not
        below the threshold.
    updates : list
        Always empty.

    """
    updates = []
    if self.threshold is not None:
        norm = l2_norm(previous_steps.values())
        factor = tensor.switch(norm < self.threshold,
                               1, self.threshold / norm)
        scaled = OrderedDict()
        for parameter, step in previous_steps.items():
            scaled[parameter] = step * factor
        return scaled, updates
    return previous_steps, updates
def _apply_reg(self, cost, params=None, *args, **kwargs):
    '''
    Add a squared L2 penalty on the hashtag and word embeddings to the cost.

    The penalty is ``config.l2_norm * l2_norm([...]) ** 2`` and is only
    added when ``self.config.l2_norm`` is positive. This is best-effort:
    if anything fails while building the penalty, the failure is logged
    and the cost is returned without it.

    :param cost: The cost to regularize
    :param params: Unused; the regularized tensors are fixed
    :return: The cost, with the penalty added when it could be built
    '''
    import logging

    try:
        if self.config.l2_norm > 0:
            cost = cost + self.config.l2_norm * theano_expressions.l2_norm(
                tensors=[self.hashtag_embed.W, self.word_embed.W])**2
    except Exception:
        # Stay best-effort, but never drop the regularizer silently.
        logging.getLogger(__name__).warning(
            "L2 regularization could not be applied; returning the "
            "cost without it", exc_info=True)
    return cost
def compute_step(self, parameter, previous_step):
    """Shrink the step so the updated parameter's norm is bounded.

    The norm of ``parameter - previous_step`` is computed either over
    the whole tensor (empty ``self.axis``) or over the axes listed in
    ``self.axis``, kept as broadcastable dimensions. Where it exceeds
    ``self.threshold`` the step is replaced by one that scales
    ``parameter - previous_step`` by ``threshold / norm``.

    Parameters
    ----------
    parameter : variable
        The parameter being updated.
    previous_step : variable
        The step proposed so far.

    Returns
    -------
    step : variable
        The possibly shrunk step.
    updates : tuple
        Always empty.

    Raises
    ------
    ValueError
        If an entry of ``self.axis`` is not below ``previous_step.ndim``.

    """
    ndim = previous_step.ndim
    for ax in self.axis:
        if ax >= ndim:
            raise ValueError("Invalid axis {} for {}, ndim={}".format(
                self.axis, parameter, previous_step.ndim))

    if len(self.axis) > 0:
        # Sum the squares over each requested axis in ascending order.
        reduced = tensor.sqr(parameter - previous_step)
        for ax in sorted(self.axis):
            reduced = reduced.sum(axis=ax, keepdims=True)
        norms = tensor.sqrt(reduced)
    else:
        norms = l2_norm([parameter - previous_step])

    # We want a step s* that is the same as scaling
    # (parameter - previous_step) by threshold / norm
    # when threshold < norm.
    rescale = self.threshold / norms
    shrinking_step = parameter - rescale * (parameter - previous_step)
    chosen = tensor.switch(norms > self.threshold,
                           shrinking_step, previous_step)
    return chosen, ()
def __init__(self, cost, params, subtensor_params=None, step_rule=None,
             *args, **kwargs):
    """Build gradients where some parameters are replaced by subtensors.

    Parameters
    ----------
    cost : variable
        The cost to differentiate.
    params : list
        The full parameters of the model.
    subtensor_params : dict, optional
        Maps a full parameter to a tuple
        ``(subparam, canonized_indices, outputs, indices)``. Gradients
        are taken with respect to each entry of ``outputs`` and then
        accumulated into one gradient stored under ``subparam``.
        Defaults to an empty mapping.
    step_rule : object, optional
        Provides ``compute_steps``; ``Scale()`` is used when falsy.

    Notes
    -----
    Extra positional ``*args`` are accepted but not forwarded to the
    parent class.

    """
    # Use a fresh dict per call instead of a shared mutable default; the
    # mapping is kept on the instance.
    if subtensor_params is None:
        subtensor_params = {}

    full_params = params
    self.subtensor_params = subtensor_params

    # For each LookupTable, we replace it by its subtensors appearing in
    # the graph.
    params = [param for param in full_params
              if param not in subtensor_params]
    for _, _, outputs, _ in subtensor_params.values():
        params.extend(outputs)

    super(GradientDescent, self).__init__(cost=cost, params=params,
                                          **kwargs)
    # self.params contains the list of outputs of the lookup tables.

    logger.info("Taking the cost gradient")
    self.gradients = dict(
        equizip(self.params, tensor.grad(self.cost, self.params)))

    # We combine the gradients extracted from the same parameter.
    for param, (subparam, canonized_indices,
                outputs, indices) in subtensor_params.items():
        # This is necessary if we want to compute the l2 norm correctly
        # (e.g. for StepClipping): accumulate every partial gradient into
        # one zero buffer shaped like the full parameter.
        tmp = shared_floatx(param.get_value() * 0.)
        for (output, indice) in zip(outputs, indices):
            tmp = tensor.inc_subtensor(tmp[indice], self.gradients[output])
            del self.gradients[output]
        self.gradients[subparam] = tmp[canonized_indices]

    # We remove the subtensors from the list of parameters.
    self.params = full_params

    logger.info("The cost gradient computation graph is built")

    self.step_rule = step_rule if step_rule else Scale()

    self.total_gradient_norm = named_copy(l2_norm(self.gradients.values()),
                                          "total_gradient_norm")
    self.steps, self.step_rule_updates = (
        self.step_rule.compute_steps(self.gradients))
    self.total_step_norm = named_copy(l2_norm(self.steps.values()),
                                      "total_step_norm")
def __init__(self, gen_obj, dis_obj, step_rule, model,
             gen_consider_constant, dis_iter=1,
             gradient_clip=[-0.01, 0.01], **kwargs):
    """Prepare generator and discriminator gradients and updates.

    Parameters
    ----------
    gen_obj : variable
        Generator objective; stored as ``gen_cost``.
    dis_obj : variable
        Discriminator objective; stored as ``dis_cost``.
    step_rule : object
        Step rule handed to ``_prepare_updates`` for both networks.
    model : object
        Must expose ``gen_params`` and ``dis_params``.
    gen_consider_constant : object
        Passed as ``consider_constant`` when computing the generator
        gradients.
    dis_iter : int
        Stored as-is on the instance.
    gradient_clip : list
        Stored as-is on the instance.
        NOTE(review): the default list object is shared by every
        instance that relies on it; confirm nothing mutates it.

    """
    super(AdverserialTraning, self).__init__(**kwargs)

    # Plain bookkeeping attributes.
    self.model = model
    self.gen_cost = gen_obj
    self.dis_cost = dis_obj
    self.dis_iter = dis_iter
    self._n_call = 0
    self.gradient_clip = gradient_clip

    # Generator gradients first, then the discriminator's.
    generator_grads = self._compute_gradients(
        gen_obj, self.model.gen_params,
        consider_constant=gen_consider_constant,
        name='Generator')
    self.gen_gradients = generator_grads
    discriminator_grads = self._compute_gradients(
        dis_obj, self.model.dis_params, name='Discriminator')
    self.dis_gradients = discriminator_grads

    # Joint norm over both networks, discriminator terms first.
    all_gradients = self.dis_gradients + self.gen_gradients
    joint_norm = l2_norm(all_gradients)
    self.total_gradient_norm = joint_norm.copy(name="total_gradient_norm")

    self.step_rule = step_rule
    logger.debug("Computing parameter steps...")
    self._dis_updates = self._prepare_updates(
        self.model.dis_params, self.dis_gradients, self.step_rule,
        name='Discriminator')
    self._gen_updates = self._prepare_updates(
        self.model.gen_params, self.gen_gradients, self.step_rule,
        name='Generator')
def compute_step(self, param, previous_step):
    """Shrink the step so the updated parameter's norm is bounded.

    The norm of ``param - previous_step`` is taken over the whole tensor
    when ``self.axis`` is empty, or over the axes in ``self.axis`` (kept
    as broadcastable dimensions) otherwise. Where that norm exceeds
    ``self.threshold`` the step is replaced by one that scales
    ``param - previous_step`` by ``threshold / norm``.

    Parameters
    ----------
    param : variable
        The parameter being updated.
    previous_step : variable
        The step proposed so far.

    Returns
    -------
    step : variable
        The possibly shrunk step.
    updates : tuple
        Always empty.

    Raises
    ------
    ValueError
        If an entry of ``self.axis`` is not below ``previous_step.ndim``.

    """
    axis_out_of_range = any(
        ax >= previous_step.ndim for ax in self.axis)
    if axis_out_of_range:
        message = "Invalid axis {} for {}, ndim={}".format(
            self.axis, param, previous_step.ndim)
        raise ValueError(message)

    if len(self.axis) == 0:
        norms = l2_norm([param - previous_step])
    else:
        # Accumulate the squared entries axis by axis, lowest axis first.
        accumulated = tensor.sqr(param - previous_step)
        for ax in sorted(self.axis):
            accumulated = accumulated.sum(axis=ax, keepdims=True)
        norms = tensor.sqrt(accumulated)

    # We want a step s* that is the same as scaling (param - previous_step)
    # by threshold / norm when threshold < norm.
    shrink_factor = self.threshold / norms
    shrinking_step = param - shrink_factor * (param - previous_step)
    too_large = norms > self.threshold
    return tensor.switch(too_large, shrinking_step, previous_step), ()
def _apply_reg(self, cost, params=None, *args, **kwargs):
    '''
    Apply regularization (squared L2 norm) on the user, hashtag and word
    embeddings.

    The penalty is ``config.l2_norm * l2_norm([...]) ** 2`` and is only
    added when ``self.config.l2_norm`` is positive. This is best-effort:
    if anything fails while building the penalty, the failure is logged
    and the cost is returned without it.

    :param cost: The cost to regularize
    :param params: Currently unused; the regularized tensors are fixed
    :return: The cost, with the penalty added when it could be built
    '''
    import logging

    try:
        if self.config.l2_norm > 0:
            cost = cost + self.config.l2_norm * theano_expressions.l2_norm(
                tensors=[
                    self.user_embed.W,
                    self.hashtag_embed.W,
                    self.word_embed.W,
                ])**2
    except Exception:
        # Stay best-effort, but never drop the regularizer silently.
        logging.getLogger(__name__).warning(
            "L2 regularization could not be applied; returning the "
            "cost without it", exc_info=True)
    return cost
def l2_regularization(cg, rate=0.01):
    """Build an L2 penalty over all weight variables of a graph.

    Variables of `cg` carrying the ``WEIGHT`` role are collected and
    their joint norm, as returned by ``l2_norm`` with its default
    arguments, is multiplied by `rate`.

    Parameters
    ----------
    cg : ComputationGraph
        computation graph for a network
    rate : float
        L2 regularization rate

    Returns
    -------
    L2_cost : expression
        L2 cost for a network

    """
    select_weights = VariableFilter(roles=[WEIGHT])
    weights = select_weights(cg.variables)
    return rate * l2_norm(weights)
def compute_steps(self, previous_steps):
    """Damp steps whose norm exceeds the median of recent step norms.

    ``self.window`` holds recent step norms. The current norm is
    compared with the window's median: if the ratio is at most
    ``self.max_ratio`` (fixed to 1 here) the steps are used as is,
    otherwise they are rescaled so that their norm becomes
    ``norm ** (1 / ratio)``. The current (unscaled) norm is then pushed
    to the front of the window, which is truncated to
    ``self.window_width`` entries.

    Several intermediate expressions are stored on the instance
    (``median``, ``max_ratio``, ``norm``, ``ratio``, ``newnorm``).

    Parameters
    ----------
    previous_steps : OrderedDict
        Parameter -> step mapping.

    Returns
    -------
    steps : OrderedDict
        The steps multiplied by the damping factor.
    updates : list
        A single ``(self.window, new_window)`` pair.

    """
    def median(window):
        # Element at the middle index of the sorted window. Floor
        # division keeps the index an integer regardless of the Python
        # version or ``__future__`` division.
        return tensor.sort(window)[window.shape[0] // 2]

    self.median = median(self.window)
    # NOTE: a median-absolute-deviation based ratio
    # (1 + deviation / median) was previously sketched here; the ratio
    # is currently fixed.
    self.max_ratio = 1.

    self.norm = l2_norm(previous_steps.values())
    self.ratio = self.norm / self.median
    acceptable = self.ratio <= self.max_ratio

    multiplier = tensor.switch(
        acceptable,
        # smaller steps are used as is
        1,
        # larger steps are pushed down
        self.norm ** (1 / self.ratio) / self.norm)
    self.newnorm = multiplier * self.norm

    # The norm always enters the window, whether or not it was
    # acceptable.
    newwindow = tensor.concatenate([
        tensor.shape_padleft(self.norm),
        self.window[:(self.window_width - 1)]],
        axis=0)

    steps = OrderedDict(
        (parameter, multiplier * step)
        for parameter, step in previous_steps.items())
    updates = [(self.window, newwindow)]
    return steps, updates
def __init__(self, cost=None, parameters=None, step_rule=None,
             gradients=None, known_grads=None, consider_constant=None,
             **kwargs):
    """Build gradients, steps and updates in a deterministic order.

    Parameters
    ----------
    cost : variable, optional
        The cost; needed when `gradients` is not given.
    parameters : list, optional
        The parameters to update. May be omitted when `gradients` is an
        ``OrderedDict`` (its keys are then used).
    step_rule : object, optional
        Provides ``compute_steps``; ``Scale()`` is used when falsy.
    gradients : Mapping or iterable of pairs, optional
        Parameter -> gradient. Non-mapping input is coerced to an
        ``OrderedDict``.
    known_grads, consider_constant : optional
        Passed to ``self._compute_gradients``; rejected when `gradients`
        is given.
    **kwargs
        Forwarded to the parent class. The parameter and step-rule
        updates are appended to ``kwargs['updates']``; a list supplied
        by the caller is extended in place.

    Raises
    ------
    ValueError
        If a parameter order cannot be established deterministically,
        or if `known_grads` / `consider_constant` accompany `gradients`.

    """
    # Set initial values for cost, parameters, gradients.
    self.cost = cost
    self.parameters = parameters
    # Coerce lists of tuples to OrderedDict. Do not coerce Mappings,
    # as we don't want to convert dict -> OrderedDict and give it
    # an arbitrary, non-deterministic order.
    if gradients is not None and not isinstance(gradients, Mapping):
        gradients = OrderedDict(gradients)
    self.gradients = gradients

    # If we don't have gradients, we'll need to infer them from the
    # cost and the parameters, both of which must not be None.
    if not self.gradients:
        self.gradients = self._compute_gradients(known_grads,
                                                 consider_constant)
    else:
        if cost is not None:
            logger.warning(('{}: gradients already specified directly; '
                            'cost is unused.'
                            .format(self.__class__.__name__)))
        if self.parameters is None and isinstance(gradients, OrderedDict):
            # If the dictionary is ordered, it's safe to use the keys
            # as they have a deterministic order.
            self.parameters = list(self.gradients.keys())
        elif self.parameters is not None:
            # If parameters and gradients.keys() don't match we can
            # try to recover if gradients is ordered.
            if set(self.parameters) != set(self.gradients.keys()):
                logger.warning("Specified parameters list does not match "
                               "keys in provided gradient dictionary; "
                               "using parameters inferred from gradients")
                if not isinstance(self.gradients, OrderedDict):
                    raise ValueError(determinism_error)
                self.parameters = list(self.gradients.keys())
        else:
            # self.parameters is None, and gradients isn't
            # an OrderedDict. We can't do anything safe.
            raise ValueError(determinism_error)
        if known_grads:
            raise ValueError("known_grads has no effect when gradients "
                             "are passed in")
        if consider_constant is not None:
            raise ValueError("consider_constant has no effect when "
                             "gradients are passed in")

    # The order in which the different gradient terms appears
    # here matters, as floating point addition is non-associative (and
    # Theano's graph optimizations are not order-independent).
    # This is why we do not use .values().
    gradient_values = [self.gradients[p] for p in self.parameters]
    self.total_gradient_norm = (l2_norm(gradient_values)
                                .copy(name="total_gradient_norm"))

    self.step_rule = step_rule if step_rule else Scale()
    logger.debug("Computing parameter steps...")
    self.steps, self.step_rule_updates = (
        self.step_rule.compute_steps(self.gradients))

    # Same as gradient_values above: the order may influence a
    # bunch of things, so enforce a consistent one (don't use
    # .values()).
    step_values = [self.steps[p] for p in self.parameters]
    self.total_step_norm = (l2_norm(step_values)
                            .copy(name="total_step_norm"))

    # Once again, iterating on gradients may not be deterministically
    # ordered if it is not an OrderedDict. We add the updates here in
    # the order specified in self.parameters. Keep it this way to
    # maintain reproducibility.
    kwargs.setdefault('updates', []).extend(
        itertools.chain(((parameter, parameter - self.steps[parameter])
                         for parameter in self.parameters),
                        self.step_rule_updates)
    )
    super(GradientDescent, self).__init__(**kwargs)
def main(job_id, params):
    """Train the joint left/right network and return the Spearmint loss.

    Hyper-parameters are read from the ``[hyperparams]`` section of
    ``./params``. Two pre-trained networks are loaded from disk, their
    first four children are stacked on the left and right inputs, and a
    new three-layer MLP is trained on the concatenated hidden
    activations.

    Parameters
    ----------
    job_id : object
        Unused by this function.
    params : dict
        Optional Spearmint overrides. When truthy it must contain
        ``base_lr``, ``dropout_ratio``, ``hidden_units`` and
        ``weight_decay``, each indexable with ``[0]``.

    Returns
    -------
    float
        ``ve + abs(te - ve)`` where ``ve`` and ``te`` are read from the
        ``validation_error`` and ``error`` entries of the last epoch row
        of the log.

    """
    config = ConfigParser.ConfigParser()
    # Close the config file once it has been parsed.
    with open('./params') as config_file:
        config.readfp(config_file)

    # NOTE(review): the third positional argument of ``config.get`` looks
    # intended as a fallback value; in Python 2's ConfigParser that slot
    # is ``raw``, not a default, so a missing option would still raise.
    # Confirm against the ConfigParser documentation.
    max_epoch = int(config.get('hyperparams', 'max_iter', 100))
    base_lr = float(config.get('hyperparams', 'base_lr', 0.01))
    train_batch = int(config.get('hyperparams', 'train_batch', 256))
    valid_batch = int(config.get('hyperparams', 'valid_batch', 512))
    # NOTE(review): this reads 'valid_batch', not 'test_batch' -- confirm
    # whether that is intended.
    test_batch = int(config.get('hyperparams', 'valid_batch', 512))
    hidden_units = int(config.get('hyperparams', 'hidden_units', 16))
    W_sd = float(config.get('hyperparams', 'W_sd', 0.01))
    W_mu = float(config.get('hyperparams', 'W_mu', 0.0))
    b_sd = float(config.get('hyperparams', 'b_sd', 0.01))
    b_mu = float(config.get('hyperparams', 'b_mu', 0.0))
    dropout_ratio = float(config.get('hyperparams', 'dropout_ratio', 0.2))
    weight_decay = float(config.get('hyperparams', 'weight_decay', 0.001))
    max_norm = float(config.get('hyperparams', 'max_norm', 100.0))
    solver = config.get('hyperparams', 'solver_type', 'rmsprop')
    data_file = config.get('hyperparams', 'data_file')
    fine_tune = config.getboolean('hyperparams', 'fine_tune')

    # Spearmint optimization parameters:
    if params:
        base_lr = float(params['base_lr'][0])
        dropout_ratio = float(params['dropout_ratio'][0])
        hidden_units = params['hidden_units'][0]
        weight_decay = params['weight_decay'][0]

    if 'adagrad' in solver:
        solver_type = CompositeRule([AdaGrad(learning_rate=base_lr),
                                     VariableClipping(threshold=max_norm)])
    else:
        solver_type = CompositeRule([RMSProp(learning_rate=base_lr),
                                     VariableClipping(threshold=max_norm)])

    rn_file = '/projects/francisco/repositories/NI-ML/models/deepnets/blocks/ff/models/rnet/2015-06-25-18:13'
    ln_file = '/projects/francisco/repositories/NI-ML/models/deepnets/blocks/ff/models/lnet/2015-06-29-11:45'

    train = H5PYDataset(data_file, which_set='train')
    valid = H5PYDataset(data_file, which_set='valid')
    test = H5PYDataset(data_file, which_set='test')

    l_x = tensor.matrix('l_features')
    r_x = tensor.matrix('r_features')
    y = tensor.lmatrix('targets')

    lnet = load(ln_file).model.get_top_bricks()[0]
    rnet = load(rn_file).model.get_top_bricks()[0]

    # Pre-trained layers:
    # Inputs -> hidden_1 -> hidden 2
    # Prefix the child names with the side they belong to.
    for side, net in zip(['l', 'r'], [lnet, rnet]):
        for child in net.children:
            child.name = side + '_' + child.name

    ll1 = lnet.children[0]
    lr1 = lnet.children[1]
    ll2 = lnet.children[2]
    lr2 = lnet.children[3]

    rl1 = rnet.children[0]
    rr1 = rnet.children[1]
    rl2 = rnet.children[2]
    rr2 = rnet.children[3]

    l_h = lr2.apply(ll2.apply(lr1.apply(ll1.apply(l_x))))
    r_h = rr2.apply(rl2.apply(rr1.apply(rl1.apply(r_x))))

    input_dim = ll2.output_dim + rl2.output_dim

    # hidden_2 -> hidden_3 -> hidden_4 -> Logistic output
    output_mlp = MLP(
        activations=[
            Rectifier(name='h3'),
            Rectifier(name='h4'),
            Softmax(name='output'),
        ],
        dims=[
            input_dim,
            hidden_units,
            hidden_units,
            2,
        ],
        weights_init=IsotropicGaussian(std=W_sd, mean=W_mu),
        # Biases use their own std/mean, which were read above but
        # previously never used.
        biases_init=IsotropicGaussian(std=b_sd, mean=b_mu))
    output_mlp.initialize()

    # Concatenate the inputs from the two hidden subnets into a single
    # variable for input into the next layer.
    merge = tensor.concatenate([l_h, r_h], axis=1)

    y_hat = output_mlp.apply(merge)

    # Define a cost function to optimize, and a classification error rate.
    # Also apply the outputs from the net and corresponding targets:
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = 'error'

    # This is the model: before applying dropout
    model = Model(cost)

    # Need to define the computation graph for the cost func:
    cost_graph = ComputationGraph([cost])

    # This returns a list of weight vectors for each layer
    W = VariableFilter(roles=[WEIGHT])(cost_graph.variables)

    # Add some regularization to this model:
    cost += weight_decay * l2_norm(W)
    cost.name = 'entropy'

    # computational graph with l2 reg
    cost_graph = ComputationGraph([cost])

    # Apply dropout to inputs:
    inputs = VariableFilter([INPUT])(cost_graph.variables)
    dropout_inputs = [var for var in inputs
                      if var.name.startswith('linear_')]
    # The first matching input uses a fixed ratio of 0.2; the remaining
    # ones use the configurable dropout_ratio.
    dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]], 0.2)
    dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:],
                                  dropout_ratio)
    dropout_cost = dropout_graph.outputs[0]
    dropout_cost.name = 'dropout_entropy'

    # If no fine-tuning of l-r models is wanted, find the params for only
    # the joint layers:
    if fine_tune:
        params_to_update = dropout_graph.parameters
    else:
        params_to_update = VariableFilter(
            [PARAMETER], bricks=output_mlp.children)(cost_graph)

    # Learning Algorithm:
    algo = GradientDescent(
        step_rule=solver_type,
        params=params_to_update,
        cost=dropout_cost)

    # Data stream used for training model:
    training_stream = Flatten(
        DataStream.default_stream(
            dataset=train,
            iteration_scheme=ShuffledScheme(
                train.num_examples,
                batch_size=train_batch)))

    training_monitor = TrainingDataMonitoring(
        [dropout_cost,
         aggregation.mean(error),
         aggregation.mean(algo.total_gradient_norm)],
        after_batch=True)

    # Use the 'valid' set for validation during training:
    validation_stream = Flatten(
        DataStream.default_stream(
            dataset=valid,
            iteration_scheme=ShuffledScheme(
                valid.num_examples,
                batch_size=valid_batch)))

    validation_monitor = DataStreamMonitoring(
        variables=[cost, error],
        data_stream=validation_stream,
        prefix='validation',
        after_epoch=True)

    test_stream = Flatten(
        DataStream.default_stream(
            dataset=test,
            iteration_scheme=ShuffledScheme(
                test.num_examples,
                batch_size=test_batch)))

    test_monitor = DataStreamMonitoring(
        variables=[error],
        data_stream=test_stream,
        prefix='test',
        after_training=True)

    plotting = Plot(
        'AdniNet_LeftRight',
        channels=[
            ['dropout_entropy'],
            ['error', 'validation_error'],
        ],
    )

    # Checkpoint class used to save model and log:
    stamp = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y-%m-%d-%H:%M')
    checkpoint = Checkpoint('./models/{}'.format(stamp),
                            save_separately=['model', 'log'],
                            every_n_epochs=1)

    # The main loop will train the network and output reports, etc
    main_loop = MainLoop(
        data_stream=training_stream,
        model=model,
        algorithm=algo,
        extensions=[
            validation_monitor,
            training_monitor,
            plotting,
            FinishAfter(after_n_epochs=max_epoch),
            FinishIfNoImprovementAfter(
                notification_name='validation_error', epochs=1),
            Printing(),
            ProgressBar(),
            checkpoint,
            test_monitor,
        ])
    main_loop.run()

    ve = float(main_loop.log.last_epoch_row['validation_error'])
    # NOTE(review): 'error' is the un-prefixed channel; the test monitor
    # uses prefix 'test' -- confirm which channel is meant here.
    te = float(main_loop.log.last_epoch_row['error'])
    spearmint_loss = ve + abs(te - ve)
    # Parenthesised single-argument form prints identically on Python 2
    # and also parses on Python 3.
    print('Spearmint Loss: {}'.format(spearmint_loss))
    return spearmint_loss
def main(job_id, params):
    """Train a feed-forward classifier and return a loss for Spearmint.

    Hyperparameters are read from the ``[hyperparams]`` section of
    ``./params``.  Every option except ``data_file`` falls back to a built-in
    default when it is absent.  Values supplied in ``params`` override the
    configured ones.

    Parameters
    ----------
    job_id
        Not used by this function.
    params : dict or None
        Optional overrides.  When truthy it must provide ``base_lr``,
        ``dropout_ratio``, ``hidden_units`` and ``weight_decay``, each as a
        sequence whose first element is the value.

    Returns
    -------
    float
        ``ve + abs(te - ve)`` where ``ve`` and ``te`` are the
        ``validation_error`` and ``error`` records of the last epoch row.
    """
    section = 'hyperparams'
    config = ConfigParser.ConfigParser()
    # Close the handle once the file is parsed instead of leaking it.
    with open('./params') as config_file:
        config.readfp(config_file)

    def option(name, default, cast=str):
        """Return option ``name`` converted by ``cast``, else ``default``."""
        # ``ConfigParser.get`` has no default argument: its third positional
        # parameter is ``raw``.  The fallback therefore has to be explicit.
        if config.has_option(section, name):
            return cast(config.get(section, name))
        return default

    max_epoch = option('max_iter', 100, int)
    base_lr = option('base_lr', 0.01, float)
    train_batch = option('train_batch', 256, int)
    valid_batch = option('valid_batch', 512, int)
    # Previously this silently re-read 'valid_batch'.  A dedicated key is
    # honoured now, with the validation batch size as the fallback.
    test_batch = option('test_batch', valid_batch, int)
    W_sd = option('W_sd', 0.01, float)
    W_mu = option('W_mu', 0.0, float)
    b_sd = option('b_sd', 0.01, float)
    b_mu = option('b_mu', 0.0, float)
    hidden_units = option('hidden_units', 32, int)
    input_dropout_ratio = option('input_dropout_ratio', 0.2, float)
    dropout_ratio = option('dropout_ratio', 0.2, float)
    weight_decay = option('weight_decay', 0.001, float)
    max_norm = option('max_norm', 100.0, float)
    solver = option('solver_type', 'rmsprop')
    side = option('side', 'b')
    # The data file has no sensible default, so a missing option still raises.
    data_file = config.get(section, 'data_file')

    # Spearmint optimization parameters:
    if params:
        base_lr = float(params['base_lr'][0])
        dropout_ratio = float(params['dropout_ratio'][0])
        hidden_units = int(params['hidden_units'][0])
        weight_decay = float(params['weight_decay'][0])

    core_rule = AdaGrad if 'adagrad' in solver else RMSProp
    solver_type = CompositeRule([
        core_rule(learning_rate=base_lr),
        VariableClipping(threshold=max_norm),
    ])

    input_dim = {'l': 11427, 'r': 10519, 'b': 10519 + 11427}
    use_both_sides = 'b' in side
    feature_source = '{}_features'.format(side)

    def open_set(which_set):
        """Open one split, restricted to a single side unless both are used."""
        if use_both_sides:
            return H5PYDataset(data_file, which_set=which_set)
        return H5PYDataset(data_file, which_set=which_set,
                           sources=[feature_source, 'targets'])

    train = open_set('train')
    valid = open_set('valid')
    test = open_set('test')

    if use_both_sides:
        x_l = tensor.matrix('l_features')
        x_r = tensor.matrix('r_features')
        x = tensor.concatenate([x_l, x_r], axis=1)
    else:
        x = tensor.matrix(feature_source)
    y = tensor.lmatrix('targets')

    # Feed-forward net: input, two hidden layers and a softmax output.
    mlp = MLP(
        activations=[
            Rectifier(name='h1'),
            Rectifier(name='h2'),
            Softmax(name='output'),
        ],
        dims=[input_dim[side], hidden_units, hidden_units, 2],
        weights_init=IsotropicGaussian(std=W_sd, mean=W_mu),
        biases_init=IsotropicGaussian(b_sd, b_mu))
    mlp.initialize()

    # y_hat is the output of the net with x as its input.
    y_hat = mlp.apply(x)

    # Cost to optimize and classification error rate.
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = 'error'

    # The model is built from the cost before regularization and dropout.
    model = Model(cost)

    # Add L2 regularization over all weight variables of the cost graph.
    W = VariableFilter(roles=[WEIGHT])(ComputationGraph([cost]).variables)
    cost = cost + weight_decay * l2_norm(W)
    cost.name = 'entropy'

    # Computation graph of the regularized cost.
    cost_graph = ComputationGraph([cost])

    # Dropout: the first matching input gets its own ratio.
    inputs = VariableFilter([INPUT])(cost_graph.variables)
    dropout_inputs = [
        variable for variable in inputs
        if variable.name.startswith('linear_')
    ]
    dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]],
                                  input_dropout_ratio)
    dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:],
                                  dropout_ratio)
    dropout_cost = dropout_graph.outputs[0]
    dropout_cost.name = 'dropout_entropy'

    # Learning algorithm (notice: the dropout cost is used for learning).
    algo = GradientDescent(step_rule=solver_type,
                           params=dropout_graph.parameters,
                           cost=dropout_cost)

    def flat_stream(dataset, batch_size):
        """Return a flattened, shuffled default stream over ``dataset``."""
        return Flatten(
            DataStream.default_stream(
                dataset=dataset,
                iteration_scheme=ShuffledScheme(dataset.num_examples,
                                                batch_size=batch_size)))

    # Data stream used for training the model.
    training_stream = flat_stream(train, train_batch)
    training_monitor = TrainingDataMonitoring(
        [
            dropout_cost,
            aggregation.mean(error),
            aggregation.mean(algo.total_gradient_norm),
        ],
        after_batch=True)

    # The 'valid' set is used for validation during training.
    validation_stream = flat_stream(valid, valid_batch)
    validation_monitor = DataStreamMonitoring(variables=[cost, error],
                                              data_stream=validation_stream,
                                              prefix='validation',
                                              after_epoch=True)

    test_stream = flat_stream(test, test_batch)
    test_monitor = DataStreamMonitoring(variables=[error],
                                        data_stream=test_stream,
                                        prefix='test',
                                        after_training=True)

    plotting = Plot('AdniNet_{}'.format(side),
                    channels=[
                        ['dropout_entropy', 'validation_entropy'],
                        ['error', 'validation_error'],
                    ],
                    after_batch=False)

    # Checkpoint used to save the model and the log.
    stamp = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M')
    checkpoint = Checkpoint('./models/{}net/{}'.format(side, stamp),
                            save_separately=['model', 'log'],
                            every_n_epochs=1)

    # Home-brewed early stopping once overfitting is detected.
    early_stopper = FinishIfOverfitting(error_name='error',
                                        validation_name='validation_error',
                                        threshold=0.1,
                                        epochs=5,
                                        burn_in=100)

    # The main loop trains the network and outputs reports.
    main_loop = MainLoop(data_stream=training_stream,
                         model=model,
                         algorithm=algo,
                         extensions=[
                             validation_monitor,
                             training_monitor,
                             plotting,
                             FinishAfter(after_n_epochs=max_epoch),
                             early_stopper,
                             Printing(),
                             ProgressBar(),
                             checkpoint,
                             test_monitor,
                         ])
    main_loop.run()

    ve = float(main_loop.log.last_epoch_row['validation_error'])
    te = float(main_loop.log.last_epoch_row['error'])
    spearmint_loss = ve + abs(te - ve)
    # Parenthesised single argument prints identically on Python 2 and 3.
    print('Spearmint Loss: {}'.format(spearmint_loss))
    return spearmint_loss
def main(job_id, params):
    """Train a joint net on top of two pre-trained nets; return the loss.

    Two previously saved models (left and right) are loaded from hard-coded
    paths, their first four child bricks are chained into hidden
    representations, and a new MLP is trained on the concatenation.
    Hyperparameters are read from the ``[hyperparams]`` section of
    ``./params``.  Every option except ``data_file`` and ``fine_tune`` falls
    back to a built-in default when it is absent.

    Parameters
    ----------
    job_id
        Not used by this function.
    params : dict or None
        Optional overrides.  When truthy it must provide ``base_lr``,
        ``dropout_ratio``, ``hidden_units`` and ``weight_decay``, each as a
        sequence whose first element is the value.

    Returns
    -------
    float
        ``ve + abs(te - ve)`` where ``ve`` and ``te`` are the
        ``validation_error`` and ``error`` records of the last epoch row.
    """
    section = 'hyperparams'
    config = ConfigParser.ConfigParser()
    # Close the handle once the file is parsed instead of leaking it.
    with open('./params') as config_file:
        config.readfp(config_file)

    def option(name, default, cast=str):
        """Return option ``name`` converted by ``cast``, else ``default``."""
        # ``ConfigParser.get`` has no default argument: its third positional
        # parameter is ``raw``.  The fallback therefore has to be explicit.
        if config.has_option(section, name):
            return cast(config.get(section, name))
        return default

    max_epoch = option('max_iter', 100, int)
    base_lr = option('base_lr', 0.01, float)
    train_batch = option('train_batch', 256, int)
    valid_batch = option('valid_batch', 512, int)
    # Previously this silently re-read 'valid_batch'.  A dedicated key is
    # honoured now, with the validation batch size as the fallback.
    test_batch = option('test_batch', valid_batch, int)
    hidden_units = option('hidden_units', 16, int)
    W_sd = option('W_sd', 0.01, float)
    W_mu = option('W_mu', 0.0, float)
    b_sd = option('b_sd', 0.01, float)
    b_mu = option('b_mu', 0.0, float)
    dropout_ratio = option('dropout_ratio', 0.2, float)
    weight_decay = option('weight_decay', 0.001, float)
    max_norm = option('max_norm', 100.0, float)
    solver = option('solver_type', 'rmsprop')
    # These two have no default, so a missing option still raises.
    data_file = config.get(section, 'data_file')
    fine_tune = config.getboolean(section, 'fine_tune')

    # Spearmint optimization parameters:
    if params:
        base_lr = float(params['base_lr'][0])
        dropout_ratio = float(params['dropout_ratio'][0])
        hidden_units = int(params['hidden_units'][0])
        weight_decay = float(params['weight_decay'][0])

    core_rule = AdaGrad if 'adagrad' in solver else RMSProp
    solver_type = CompositeRule([
        core_rule(learning_rate=base_lr),
        VariableClipping(threshold=max_norm),
    ])

    rn_file = '/projects/francisco/repositories/NI-ML/models/deepnets/blocks/ff/models/rnet/2015-06-25-18:13'
    ln_file = '/projects/francisco/repositories/NI-ML/models/deepnets/blocks/ff/models/lnet/2015-06-29-11:45'

    train = H5PYDataset(data_file, which_set='train')
    valid = H5PYDataset(data_file, which_set='valid')
    test = H5PYDataset(data_file, which_set='test')

    l_x = tensor.matrix('l_features')
    r_x = tensor.matrix('r_features')
    y = tensor.lmatrix('targets')

    lnet = load(ln_file).model.get_top_bricks()[0]
    rnet = load(rn_file).model.get_top_bricks()[0]

    # Pre-trained layers: inputs -> hidden_1 -> hidden_2.
    # Prefix the child brick names so the two nets do not collide.
    for side, net in zip(['l', 'r'], [lnet, rnet]):
        for child in net.children:
            child.name = side + '_' + child.name

    ll1, lr1, ll2, lr2 = (lnet.children[i] for i in range(4))
    rl1, rr1, rl2, rr2 = (rnet.children[i] for i in range(4))

    l_h = lr2.apply(ll2.apply(lr1.apply(ll1.apply(l_x))))
    r_h = rr2.apply(rl2.apply(rr1.apply(rl1.apply(r_x))))

    input_dim = ll2.output_dim + rl2.output_dim

    # hidden_2 -> hidden_3 -> hidden_4 -> softmax output.
    # Biases use their own b_sd/b_mu hyperparameters; they were previously
    # read from the config and then ignored in favour of W_sd/W_mu.
    output_mlp = MLP(
        activations=[
            Rectifier(name='h3'),
            Rectifier(name='h4'),
            Softmax(name='output'),
        ],
        dims=[input_dim, hidden_units, hidden_units, 2],
        weights_init=IsotropicGaussian(std=W_sd, mean=W_mu),
        biases_init=IsotropicGaussian(std=b_sd, mean=b_mu))
    output_mlp.initialize()

    # Concatenate the two hidden representations into a single variable
    # for input into the next layer.
    merge = tensor.concatenate([l_h, r_h], axis=1)
    y_hat = output_mlp.apply(merge)

    # Cost to optimize and classification error rate.
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = 'error'

    # The model is built from the cost before regularization and dropout.
    model = Model(cost)

    # Add L2 regularization over all weight variables of the cost graph.
    W = VariableFilter(roles=[WEIGHT])(ComputationGraph([cost]).variables)
    cost = cost + weight_decay * l2_norm(W)
    cost.name = 'entropy'

    # Computation graph of the regularized cost.
    cost_graph = ComputationGraph([cost])

    # Dropout: the first matching input uses a fixed ratio of 0.2.
    inputs = VariableFilter([INPUT])(cost_graph.variables)
    dropout_inputs = [
        variable for variable in inputs
        if variable.name.startswith('linear_')
    ]
    dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]], 0.2)
    dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:],
                                  dropout_ratio)
    dropout_cost = dropout_graph.outputs[0]
    dropout_cost.name = 'dropout_entropy'

    # Without fine-tuning of the l/r models, update only the parameters of
    # the joint layers.
    if fine_tune:
        params_to_update = dropout_graph.parameters
    else:
        params_to_update = VariableFilter(
            [PARAMETER], bricks=output_mlp.children)(cost_graph)

    # Learning algorithm:
    algo = GradientDescent(step_rule=solver_type,
                           params=params_to_update,
                           cost=dropout_cost)

    def flat_stream(dataset, batch_size):
        """Return a flattened, shuffled default stream over ``dataset``."""
        return Flatten(
            DataStream.default_stream(
                dataset=dataset,
                iteration_scheme=ShuffledScheme(dataset.num_examples,
                                                batch_size=batch_size)))

    # Data stream used for training the model.
    training_stream = flat_stream(train, train_batch)
    training_monitor = TrainingDataMonitoring(
        [
            dropout_cost,
            aggregation.mean(error),
            aggregation.mean(algo.total_gradient_norm),
        ],
        after_batch=True)

    # The 'valid' set is used for validation during training.
    validation_stream = flat_stream(valid, valid_batch)
    validation_monitor = DataStreamMonitoring(variables=[cost, error],
                                              data_stream=validation_stream,
                                              prefix='validation',
                                              after_epoch=True)

    test_stream = flat_stream(test, test_batch)
    test_monitor = DataStreamMonitoring(variables=[error],
                                        data_stream=test_stream,
                                        prefix='test',
                                        after_training=True)

    plotting = Plot(
        'AdniNet_LeftRight',
        channels=[
            ['dropout_entropy'],
            ['error', 'validation_error'],
        ],
    )

    # Checkpoint used to save the model and the log.
    stamp = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M')
    checkpoint = Checkpoint('./models/{}'.format(stamp),
                            save_separately=['model', 'log'],
                            every_n_epochs=1)

    # NOTE(review): 'validation_error' is passed as the notification name;
    # confirm this record is only written on improvement, otherwise this
    # stopping criterion may never trigger.
    early_stopper = FinishIfNoImprovementAfter(
        notification_name='validation_error', epochs=1)

    # The main loop trains the network and outputs reports.
    main_loop = MainLoop(data_stream=training_stream,
                         model=model,
                         algorithm=algo,
                         extensions=[
                             validation_monitor,
                             training_monitor,
                             plotting,
                             FinishAfter(after_n_epochs=max_epoch),
                             early_stopper,
                             Printing(),
                             ProgressBar(),
                             checkpoint,
                             test_monitor,
                         ])
    main_loop.run()

    ve = float(main_loop.log.last_epoch_row['validation_error'])
    te = float(main_loop.log.last_epoch_row['error'])
    spearmint_loss = ve + abs(te - ve)
    # Parenthesised single argument prints identically on Python 2 and 3.
    print('Spearmint Loss: {}'.format(spearmint_loss))
    return spearmint_loss
def training(self, fea2obj, batch_size, learning_rate=0.005,
             steprule='adagrad', wait_epochs=5, kl_weight_init=None,
             klw_ep=50, klw_inc_rate=0, num_epochs=None):
    """Build the model, the training algorithm and run the main loop.

    Parameters
    ----------
    fea2obj : dict
        Feature objects; passed to ``get_comb_stream`` and
        ``build_model_new``.  ``fea2obj['targets'].t2idx`` gives the target
        index whose length is passed to the model builder.
    batch_size : int
        Batch size of the training stream.
    learning_rate : float
        Learning rate passed to the selected step rule (not used by the
        'adadelta' rule).
    steprule : str
        Selects the step rule by substring, tested in this order:
        'adagrad', 'adadelta', 'decay', 'momentum', 'adam'.
    wait_epochs : int
        ``epochs`` argument of ``FinishIfNoImprovementAfter``.
    kl_weight_init : float or None
        Initial KL weight.  When ``None`` the config value ``kld_weight`` is
        used if present, otherwise 1.
    klw_ep, klw_inc_rate
        Only referenced by the commented-out weight schedule below.
    num_epochs : int or None
        Number of epochs; falls back to the config value ``nepochs`` when
        falsy.

    Raises
    ------
    ValueError
        If ``steprule`` matches none of the known step rules.
    """
    networkfile = self._config['net']
    n_epochs = num_epochs or int(self._config['nepochs'])
    reg_weight = float(self._config['loss_weight'])
    reg_type = self._config['loss_reg']
    numtrain = (int(self._config['num_train'])
                if 'num_train' in self._config else None)

    train_stream, num_samples_train = get_comb_stream(
        fea2obj, 'train', batch_size, shuffle=True, num_examples=numtrain)
    dev_stream, num_samples_dev = get_comb_stream(
        fea2obj, 'dev', batch_size=None, shuffle=False)
    logger.info('sources: %s -- number of train/dev samples: %d/%d',
                train_stream.sources, num_samples_train, num_samples_dev)

    t2idx = fea2obj['targets'].t2idx

    # An explicit argument wins, then the config, then 1.  The former
    # one-liner parsed as ``(a or b) if c else 1`` and therefore ignored
    # ``kl_weight_init`` whenever 'kld_weight' was missing from the config.
    if kl_weight_init is not None:
        klw_init = kl_weight_init
    elif 'kld_weight' in self._config:
        klw_init = float(self._config['kld_weight'])
    else:
        klw_init = 1
    # %s instead of %d so that fractional weights are not truncated.
    logger.info('kl_weight_init: %s', klw_init)

    kl_weight = shared_floatx(klw_init, 'kl_weight')
    entropy_weight = shared_floatx(1., 'entropy_weight')

    (cost, p_at_1, _, KLD, logpy_xz, pat1_recog,
     misclassify_rate) = build_model_new(fea2obj, len(t2idx), self._config,
                                         kl_weight, entropy_weight)

    cg = ComputationGraph(cost)

    weights = VariableFilter(roles=[WEIGHT])(cg.parameters)
    logger.info('Model weights are: %s', weights)
    if 'L2' in reg_type:
        cost += reg_weight * l2_norm(weights)
        logger.info('applying %s with weight: %f ', reg_type, reg_weight)

    # Dropout is disabled by the hard-coded non-positive ratio below.
    # NOTE(review): the extent of this branch was reconstructed from
    # whitespace-collapsed source; confirm that re-reading the cost from the
    # graph belongs inside it.
    dropout = -0.1
    if dropout > 0:
        cg = apply_dropout(cg, weights, dropout)
        cost = cg.outputs[0]

    cost.name = 'cost'

    logger.info('Our Algorithm is : %s, and learning_rate: %f',
                steprule, learning_rate)
    if 'adagrad' in steprule:
        cnf_step_rule = AdaGrad(learning_rate)
    elif 'adadelta' in steprule:
        cnf_step_rule = AdaDelta(decay_rate=0.95)
    elif 'decay' in steprule:
        cnf_step_rule = CompositeRule([
            RMSProp(learning_rate=learning_rate, decay_rate=0.90),
            StepClipping(1),
        ])
    elif 'momentum' in steprule:
        cnf_step_rule = Momentum(learning_rate=learning_rate, momentum=0.9)
    elif 'adam' in steprule:
        cnf_step_rule = Adam(learning_rate=learning_rate)
    else:
        # Fail here with a clear message rather than with a NameError on
        # the undefined step rule a few lines further down.
        logger.info('The steprule param is wrong! which is: %s', steprule)
        raise ValueError('unknown steprule: {!r}'.format(steprule))

    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=cnf_step_rule,
                                on_unused_sources='warn')
    #algorithm.add_updates(updates)
    gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
    step_norm = aggregation.mean(algorithm.total_step_norm)
    monitored_vars = [cost, gradient_norm, step_norm, p_at_1, KLD,
                      logpy_xz, kl_weight, pat1_recog]
    train_monitor = TrainingDataMonitoring(variables=monitored_vars,
                                           after_batch=True,
                                           before_first_epoch=True,
                                           prefix='tra')

    dev_monitor = DataStreamMonitoring(
        variables=[cost, p_at_1, KLD, logpy_xz, pat1_recog,
                   misclassify_rate],
        after_epoch=True,
        before_first_epoch=True,
        data_stream=dev_stream,
        prefix="dev")

    extensions = [
        dev_monitor,
        train_monitor,
        Timing(),
        TrackTheBest('dev_cost'),
        FinishIfNoImprovementAfter('dev_cost_best_so_far',
                                   epochs=wait_epochs),
        Printing(after_batch=False),  # , ProgressBar()
        FinishAfter(after_n_epochs=n_epochs),
        saveload.Load(networkfile + '.toload.pkl'),
    ] + track_best('dev_cost', networkfile + '.best.pkl')

    #extensions.append(SharedVariableModifier(kl_weight,
    #    lambda n, klw: numpy.cast[theano.config.floatX] (klw_inc_rate + klw), after_epoch=False, every_n_epochs=klw_ep, after_batch=False))
    # extensions.append(SharedVariableModifier(entropy_weight,
    #     lambda n, crw: numpy.cast[theano.config.floatX](crw - klw_inc_rate), after_epoch=False, every_n_epochs=klw_ep, after_batch=False))

    logger.info('number of parameters in the model: %d',
                tensor.sum([p.size for p in cg.parameters]).eval())
    logger.info('Lookup table sizes: %s',
                [p.size.eval() for p in cg.parameters if 'lt' in p.name])

    main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm,
                         model=Model(cost), extensions=extensions)
    main_loop.run()
def train(config, save_path, bokeh_name,
          params, bokeh_server,
          test_tag, use_load_ext,
          load_log, fast_start, validation_epochs, validation_batches,
          per_epochs, per_batches):
    """Build the speech recognizer, its training algorithm and run training.

    Parameters
    ----------
    config : dict
        Configuration; the keys ``data``, ``net``, ``initialization``,
        ``regularization`` and ``training`` are read.
    save_path : str
        Checkpoint path.  ``_best`` and ``_best_ll`` variants are derived
        from its root and extension.
    bokeh_name : str or None
        Plot name; the base name of ``save_path`` is used when falsy.
    params : str or None
        Path of previously saved parameters to load, if any.
    bokeh_server : str
        Passed to ``Plot`` as ``server_url``.
    test_tag : bool
        When true, test values taken from the first training batch are
        attached to the recognizer inputs.
    use_load_ext : bool
        When true (and ``params`` is given) add a ``Load`` extension.
    load_log : bool
        When true (and ``params`` is given) add a ``LoadLog`` extension.
    fast_start : bool
        When true, skip validation and checkpointing before the first epoch.
    validation_epochs, validation_batches
        Frequency of likelihood validation.
    per_epochs, per_batches
        Frequency of phoneme error rate validation.
    """
    root_path, extension = os.path.splitext(save_path)

    data = Data(**config['data'])

    # Build the main brick and initialize all parameters.
    recognizer = SpeechRecognizer(
        data.recordings_source, data.labels_source,
        data.eos_label,
        data.num_features, data.num_labels,
        name="recognizer",
        data_prepend_eos=data.prepend_eos,
        character_map=data.character_map,
        **config["net"])
    # Deepest brick paths first.  A plain single-argument lambda replaces
    # the Python 2 only tuple-parameter form.
    for brick_path, attribute_dict in sorted(
            config['initialization'].items(),
            key=lambda item: -item[0].count('/')):
        for attribute, value in attribute_dict.items():
            brick, = Selector(recognizer).select(brick_path).bricks
            setattr(brick, attribute, value)
            brick.push_initialization_config()
    recognizer.initialize()

    # Separate attention_params to be handled differently
    # when regularization is applied
    attention = recognizer.generator.transition.attention
    attention_params = Selector(attention).get_parameters().values()

    logger.info(
        "Initialization schemes for all bricks.\n"
        "Works well only in my branch with __repr__ added to all them,\n"
        "there is an issue #463 in Blocks to do that properly.")

    def show_init_scheme(cur):
        """Collect the ``*_init`` attributes of ``cur`` and its children."""
        result = dict()
        for attr in dir(cur):
            if attr.endswith('_init'):
                result[attr] = getattr(cur, attr)
        for child in cur.children:
            result[child.name] = show_init_scheme(child)
        return result
    logger.info(pprint.pformat(show_init_scheme(recognizer)))

    if params:
        logger.info("Load parameters from " + params)
        recognizer.load_params(params)

    if test_tag:
        tensor.TensorVariable.__str__ = tensor.TensorVariable.__repr__
        sample_stream = data.get_stream("train")
        sample_batch = next(sample_stream.get_epoch_iterator(as_dict=True))
        recognizer.recordings.tag.test_value = (
            sample_batch[data.recordings_source])
        recognizer.recordings_mask.tag.test_value = (
            sample_batch[data.recordings_source + '_mask'])
        recognizer.labels.tag.test_value = sample_batch[data.labels_source]
        recognizer.labels_mask.tag.test_value = (
            sample_batch[data.labels_source + '_mask'])
        theano.config.compute_test_value = 'warn'

    batch_cost = recognizer.get_cost_graph().sum()
    batch_size = named_copy(recognizer.recordings.shape[1], "batch_size")
    # Assumes constant batch size. `aggregation.mean` is not used because
    # of Blocks #514.
    cost = batch_cost / batch_size
    cost.name = "sequence_log_likelihood"
    logger.info("Cost graph is built")

    # Fetch variables useful for debugging.
    # It is important not to use any aggregation schemes here,
    # as it's currently impossible to spread the effect of
    # regularization on their variables, see Blocks #514.
    cost_cg = ComputationGraph(cost)
    r = recognizer
    energies, = VariableFilter(
        applications=[r.generator.readout.readout], name="output_0")(
            cost_cg)
    bottom_output, = VariableFilter(
        applications=[r.bottom.apply], name="output")(
            cost_cg)
    attended, = VariableFilter(
        applications=[r.generator.transition.apply], name="attended")(
            cost_cg)
    attended_mask, = VariableFilter(
        applications=[r.generator.transition.apply], name="attended_mask")(
            cost_cg)
    weights, = VariableFilter(
        applications=[r.generator.evaluate], name="weights")(
            cost_cg)
    max_recording_length = named_copy(r.recordings.shape[0],
                                      "max_recording_length")
    # To exclude subsampling related bugs
    max_attended_mask_length = named_copy(attended_mask.shape[0],
                                          "max_attended_mask_length")
    max_attended_length = named_copy(attended.shape[0],
                                     "max_attended_length")
    max_num_phonemes = named_copy(r.labels.shape[0],
                                  "max_num_phonemes")
    min_energy = named_copy(energies.min(), "min_energy")
    max_energy = named_copy(energies.max(), "max_energy")
    mean_attended = named_copy(abs(attended).mean(),
                               "mean_attended")
    mean_bottom_output = named_copy(abs(bottom_output).mean(),
                                    "mean_bottom_output")
    weights_penalty = named_copy(monotonicity_penalty(weights, r.labels_mask),
                                 "weights_penalty")
    weights_entropy = named_copy(entropy(weights, r.labels_mask),
                                 "weights_entropy")
    mask_density = named_copy(r.labels_mask.mean(),
                              "mask_density")
    cg = ComputationGraph([
        cost, weights_penalty, weights_entropy,
        min_energy, max_energy,
        mean_attended, mean_bottom_output,
        batch_size, max_num_phonemes,
        mask_density])

    # Regularization. It is applied explicitly to all variables
    # of interest, it could not be applied to the cost only as it
    # would not have effect on auxiliary variables, see Blocks #514.
    reg_config = config['regularization']
    regularized_cg = cg
    if reg_config.get('dropout'):
        logger.info('apply dropout')
        regularized_cg = apply_dropout(cg, [bottom_output], 0.5)
    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [p for p in cg.parameters
                          if p not in attention_params]
        # Noise is applied on top of the (possibly dropout-regularized)
        # graph.  Starting again from `cg` discarded dropout whenever both
        # regularizers were enabled.
        regularized_cg = apply_noise(regularized_cg, noise_subjects,
                                     reg_config['noise'])
    regularized_cost = regularized_cg.outputs[0]
    regularized_weights_penalty = regularized_cg.outputs[1]

    # The model extracts all the parameters from the computation graph and
    # gives them hierarchical names.  This helps to notice when, because of
    # some bug, a parameter is not in the computation graph.
    model = SpeechModel(regularized_cost)
    # Kept under its own name: `params` is the path argument and is still
    # needed by the Load / LoadLog extensions below.
    model_params = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, model_params[key].get_value().shape)
                     for key in sorted(model_params.keys())],
                    width=120))

    # Define the training algorithm.
    train_conf = config['training']
    clipping = StepClipping(train_conf['gradient_threshold'])
    clipping.threshold.name = "gradient_norm_threshold"
    rule_names = train_conf.get('rules', ['momentum'])
    core_rules = []
    if 'momentum' in rule_names:
        logger.info("Using scaling and momentum for training")
        core_rules.append(
            Momentum(train_conf['scale'], train_conf['momentum']))
    if 'adadelta' in rule_names:
        logger.info("Using AdaDelta for training")
        core_rules.append(
            AdaDelta(train_conf['decay_rate'], train_conf['epsilon']))
    max_norm_rules = []
    if reg_config.get('max_norm', False):
        logger.info("Apply MaxNorm")
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters)
        if reg_config.get('max_norm_exclude_lookup', False):
            maxnorm_subjects = [v for v in maxnorm_subjects
                                if not isinstance(get_brick(v), LookupTable)]
        logger.info("Parameters covered by MaxNorm:\n"
                    + pprint.pformat([name for name, p
                                      in model_params.items()
                                      if p in maxnorm_subjects]))
        logger.info("Parameters NOT covered by MaxNorm:\n"
                    + pprint.pformat([name for name, p
                                      in model_params.items()
                                      if p not in maxnorm_subjects]))
        max_norm_rules = [
            Restrict(VariableClipping(reg_config['max_norm'], axis=0),
                     maxnorm_subjects)]
    algorithm = GradientDescent(
        cost=regularized_cost +
        reg_config.get("penalty_coof", .0) *
        regularized_weights_penalty / batch_size +
        reg_config.get("decay", .0) *
        l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2,
        parameters=model_params.values(),
        step_rule=CompositeRule(
            [clipping] + core_rules + max_norm_rules +
            # Parameters are not changed at all
            # when nans are encountered.
            [RemoveNotFinite(0.0)]))

    # More variables for debugging: some of them can be added only
    # after the `algorithm` object is created.
    # Work on a copy so the graph's own `outputs` list is not extended.
    observables = list(regularized_cg.outputs)
    observables += [
        algorithm.total_step_norm, algorithm.total_gradient_norm,
        clipping.threshold]
    for name, param in model_params.items():
        num_elements = numpy.prod(param.get_value().shape)
        norm = param.norm(2) / num_elements ** 0.5
        grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5
        step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5
        stats = tensor.stack(norm, grad_norm, step_norm,
                             step_norm / grad_norm)
        stats.name = name + '_stats'
        observables.append(stats)

    def attach_aggregation_schemes(variables):
        """Wrap the penalty and entropy variables in mean aggregations."""
        # Aggregation specification has to be factored out as a separate
        # function as it has to be applied at the very last stage
        # separately to training and validation observables.
        result = []
        for var in variables:
            if var.name == 'weights_penalty':
                result.append(named_copy(aggregation.mean(var, batch_size),
                                         'weights_penalty_per_recording'))
            elif var.name == 'weights_entropy':
                result.append(named_copy(aggregation.mean(
                    var, recognizer.labels_mask.sum()),
                    'weights_entropy_per_label'))
            else:
                result.append(var)
        return result

    # Build main loop.
    logger.info("Initialize extensions")
    extensions = []
    if use_load_ext and params:
        extensions.append(
            Load(params, load_iteration_state=True, load_log=True))
    if load_log and params:
        extensions.append(LoadLog(params))
    extensions += [
        Timing(after_batch=True),
        CGStatistics(),
        #CodeVersion(['lvsr']),
    ]
    extensions.append(TrainingDataMonitoring(
        [observables[0], algorithm.total_gradient_norm,
         algorithm.total_step_norm, clipping.threshold,
         max_recording_length,
         max_attended_length, max_attended_mask_length],
        after_batch=True))
    average_monitoring = TrainingDataMonitoring(
        attach_aggregation_schemes(observables),
        prefix="average", every_n_batches=10)
    extensions.append(average_monitoring)
    validation = DataStreamMonitoring(
        attach_aggregation_schemes([cost, weights_entropy, weights_penalty]),
        data.get_stream("valid"), prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=validation_epochs,
            every_n_batches=validation_batches,
            after_training=False)
    extensions.append(validation)
    recognizer.init_beam_search(10)
    per = PhonemeErrorRate(recognizer, data.get_dataset("valid"))
    per_monitoring = DataStreamMonitoring(
        [per], data.get_stream("valid", batches=False, shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=per_epochs,
            every_n_batches=per_batches,
            after_training=False)
    extensions.append(per_monitoring)
    track_the_best_per = TrackTheBest(
        per_monitoring.record_name(per)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    track_the_best_likelihood = TrackTheBest(
        validation.record_name(cost)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    extensions += [track_the_best_likelihood, track_the_best_per]
    extensions.append(AdaptiveClipping(
        algorithm.total_gradient_norm.name,
        clipping, train_conf['gradient_threshold'],
        decay_rate=0.998, burnin_period=500))
    extensions += [
        SwitchOffLengthFilter(
            data.length_filter,
            after_n_batches=train_conf.get('stop_filtering')),
        FinishAfter(after_n_batches=train_conf['num_batches'],
                    after_n_epochs=train_conf['num_epochs'])
        .add_condition(["after_batch"], _gradient_norm_is_none),
        # Live plotting: requires launching `bokeh-server`
        # and allows to see what happens online.
        Plot(bokeh_name
             if bokeh_name
             else os.path.basename(save_path),
             [# Plot 1: training and validation costs
              [average_monitoring.record_name(regularized_cost),
               validation.record_name(cost)],
              # Plot 2: gradient norm,
              [average_monitoring.record_name(algorithm.total_gradient_norm),
               average_monitoring.record_name(clipping.threshold)],
              # Plot 3: phoneme error rate
              [per_monitoring.record_name(per)],
              # Plot 4: training and validation mean weight entropy
              [average_monitoring._record_name('weights_entropy_per_label'),
               validation._record_name('weights_entropy_per_label')],
              # Plot 5: training and validation monotonicity penalty
              [average_monitoring._record_name(
                  'weights_penalty_per_recording'),
               validation._record_name('weights_penalty_per_recording')]],
             every_n_batches=10,
             server_url=bokeh_server),
        Checkpoint(save_path,
                   before_first_epoch=not fast_start, after_epoch=True,
                   every_n_batches=train_conf.get('save_every_n_batches'),
                   save_separately=["model", "log"],
                   use_cpickle=True)
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_per.notification_name),
            (root_path + "_best" + extension,))
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_likelihood.notification_name),
            (root_path + "_best_ll" + extension,)),
        ProgressBar(),
        Printing(every_n_batches=1,
                 attribute_filter=PrintingFilterList()
                 )]

    # Save the config into the status
    log = TrainingLog()
    log.status['_config'] = repr(config)
    main_loop = MainLoop(
        model=model, log=log, algorithm=algorithm,
        data_stream=data.get_stream("train"),
        extensions=extensions)
    main_loop.run()
def __init__(self, langevin_itr, eta=0.1, alpha=0.75, gamma0=1e-4,
             hard_limit_on_scopping=True, scoping=1.001, epsilon=1e-4,
             cost=None, parameters=None, step_rule=None, gradients=None,
             known_grads=None, consider_constant=None, **kwargs):
    """Set up the gradients, steps and updates of the Entropy-SGD algorithm.

    Parameters
    ----------
    langevin_itr
        Stored on the instance as ``langevin_itr``.
    eta, alpha, scoping, epsilon : float
        Stored on the instance as ``numpy.float32`` values.
    gamma0 : float
        Initial value of the shared variable ``gamma`` ('ESGD_gamma').
    hard_limit_on_scopping : bool
        Stored on the instance unchanged.
    cost
        Cost to differentiate when ``gradients`` is not given.
    parameters : list, optional
        Parameters to update.
    step_rule, optional
        Step rule; defaults to ``Scale()``.  Its ``learning_rate``
        attribute is read to scale the injected noise.
    gradients : mapping or list of pairs, optional
        Pre-computed gradients keyed by parameter.
    known_grads, consider_constant
        Forwarded to the gradient computation; rejected when ``gradients``
        is given.
    **kwargs
        Forwarded to the parent constructor; the ``updates`` entry is
        extended with the parameter and step rule updates.

    Raises
    ------
    ValueError
        If the parameter order cannot be determined deterministically, or
        if ``known_grads`` / ``consider_constant`` are combined with
        explicit ``gradients``.
    """
    self.eta = np.float32(eta)
    self.alpha = np.float32(alpha)
    self.gamma = theano.shared(np.float32(gamma0), name='ESGD_gamma')
    self.scoping = np.float32(scoping)
    self.hard_limit_on_scopping = hard_limit_on_scopping
    self.epsilon = np.float32(epsilon)
    self.langevin_itr = langevin_itr
    self.langevin_step = 1

    # Set initial values for cost, parameters, gradients.
    self.cost = cost
    self.parameters = parameters
    # Coerce lists of tuples to OrderedDict. Do not coerce Mappings,
    # as we don't want to convert dict -> OrderedDict and give it
    # an arbitrary, non-deterministic order.
    if gradients is not None and not isinstance(gradients, Mapping):
        gradients = OrderedDict(gradients)
    self.gradients = gradients

    # If we don't have gradients, we'll need to infer them from the
    # cost and the parameters, both of which must not be None.
    if not self.gradients:
        self.gradients = self._compute_gradients(known_grads,
                                                 consider_constant)
    else:
        if cost is not None:
            logger.warning(('{}: gradients already specified directly; '
                            'cost is unused.'
                            .format(self.__class__.__name__)))
        if self.parameters is None and isinstance(gradients, OrderedDict):
            # If the dictionary is ordered, it's safe to use the keys
            # as they have a deterministic order.
            self.parameters = list(self.gradients.keys())
        elif self.parameters is not None:
            # If parameters and gradients.keys() don't match we can
            # try to recover if gradients is ordered.
            if set(self.parameters) != set(self.gradients.keys()):
                # `Logger.warn` is a deprecated alias of `warning`.
                logger.warning("Specified parameters list does not match "
                               "keys in provided gradient dictionary; "
                               "using parameters inferred from gradients")
                if not isinstance(self.gradients, OrderedDict):
                    raise ValueError(determinism_error)
                self.parameters = list(self.gradients.keys())
        else:
            # self.parameters is None, and gradients isn't
            # an OrderedDict. We can't do anything safe.
            raise ValueError(determinism_error)
        if known_grads:
            raise ValueError("known_grads has no effect when gradients "
                             "are passed in")
        if consider_constant is not None:
            raise ValueError("consider_constant has no effect when "
                             "gradients are passed in")

    # ------------ESGD interception! -----------------
    # Create two fresh lists of shared variables holding copies of the
    # current parameter values; the original comments identify them with
    # x_prime and mu in the paper.  Each copy must be a distinct object.
    self.true_parameters = [
        theano.shared(param.get_value(), name=param.name)
        for param in self.parameters]
    self.mu_parameters = [
        theano.shared(param.get_value(), name=param.name)
        for param in self.parameters]

    # Gradients now contain the ESGD step (line 4 of algorithm 1 in the
    # paper, according to the original comment).
    new_gradients = OrderedDict()
    for true_param, param in zip(self.true_parameters, self.parameters):
        new_gradients[param] = (
            self.gradients[param] - self.gamma * (true_param - param))
    self.gradients = new_gradients

    # The order in which the different gradient terms appears
    # here matters, as floating point addition is non-associative (and
    # Theano's graph optimizations are not order-independent).
    # This is why we do not use .values().
    gradient_values = [self.gradients[p] for p in self.parameters]
    self.total_gradient_norm = (l2_norm(gradient_values)
                                .copy(name="total_gradient_norm"))

    self.step_rule = step_rule if step_rule else Scale()
    logger.debug("Computing parameter steps...")
    self.steps, self.step_rule_updates = (
        self.step_rule.compute_steps(self.gradients))

    # Same as gradient_values above: the order may influence a
    # bunch of things, so enforce a consistent one (don't use
    # .values()).
    step_values = [self.steps[p] for p in self.parameters]
    self.total_step_norm = (l2_norm(step_values)
                            .copy(name="total_step_norm"))

    # Once again, iterating on gradients may not be deterministically
    # ordered if it is not an OrderedDict. We add the updates here in
    # the order specified in self.parameters. Keep it this way to
    # maintain reproducibility.

    # ---- Another ESGD interception here! -----------------
    randrg = theano.tensor.shared_randomstreams.RandomStreams(seed=1234)
    # NOTE(review): no `size` is passed to `normal`, so this looks like one
    # noise draw shared by every parameter rather than per-element noise;
    # confirm against the intended update.
    eps = self.epsilon * randrg.normal(dtype=theano.config.floatX)
    # The step rule must expose `learning_rate`; otherwise this raises
    # AttributeError, exactly as before.
    eta_prime = self.step_rule.learning_rate
    slgd_eta_update = theano.tensor.sqrt(eta_prime) * eps

    kwargs.setdefault('updates', []).extend(
        itertools.chain(
            ((parameter,
              parameter - self.steps[parameter] + slgd_eta_update)
             for parameter in self.parameters),
            self.step_rule_updates))

    # Exponential moving average of the parameters, stored for later use
    # rather than added to the updates above.
    self.mu_updates = [
        (mu, np.float32(1. - self.alpha) * mu + self.alpha * x_prime)
        for mu, x_prime in zip(self.mu_parameters, self.parameters)]

    super(EntropySGD, self).__init__(**kwargs)
def initialize_all(config, save_path, bokeh_name, params, bokeh_server, bokeh,
                   test_tag, use_load_ext, load_log, fast_start):
    """Build the model, the training algorithm and the main loop extensions.

    Parameters
    ----------
    config : dict-like
        Experiment configuration. The sections 'data', 'training',
        'monitoring' and 'net' are read; 'regularization' is optional.
    save_path : str
        Checkpoint path. The best-PER and best-cost checkpoints are saved
        next to it with "_best" / "_best_ll" inserted before the extension.
    bokeh_name : str or None
        Name of the plot; the basename of `save_path` is used when falsy.
    params : str or None
        Path to previously saved parameters to load into the model.
    bokeh_server : str
        Passed to `Plot` as `server_url`.
    bokeh : bool
        Whether to add the `Plot` extension.
    test_tag
        Forwarded to `create_model`.
    use_load_ext : bool
        If true (and `params` is given) a `Load` extension is added.
    load_log : bool
        If true (and `params` is given) a `LoadLog` extension is added.
    fast_start : bool
        If true, validation and checkpointing before the first epoch
        are skipped.

    Returns
    -------
    tuple
        `(model, algorithm, data, extensions)`.

    """
    root_path, extension = os.path.splitext(save_path)

    data = Data(**config['data'])
    train_conf = config['training']
    recognizer = create_model(config, data, test_tag)

    # Separate attention_params to be handled differently
    # when regularization is applied
    attention = recognizer.generator.transition.attention
    attention_params = Selector(attention).get_parameters().values()

    logger.info(
        "Initialization schemes for all bricks.\n"
        "Works well only in my branch with __repr__ added to all them,\n"
        "there is an issue #463 in Blocks to do that properly.")

    def show_init_scheme(cur):
        # Recursively collect every `*_init` attribute of a brick tree.
        result = dict()
        for attr in dir(cur):
            if attr.endswith('_init'):
                result[attr] = getattr(cur, attr)
        for child in cur.children:
            result[child.name] = show_init_scheme(child)
        return result

    logger.info(pprint.pformat(show_init_scheme(recognizer)))

    prediction, prediction_mask = add_exploration(recognizer, data, train_conf)

    #
    # Observables:
    #
    primary_observables = []  # monitored each batch
    secondary_observables = []  # monitored every 10 batches
    validation_observables = []  # monitored on the validation set

    cg = recognizer.get_cost_graph(batch=True,
                                   prediction=prediction,
                                   prediction_mask=prediction_mask)
    labels, = VariableFilter(applications=[recognizer.cost],
                             name='labels')(cg)
    labels_mask, = VariableFilter(applications=[recognizer.cost],
                                  name='labels_mask')(cg)

    gain_matrix = VariableFilter(
        theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg)
    if len(gain_matrix):
        gain_matrix, = gain_matrix
        primary_observables.append(rename(gain_matrix.min(), 'min_gain'))
        primary_observables.append(rename(gain_matrix.max(), 'max_gain'))

    batch_cost = cg.outputs[0].sum()
    batch_size = rename(recognizer.labels.shape[1], "batch_size")
    # Assumes constant batch size. `aggregation.mean` is not used because
    # of Blocks #514.
    cost = batch_cost / batch_size
    cost.name = "sequence_total_cost"
    logger.info("Cost graph is built")

    # Fetch variables useful for debugging.
    # It is important not to use any aggregation schemes here,
    # as it's currently impossible to spread the effect of
    # regularization on their variables, see Blocks #514.
    cost_cg = ComputationGraph(cost)
    r = recognizer
    energies, = VariableFilter(applications=[r.generator.readout.readout],
                               name="output_0")(cost_cg)
    # We need name_regex instead of name because LookupTable calls its
    # output output_0
    bottom_output = VariableFilter(applications=[r.bottom.apply],
                                   name_regex="output")(cost_cg)[-1]
    attended, = VariableFilter(applications=[r.generator.transition.apply],
                               name="attended")(cost_cg)
    attended_mask, = VariableFilter(
        applications=[r.generator.transition.apply],
        name="attended_mask")(cost_cg)
    weights, = VariableFilter(applications=[r.generator.evaluate],
                              name="weights")(cost_cg)

    from blocks.roles import AUXILIARY
    l2_cost, = VariableFilter(roles=[AUXILIARY],
                              theano_name='l2_cost_aux')(cost_cg)
    cost_forward, = VariableFilter(roles=[AUXILIARY],
                                   theano_name='costs_forward_aux')(cost_cg)

    max_recording_length = rename(bottom_output.shape[0],
                                  "max_recording_length")
    # To exclude subsampling related bugs
    max_attended_mask_length = rename(attended_mask.shape[0],
                                      "max_attended_mask_length")
    max_attended_length = rename(attended.shape[0], "max_attended_length")
    max_num_phonemes = rename(labels.shape[0], "max_num_phonemes")
    min_energy = rename(energies.min(), "min_energy")
    max_energy = rename(energies.max(), "max_energy")
    mean_attended = rename(abs(attended).mean(), "mean_attended")
    mean_bottom_output = rename(abs(bottom_output).mean(),
                                "mean_bottom_output")
    weights_penalty = rename(monotonicity_penalty(weights, labels_mask),
                             "weights_penalty")
    weights_entropy = rename(entropy(weights, labels_mask), "weights_entropy")
    mask_density = rename(labels_mask.mean(), "mask_density")
    cg = ComputationGraph([
        cost, weights_penalty, weights_entropy, min_energy, max_energy,
        mean_attended, mean_bottom_output, batch_size, max_num_phonemes,
        mask_density
    ])

    # Regularization. It is applied explicitly to all variables
    # of interest, it could not be applied to the cost only as it
    # would not have effect on auxiliary variables, see Blocks #514.
    reg_config = config.get('regularization', dict())
    regularized_cg = cg
    if reg_config.get('dropout'):
        logger.info('apply dropout')
        regularized_cg = apply_dropout(cg, [bottom_output], 0.5)
    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [
            p for p in cg.parameters if p not in attention_params
        ]
        # Chain on `regularized_cg` (not `cg`) so that enabling noise does
        # not silently throw away the dropout applied just above.
        regularized_cg = apply_noise(regularized_cg, noise_subjects,
                                     reg_config['noise'])

    train_cost = regularized_cg.outputs[0]
    if reg_config.get("penalty_coof", .0) > 0:
        # big warning!!!
        # here we assume that:
        # regularized_weights_penalty = regularized_cg.outputs[1]
        train_cost = (train_cost + reg_config.get("penalty_coof", .0) *
                      regularized_cg.outputs[1] / batch_size)
    if reg_config.get("decay", .0) > 0:
        train_cost = (
            train_cost + reg_config.get("decay", .0) *
            l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2)

    train_cost = rename(train_cost, 'train_cost')

    gradients = None
    if reg_config.get('adaptive_noise'):
        logger.info('apply adaptive noise')
        if ((reg_config.get("penalty_coof", .0) > 0) or
                (reg_config.get("decay", .0) > 0)):
            logger.error('using adaptive noise with alignment weight panalty '
                         'or weight decay is probably stupid')
        (train_cost, regularized_cg,
         gradients, noise_brick) = apply_adaptive_noise(
            cg,
            cg.outputs[0],
            variables=cg.parameters,
            num_examples=data.get_dataset('train').num_examples,
            parameters=Model(
                regularized_cg.outputs[0]).get_parameter_dict().values(),
            **reg_config.get('adaptive_noise'))
        train_cost.name = 'train_cost'
        adapt_noise_cg = ComputationGraph(train_cost)
        model_prior_mean = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_mean')(adapt_noise_cg)[0],
            'model_prior_mean')
        model_cost = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_cost')(adapt_noise_cg)[0],
            'model_cost')
        model_prior_variance = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_variance')(adapt_noise_cg)[0],
            'model_prior_variance')
        regularized_cg = ComputationGraph(
            [train_cost, model_cost] + regularized_cg.outputs +
            [model_prior_mean, model_prior_variance])
        primary_observables += [
            regularized_cg.outputs[1],  # model cost
            regularized_cg.outputs[2],  # task cost
            regularized_cg.outputs[-2],  # model prior mean
            regularized_cg.outputs[-1],  # model prior variance
        ]

    model = Model(train_cost)
    if params:
        logger.info("Load parameters from " + params)
        # please note: we cannot use recognizer.load_params
        # as it builds a new computation graph that does not have
        # shared variables added by adaptive weight noise
        #
        # The checkpoint is a binary archive, so open it in binary mode.
        with open(params, 'rb') as src:
            param_values = load_parameters(src)
        model.set_parameter_values(param_values)

    parameters = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat([(key, parameters[key].get_value().shape)
                                for key in sorted(parameters.keys())],
                               width=120))

    # Define the training algorithm.
    clipping = StepClipping(train_conf['gradient_threshold'])
    clipping.threshold.name = "gradient_norm_threshold"
    rule_names = train_conf.get('rules', ['momentum'])
    core_rules = []
    if 'momentum' in rule_names:
        logger.info("Using scaling and momentum for training")
        core_rules.append(Momentum(train_conf['scale'],
                                   train_conf['momentum']))
    if 'adadelta' in rule_names:
        logger.info("Using AdaDelta for training")
        core_rules.append(
            AdaDelta(train_conf['decay_rate'], train_conf['epsilon']))
    max_norm_rules = []
    if reg_config.get('max_norm', False) > 0:
        logger.info("Apply MaxNorm")
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters)
        if reg_config.get('max_norm_exclude_lookup', False):
            maxnorm_subjects = [
                v for v in maxnorm_subjects
                if not isinstance(get_brick(v), LookupTable)
            ]
        logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat(
            [name for name, p in parameters.items()
             if p in maxnorm_subjects]))
        logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat(
            [name for name, p in parameters.items()
             if p not in maxnorm_subjects]))
        max_norm_rules = [
            Restrict(VariableClipping(reg_config['max_norm'], axis=0),
                     maxnorm_subjects)
        ]
    burn_in = []
    if train_conf.get('burn_in_steps', 0):
        burn_in.append(BurnIn(num_steps=train_conf['burn_in_steps']))
    algorithm = GradientDescent(
        cost=train_cost,
        parameters=parameters.values(),
        gradients=gradients,
        step_rule=CompositeRule(
            [clipping] + core_rules + max_norm_rules +
            # Parameters are not changed at all
            # when nans are encountered.
            [RemoveNotFinite(0.0)] + burn_in),
        on_unused_sources='warn')

    logger.debug("Scan Ops in the gradients")
    gradient_cg = ComputationGraph(algorithm.gradients.values())
    for op in ComputationGraph(gradient_cg).scans:
        logger.debug(op)

    # More variables for debugging: some of them can be added only
    # after the `algorithm` object is created.
    secondary_observables += list(regularized_cg.outputs)
    if 'train_cost' not in [v.name for v in secondary_observables]:
        secondary_observables += [train_cost]
    secondary_observables += [
        algorithm.total_step_norm, algorithm.total_gradient_norm,
        clipping.threshold
    ]
    for name, param in parameters.items():
        # `numpy.prod` replaces the deprecated `numpy.product` alias.
        num_elements = numpy.prod(param.get_value().shape)
        norm = param.norm(2) / num_elements ** 0.5
        grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5
        step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5
        stats = tensor.stack(norm, grad_norm, step_norm,
                             step_norm / grad_norm)
        stats.name = name + '_stats'
        secondary_observables.append(stats)

    primary_observables += [
        train_cost, algorithm.total_gradient_norm, algorithm.total_step_norm,
        clipping.threshold, max_recording_length, max_attended_length,
        max_attended_mask_length
    ]

    validation_observables += [
        rename(aggregation.mean(batch_cost, batch_size), cost.name),
        rename(aggregation.sum_(batch_size), 'num_utterances'),
        weights_entropy, weights_penalty
    ]

    def attach_aggregation_schemes(variables):
        # Aggregation specification has to be factored out as a separate
        # function as it has to be applied at the very last stage
        # separately to training and validation observables.
        result = []
        for var in variables:
            if var.name == 'weights_penalty':
                result.append(
                    rename(aggregation.mean(var, batch_size),
                           'weights_penalty_per_recording'))
            elif var.name == 'weights_entropy':
                result.append(
                    rename(aggregation.mean(var, labels_mask.sum()),
                           'weights_entropy_per_label'))
            else:
                result.append(var)
        return result

    mon_conf = config['monitoring']

    # Build main loop.
    # NOTE: the order in which extensions are appended is preserved from
    # the original code.
    logger.info("Initialize extensions")
    extensions = []
    if use_load_ext and params:
        extensions.append(
            Load(params, load_iteration_state=True, load_log=True))
    if load_log and params:
        extensions.append(LoadLog(params))
    extensions += [
        Timing(after_batch=True),
        CGStatistics(),
        # CodeVersion(['lvsr']),
    ]
    extensions.append(
        TrainingDataMonitoring(primary_observables + [l2_cost, cost_forward],
                               after_batch=True))
    average_monitoring = TrainingDataMonitoring(
        attach_aggregation_schemes(secondary_observables),
        prefix="average",
        every_n_batches=10)
    extensions.append(average_monitoring)
    validation = DataStreamMonitoring(
        attach_aggregation_schemes(
            validation_observables + [l2_cost, cost_forward]),
        data.get_stream("valid", shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['validate_every_epochs'],
            every_n_batches=mon_conf['validate_every_batches'],
            after_training=False)
    extensions.append(validation)
    per = PhonemeErrorRate(recognizer, data, **config['monitoring']['search'])
    per_monitoring = DataStreamMonitoring(
        [per],
        data.get_stream("valid", batches=False, shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['search_every_epochs'],
            every_n_batches=mon_conf['search_every_batches'],
            after_training=False)
    extensions.append(per_monitoring)
    track_the_best_per = TrackTheBest(
        per_monitoring.record_name(per)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    track_the_best_cost = TrackTheBest(
        validation.record_name(cost)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    extensions += [track_the_best_cost, track_the_best_per]
    extensions.append(
        AdaptiveClipping(algorithm.total_gradient_norm.name,
                         clipping,
                         train_conf['gradient_threshold'],
                         decay_rate=0.998,
                         burnin_period=500))
    extensions += [
        SwitchOffLengthFilter(
            data.length_filter,
            after_n_batches=train_conf.get('stop_filtering')),
        FinishAfter(after_n_batches=train_conf.get('num_batches'),
                    after_n_epochs=train_conf.get('num_epochs')).add_condition(
                        ["after_batch"], _gradient_norm_is_none),
    ]
    channels = [
        # Plot 1: training and validation costs
        [
            average_monitoring.record_name(train_cost),
            validation.record_name(cost)
        ],
        # Plot 2: gradient norm,
        [
            average_monitoring.record_name(algorithm.total_gradient_norm),
            average_monitoring.record_name(clipping.threshold)
        ],
        # Plot 3: phoneme error rate
        [per_monitoring.record_name(per)],
        # Plot 4: training and validation mean weight entropy
        [
            average_monitoring._record_name('weights_entropy_per_label'),
            validation._record_name('weights_entropy_per_label')
        ],
        # Plot 5: training and validation monotonicity penalty
        [
            average_monitoring._record_name('weights_penalty_per_recording'),
            validation._record_name('weights_penalty_per_recording')
        ]
    ]
    if bokeh:
        extensions += [
            Plot(bokeh_name if bokeh_name else os.path.basename(save_path),
                 channels,
                 every_n_batches=10,
                 server_url=bokeh_server),
        ]
    extensions += [
        Checkpoint(save_path,
                   before_first_epoch=not fast_start,
                   after_epoch=True,
                   every_n_batches=train_conf.get('save_every_n_batches'),
                   save_separately=["model", "log"],
                   use_cpickle=True).add_condition(
                       ['after_epoch'],
                       OnLogRecord(track_the_best_per.notification_name),
                       (root_path + "_best" + extension, )).add_condition(
                           ['after_epoch'],
                           OnLogRecord(track_the_best_cost.notification_name),
                           (root_path + "_best_ll" + extension, )),
        ProgressBar()
    ]
    extensions.append(EmbedIPython(use_main_loop_run_caller_env=True))
    if config['net']['criterion']['name'].startswith('mse'):
        extensions.append(
            LogInputsGains(labels, cg, recognizer.generator.readout.emitter,
                           data))

    if train_conf.get('patience'):
        patience_conf = train_conf['patience']
        if not patience_conf.get('notification_names'):
            # setdefault will not work for empty list
            patience_conf['notification_names'] = [
                track_the_best_per.notification_name,
                track_the_best_cost.notification_name
            ]
        extensions.append(Patience(**patience_conf))

    extensions.append(
        Printing(every_n_batches=1, attribute_filter=PrintingFilterList()))

    return model, algorithm, data, extensions
def initialize_graph(recognizer, data, config, params):
    """Build the cost graph, its regularized version and the model.

    Parameters
    ----------
    recognizer
        Top-level brick; its `children` are iterated and each child's
        `cost`, `generator`, `names_postfix` and `train_tags` are read.
    data
        Data object; `get_dataset('train').num_examples` is read when
        adaptive noise is enabled. It is also returned unchanged.
    config : dict-like
        Experiment configuration; the 'regularization' section is required.
    params : str or None
        Path to previously saved parameter values to load into the model.

    Returns
    -------
    dict
        With the keys 'observables', 'max_norm_rules', 'cg',
        'regularized_cg', 'train_cost', 'cost', 'batch_size', 'batch_cost',
        'parameters', 'gradients', 'model', 'data', 'recognizer',
        'weights_entropy', 'labels_mask' and 'labels'.

    """
    # Separate attention_params to be handled differently
    # when regularization is applied.
    #
    # The parameters of all attentions are flattened into a single list:
    # with a list of per-attention lists the membership test used for the
    # noise subjects below could never match a parameter.
    attentions = recognizer.all_children().generator.transition.attention.get()
    attention_params = [
        parameter
        for attention in attentions
        for parameter in Selector(attention).get_parameters().values()]

    logger.info(
        "Initialization schemes for all bricks.\n"
        "Works well only in my branch with __repr__ added to all them,\n"
        "there is an issue #463 in Blocks to do that properly.")

    def show_init_scheme(cur):
        # Recursively collect every `*_init` attribute of a brick tree.
        result = dict()
        for attr in dir(cur):
            if attr.endswith('_init'):
                result[attr] = getattr(cur, attr)
        for child in cur.children:
            result[child.name] = show_init_scheme(child)
        return result

    logger.info(pprint.pformat(show_init_scheme(recognizer)))

    observables = []  # monitored each batch
    cg = recognizer.get_cost_graph(batch=True)
    labels = []
    labels_mask = []
    for chld in recognizer.children:
        lbls = VariableFilter(applications=[chld.cost],
                              name='labels' + chld.names_postfix)(cg)
        lbls_mask = VariableFilter(applications=[chld.cost],
                                   name='labels_mask' + chld.names_postfix)(cg)
        if len(lbls) == 1:
            labels += lbls
            labels_mask += lbls_mask

    batch_cost = cg.outputs[0].sum()
    batch_size = rename(labels[0].shape[1], "batch_size")
    # Assumes constant batch size. `aggregation.mean` is not used because
    # of Blocks #514.
    cost = batch_cost / batch_size
    cost.name = "sequence_total_cost"
    logger.info("Cost graph is built")

    # Fetch variables useful for debugging.
    # It is important not to use any aggregation schemes here,
    # as it's currently impossible to spread the effect of
    # regularization on their variables, see Blocks #514.
    cost_cg = ComputationGraph(cost)

    # We need name_regex instead of name because LookupTable calls its
    # output output_0
    bottom_output = VariableFilter(
        applications=recognizer.all_children().bottom.apply.get(),
        name_regex="output")(cost_cg)

    attended = VariableFilter(
        applications=recognizer.all_children().generator.transition.apply.get(),
        name="attended")(cost_cg)
    attended_mask = VariableFilter(
        applications=recognizer.all_children().generator.transition.apply.get(),
        name="attended_mask")(cost_cg)
    weights = VariableFilter(
        applications=recognizer.all_children().generator.evaluate.get(),
        name="weights")(cost_cg)

    def get_renamed_list(rlist, elem_func, elem_name):
        # Apply `elem_func` to every element and name the result after the
        # recognizer child found at the same position.
        return [rename(elem_func(elem), elem_name + chld.names_postfix)
                for elem, chld in zip(rlist, recognizer.children)]

    max_sentence_lengths = get_renamed_list(
        bottom_output, lambda e: e.shape[0], "max_sentence_length")
    max_attended_mask_lengths = get_renamed_list(
        attended_mask, lambda e: e.shape[0], "max_attended_mask_length")
    max_attended_lengths = get_renamed_list(
        attended, lambda e: e.shape[0], "max_attended_length")
    max_num_characters = get_renamed_list(
        labels, lambda e: e.shape[0], "max_num_characters")

    mean_attended = get_renamed_list(
        attended, lambda e: abs(e).mean(), "mean_attended")
    mean_bottom_output = get_renamed_list(
        bottom_output, lambda e: abs(e).mean(), "mean_bottom_output")

    mask_density = get_renamed_list(
        labels_mask, lambda e: e.mean(), "mask_density")
    weights_entropy = [
        rename(entropy(w, lm), "weights_entropy" + chld.names_postfix)
        for w, lm, chld in zip(weights, labels_mask, recognizer.children)]

    observables += (max_attended_lengths + max_attended_mask_lengths +
                    max_sentence_lengths)
    #
    # Monitoring of cost terms is tricky because of Blocks #514 - since the
    # costs are annotations that are not part of the original output graph,
    # they are unaffected by replacements such as dropout!!
    #
    cost_terms = []
    for chld in recognizer.children:
        chld_cost_terms = VariableFilter(
            applications=[chld.generator.evaluate],
            name_regex='.*_nll')(cost_cg)
        # Insert the child postfix in front of the trailing "_nll".
        chld_cost_terms = [
            rename(var, var.name[:-4] + chld.names_postfix + '_nll')
            for var in chld_cost_terms]
        cost_terms += chld_cost_terms

    cg = ComputationGraph([cost, batch_size] +
                          weights_entropy + mean_attended +
                          mean_bottom_output + max_num_characters +
                          mask_density + cost_terms)

    # Regularization. It is applied explicitly to all variables
    # of interest, it could not be applied to the cost only as it
    # would not have effect on auxiliary variables, see Blocks #514.
    reg_config = config['regularization']
    regularized_cg = cg

    if reg_config.get('dropout'):
        drop_conf = reg_config['dropout']
        bot_drop = drop_conf.get('bottom', 0.0)
        if bot_drop:
            logger.info('apply bottom dropout')
            regularized_cg = apply_dropout(regularized_cg,
                                           bottom_output, bot_drop)
        enc_drop = drop_conf.get('encoder', 0.0)
        if enc_drop:
            logger.info('apply encoder dropout')
            # Flatten the per-encoder children lists into one brick list.
            enc_bricks = [
                brick
                for children in
                recognizer.all_children().encoder.children.get()
                for brick in children]
            enc_states = VariableFilter(bricks=enc_bricks,
                                        name_regex='states')(regularized_cg)
            regularized_cg = apply_dropout(regularized_cg,
                                           enc_states,
                                           enc_drop)
        post_merge_drop = drop_conf.get('post_merge', 0.0)
        if post_merge_drop:
            logger.info('apply post_merge dropout')
            pm_bricks = []
            for chld in recognizer.children:
                cpm_bricks = list(chld.generator.readout.post_merge.children)
                cpm_bricks += cpm_bricks[-1].children
                cpm_bricks = [
                    b for b in cpm_bricks
                    if isinstance(b, type(chld.post_merge_activation))]
                pm_bricks += cpm_bricks
            regularized_cg = apply_dropout(
                regularized_cg,
                VariableFilter(bricks=pm_bricks,
                               name='output')(regularized_cg),
                post_merge_drop)

    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [p for p in cg.parameters
                          if p not in attention_params]
        # Chain on `regularized_cg` (not `cg`) so that enabling noise does
        # not silently throw away the dropout applied just above.
        regularized_cg = apply_noise(regularized_cg, noise_subjects,
                                     reg_config['noise'])

    train_cost = regularized_cg.outputs[0]
    if reg_config.get("penalty_coof", .0) > 0:
        # big warning!!!
        # here we assume that:
        # regularized_weights_penalty = regularized_cg.outputs[1]
        # NOTE(review): the graph built above lists `batch_size` as its
        # second output, not a weights penalty - confirm that this branch
        # is still meaningful for this function.
        train_cost = (train_cost +
                      reg_config.get("penalty_coof", .0) *
                      regularized_cg.outputs[1] / batch_size)
    if reg_config.get("decay", .0) > 0:
        train_cost = (
            train_cost + reg_config.get("decay", .0) *
            l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2)

    train_cost = train_cost.copy(name='train_cost')

    gradients = None
    if reg_config.get('adaptive_noise'):
        logger.info('apply adaptive noise')
        if ((reg_config.get("penalty_coof", .0) > 0) or
                (reg_config.get("decay", .0) > 0)):
            logger.error('using adaptive noise with alignment weight panalty '
                         'or weight decay is probably stupid')
        (train_cost, regularized_cg,
         gradients, noise_brick) = apply_adaptive_noise(
            cg, cg.outputs[0],
            variables=cg.parameters,
            num_examples=data.get_dataset('train').num_examples,
            parameters=SpeechModel(
                regularized_cg.outputs[0]).get_parameter_dict().values(),
            **reg_config.get('adaptive_noise'))
        train_cost.name = 'train_cost'
        adapt_noise_cg = ComputationGraph(train_cost)
        model_prior_mean = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_mean')(adapt_noise_cg)[0],
            'model_prior_mean')
        model_cost = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_cost')(adapt_noise_cg)[0],
            'model_cost')
        model_prior_variance = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_variance')(adapt_noise_cg)[0],
            'model_prior_variance')
        regularized_cg = ComputationGraph(
            [train_cost, model_cost] +
            regularized_cg.outputs +
            [model_prior_mean, model_prior_variance])
        observables += [
            regularized_cg.outputs[1],  # model cost
            regularized_cg.outputs[2],  # task cost
            regularized_cg.outputs[-2],  # model prior mean
            regularized_cg.outputs[-1],  # model prior variance
        ]

    if len(cost_terms):
        # Please note - the aggregation (mean) is done in
        # "attach_aggregation_schemes"
        ct_names = [v.name for v in cost_terms]
        for v in regularized_cg.outputs:
            if v.name in ct_names:
                observables.append(rename(v.sum() / batch_size, v.name))
    for chld in recognizer.children:
        if chld.train_tags:
            tags_cost = VariableFilter(applications=[chld.addTagCost],
                                       name='output')(regularized_cg)[0]
            observables += [rename(tags_cost.sum() / batch_size,
                                   'tags_nll' + chld.names_postfix)]

    # Model is a weird class, we spent lots of time arguing with Bart
    # what it should be. However it can already do nice things, e.g.
    # extract all the parameters from the computation graphs
    # and give them hierarchical names. This helps to notice when,
    # because of some bug, a parameter is not in the computation
    # graph.
    model = SpeechModel(train_cost)
    if params:
        logger.info("Load parameters from " + params)
        # please note: we cannot use recognizer.load_params
        # as it builds a new computation graph that does not have
        # shared variables added by adaptive weight noise
        param_values = load_parameter_values(params)
        model.set_parameter_values(param_values)

    parameters = model.get_parameter_dict()

    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, parameters[key].get_value().shape)
                     for key in sorted(parameters.keys())],
                    width=120))

    max_norm_rules = []
    if reg_config.get('max_norm', False) > 0:
        logger.info("Apply MaxNorm")
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters)
        if reg_config.get('max_norm_exclude_lookup', False):
            maxnorm_subjects = [
                v for v in maxnorm_subjects
                if not isinstance(get_brick(v), LookupTable)]
        logger.info("Parameters covered by MaxNorm:\n" +
                    pprint.pformat([name for name, p in parameters.items()
                                    if p in maxnorm_subjects]))
        logger.info("Parameters NOT covered by MaxNorm:\n" +
                    pprint.pformat([name for name, p in parameters.items()
                                    if p not in maxnorm_subjects]))
        max_norm_rules = [
            Restrict(VariableClipping(reg_config['max_norm'], axis=0),
                     maxnorm_subjects)]

    return {
        'observables': observables,
        'max_norm_rules': max_norm_rules,
        'cg': cg,
        'regularized_cg': regularized_cg,
        'train_cost': train_cost,
        'cost': cost,
        'batch_size': batch_size,
        'batch_cost': batch_cost,
        'parameters': parameters,
        'gradients': gradients,
        'model': model,
        'data': data,
        'recognizer': recognizer,
        'weights_entropy': weights_entropy,
        'labels_mask': labels_mask,
        'labels': labels,
    }
def main(mode, save_to, num_epochs, load_params=None,
         feature_maps=None, mlp_hiddens=None,
         conv_sizes=None, pool_sizes=None, stride=None, repeat_times=None,
         batch_size=None, num_batches=None, algo=None,
         test_set=None, valid_examples=None,
         dropout=None, max_norm=None, weight_decay=None,
         batch_norm=None):
    """Train a convolutional network or use it to label a test set.

    Parameters
    ----------
    mode : {'train', 'test'}
        In 'train' mode a data server subprocess is started and the main
        loop is run; in 'test' mode predictions are written to `save_to`.
    save_to : str
        Checkpoint path in 'train' mode, output CSV path in 'test' mode.
    num_epochs, num_batches : int or None
        Passed to `FinishAfter`.
    load_params : str or None
        Path to saved parameters to load into the model.
    feature_maps, mlp_hiddens, conv_sizes, pool_sizes, repeat_times : list
        Network shape; defaults are filled in when `None`.
        `feature_maps`, `conv_sizes`, `pool_sizes` and `repeat_times`
        must have the same length.
    stride : int or None
        Convolution stride, 1 by default.
    batch_size : int or None
        500 by default.
    algo : {'rmsprop', 'adam'} or None
        Step rule, 'rmsprop' by default. Only used in 'train' mode.
    test_set : str or None
        Name of the split used in 'test' mode, 'test' by default.
    valid_examples : int or None
        Number of trailing training examples held out for validation,
        2500 by default.
    dropout, max_norm, weight_decay : float or None
        Regularization settings; each one is disabled when falsy.
    batch_norm : bool or None
        Whether batch normalization is used, off by default.

    Raises
    ------
    ValueError
        If `mode` is unknown, if `algo` is unknown in 'train' mode, or if
        the per-layer argument lists have inconsistent lengths.

    """
    if feature_maps is None:
        feature_maps = [20, 50, 50]
    if mlp_hiddens is None:
        mlp_hiddens = [500]
    if conv_sizes is None:
        conv_sizes = [5, 5, 5]
    if pool_sizes is None:
        pool_sizes = [2, 2, 2]
    if repeat_times is None:
        repeat_times = [1, 1, 1]
    if batch_size is None:
        batch_size = 500
    if valid_examples is None:
        valid_examples = 2500
    if stride is None:
        stride = 1
    if test_set is None:
        test_set = 'test'
    if algo is None:
        algo = 'rmsprop'
    if batch_norm is None:
        batch_norm = False

    # Validate up front with real exceptions: `assert` is stripped under
    # `python -O`, and failing late wastes the whole graph construction
    # (and, for `algo`, used to leave the data server running).
    if mode not in ('train', 'test'):
        raise ValueError("unknown mode: {!r}".format(mode))
    if mode == 'train' and algo not in ('rmsprop', 'adam'):
        raise ValueError("unknown algorithm: {!r}".format(algo))

    image_size = (128, 128)
    output_size = 2

    if (len(feature_maps) != len(conv_sizes) or
            len(feature_maps) != len(pool_sizes) or
            len(feature_maps) != len(repeat_times)):
        raise ValueError("OMG, inconsistent arguments")

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    convnet = LeNet(conv_activations, 3, image_size,
                    stride=stride,
                    filter_sizes=list(zip(conv_sizes, conv_sizes)),
                    feature_maps=feature_maps,
                    pooling_sizes=list(zip(pool_sizes, pool_sizes)),
                    repeat_times=repeat_times,
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='full',
                    batch_norm=batch_norm,
                    weights_init=Glorot(),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.initialize()
    logging.info("Input dim: {} {} {}".format(
        *convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        if isinstance(layer, Activation):
            logging.info("Layer {} ({})".format(
                i, layer.__class__.__name__))
        else:
            logging.info("Layer {} ({}) dim: {} {} {}".format(
                i, layer.__class__.__name__, *layer.get_dim('output')))

    single_x = tensor.tensor3('image_features')
    x = tensor.tensor4('image_features')
    single_y = tensor.lvector('targets')
    y = tensor.lmatrix('targets')

    # Training
    with batch_normalization(convnet):
        probs = convnet.apply(x)
    cost = (CategoricalCrossEntropy().apply(y.flatten(), probs)
            .copy(name='cost'))
    error_rate = (MisclassificationRate().apply(y.flatten(), probs)
                  .copy(name='error_rate'))

    cg = ComputationGraph([cost, error_rate])
    extra_updates = []

    if batch_norm:  # batch norm:
        logger.debug("Apply batch norm")
        pop_updates = get_batch_normalization_updates(cg)
        # p stands for population mean
        # m stands for minibatch
        alpha = 0.005
        extra_updates = [(p, m * alpha + p * (1 - alpha))
                         for p, m in pop_updates]
        population_statistics = [p for p, m in extra_updates]
    if dropout:
        relu_outputs = VariableFilter(bricks=[Rectifier], roles=[OUTPUT])(cg)
        cg = apply_dropout(cg, relu_outputs, dropout)
        cost, error_rate = cg.outputs
    if weight_decay:
        logger.debug("Apply weight decay {}".format(weight_decay))
        cost += weight_decay * l2_norm(cg.parameters)
        cost.name = 'cost'

    # Validation
    valid_probs = convnet.apply_5windows(single_x)
    valid_cost = (CategoricalCrossEntropy().apply(single_y, valid_probs)
                  .copy(name='cost'))
    valid_error_rate = (MisclassificationRate().apply(
        single_y, valid_probs).copy(name='error_rate'))

    model = Model([cost, error_rate])
    if load_params:
        logger.info("Loaded params from {}".format(load_params))
        # The parameter file is a binary archive: open it in binary mode.
        with open(load_params, 'rb') as src:
            model.set_parameter_values(load_parameters(src))

    # Training stream with random cropping
    train = DogsVsCats(("train",),
                       subset=slice(None, 25000 - valid_examples, None))
    train_str = DataStream(
        train,
        iteration_scheme=ShuffledScheme(train.num_examples, batch_size))
    train_str = add_transformers(train_str, random_crop=True)

    # Validation stream without cropping
    valid = DogsVsCats(("train",),
                       subset=slice(25000 - valid_examples, None, None))
    valid_str = DataStream(
        valid, iteration_scheme=SequentialExampleScheme(valid.num_examples))
    valid_str = add_transformers(valid_str)

    if mode == 'train':
        # Fall back to the current directory when the script was started
        # without a directory component; a plain string concatenation
        # would otherwise point at "/server.py".
        directory = os.path.dirname(sys.argv[0]) or os.curdir
        env = dict(os.environ)
        env['THEANO_FLAGS'] = 'floatX=float32'
        port = numpy.random.randint(1025, 10000)
        server = subprocess.Popen(
            [os.path.join(directory, 'server.py'),
             str(25000 - valid_examples), str(batch_size), str(port)],
            env=env, stderr=subprocess.STDOUT)
        # From here on the server must be terminated whatever happens,
        # including failures while the training setup is being built.
        try:
            train_str = ServerDataStream(
                ('image_features', 'targets'), produces_examples=False,
                port=port)

            save_to_base, save_to_extension = os.path.splitext(save_to)

            # Train with simple SGD
            if algo == 'rmsprop':
                step_rule = RMSProp(decay_rate=0.999, learning_rate=0.0003)
            else:  # algo == 'adam', validated above
                step_rule = Adam()
            if max_norm:
                conv_params = VariableFilter(bricks=[Convolutional],
                                             roles=[WEIGHT])(cg)
                linear_params = VariableFilter(bricks=[Linear],
                                               roles=[WEIGHT])(cg)
                step_rule = CompositeRule(
                    [step_rule,
                     Restrict(VariableClipping(max_norm, axis=0),
                              linear_params),
                     Restrict(VariableClipping(max_norm, axis=(1, 2, 3)),
                              conv_params)])

            algorithm = GradientDescent(
                cost=cost, parameters=model.parameters,
                step_rule=step_rule)
            algorithm.add_updates(extra_updates)
            # `Timing` extension reports time for reading data, aggregating
            # a batch and monitoring;
            # `ProgressBar` displays a nice progress bar during training.
            extensions = [
                Timing(every_n_batches=100),
                FinishAfter(after_n_epochs=num_epochs,
                            after_n_batches=num_batches),
                DataStreamMonitoring(
                    [valid_cost, valid_error_rate],
                    valid_str,
                    prefix="valid"),
                TrainingDataMonitoring(
                    [cost, error_rate,
                     aggregation.mean(algorithm.total_gradient_norm)],
                    prefix="train",
                    after_epoch=True),
                TrackTheBest("valid_error_rate"),
                Checkpoint(save_to, save_separately=['log'],
                           parameters=cg.parameters +
                           (population_statistics if batch_norm else []),
                           before_training=True, after_epoch=True)
                .add_condition(
                    ['after_epoch'],
                    OnLogRecord("valid_error_rate_best_so_far"),
                    (save_to_base + '_best' + save_to_extension,)),
                Printing(every_n_batches=100)]

            model = Model(cost)

            main_loop = MainLoop(
                algorithm,
                train_str,
                model=model,
                extensions=extensions)
            main_loop.run()
        finally:
            server.terminate()
    elif mode == 'test':
        classify = theano.function([single_x], valid_probs.argmax())
        test = DogsVsCats((test_set,))
        test_str = DataStream(
            test, iteration_scheme=SequentialExampleScheme(test.num_examples))
        test_str = add_transformers(test_str)
        correct = 0
        with open(save_to, 'w') as dst:
            print("id", "label", sep=',', file=dst)
            for index, example in enumerate(test_str.get_epoch_iterator()):
                image = example[0]
                # Run the network once per example and reuse the result.
                prediction = classify(image)
                print(index + 1, prediction, sep=',', file=dst)
                if len(example) > 1 and prediction == example[1]:
                    correct += 1
        print(correct / float(test.num_examples))
def construct_monitors(algorithm, task, n_patches, x, x_shape, graph, name,
                       ram, model, cost, n_spatial_dims, plot_url,
                       patchmonitor_interval=100, **kwargs):
    """Build the list of monitoring extensions for a training run.

    The returned list contains, in order: a training monitor for the
    per-parameter step norms, a data-independent monitor for the means of
    parameters named ``gamma``/``beta``, one data stream monitor per split
    (train, valid, test), optionally one patch monitor per split (train,
    valid) and optionally a ``Plot`` extension.

    :param algorithm: object exposing ``total_gradient_norm`` and ``steps``.
    :param task: provides ``monitor_channels``, ``get_stream`` and
        ``plot_channels``.
    :param n_patches: number of patches cropped per example.
    :param x: input variable given to ``ram.crop`` and the patch extractor.
    :param x_shape: shape variable accompanying ``x``.
    :param graph: computation graph the auxiliary variables are read from.
    :param name: experiment name; used for patch file names and the plot.
    :param ram: model object providing ``crop``.
    :param model: provides ``get_parameter_dict``.
    :param cost: cost variable monitored on every split.
    :param n_spatial_dims: 2 selects ``PatchMonitoring``, 3 selects
        ``VideoPatchMonitoring``; any other value disables patch monitoring.
    :param plot_url: plotting server URL; a falsy value disables plotting.
    :param patchmonitor_interval: patch monitors fire every this many
        batches.
    :param kwargs: accepted and ignored.
    :returns: list of extensions.
    """
    location, scale, savings = util.get_recurrent_auxiliaries(
        "location scale savings".split(), graph, n_patches)

    channels = util.Channels()
    channels.extend(task.monitor_channels(graph))
    channels.append(util.named(savings.mean(), "savings.mean"))

    # Explicit (name, variable) pairs instead of a locals() lookup.
    for variable_name, variable in (("location", location), ("scale", scale)):
        channels.append(variable.mean(axis=0), "%s.mean" % variable_name)
        channels.append(variable.var(axis=0), "%s.variance" % variable_name)

    channels.append(algorithm.total_gradient_norm, "total_gradient_norm")

    # The loop variable is deliberately not called `name`, so that it cannot
    # be confused with the `name` argument used further down.
    step_norms = util.Channels()
    step_norms.extend(
        util.named(l2_norm([algorithm.steps[param]]),
                   "%s.step_norm" % param_name)
        for param_name, param in model.get_parameter_dict().items())
    step_channels = step_norms.get_channels()

    data_independent_channels = util.Channels()
    for parameter in graph.parameters:
        if parameter.name in "gamma beta".split():
            quantity = parameter.mean()
            quantity.name = "%s.mean" % util.get_path(parameter)
            data_independent_channels.append(quantity)

    extensions = [
        TrainingDataMonitoring(
            step_channels, prefix="train", after_epoch=True),
        DataStreamMonitoring(
            data_independent_channels.get_channels(),
            data_stream=None, after_epoch=True),
    ]
    extensions.extend(
        DataStreamMonitoring(
            (channels.get_channels() + [cost]),
            data_stream=task.get_stream(which, monitor=True),
            prefix=which, after_epoch=True)
        for which in "train valid test".split())

    # Must be bound for every value of n_spatial_dims: previously any value
    # other than 2 or 3 raised UnboundLocalError at the check below.
    patchmonitor_klass = None
    if n_spatial_dims == 2:
        patchmonitor_klass = PatchMonitoring
    elif n_spatial_dims == 3:
        patchmonitor_klass = VideoPatchMonitoring

    if patchmonitor_klass:
        patch = T.stack(*[
            ram.crop(x, x_shape, location[:, i, :], scale[:, i, :])
            for i in xrange(n_patches)])
        # Swap the first two axes of the stacked patches.
        patch = patch.dimshuffle(1, 0, *range(2, patch.ndim))
        patch_extractor = theano.function([x, x_shape],
                                          [location, scale, patch])

        for which in "train valid".split():
            patchmonitor = patchmonitor_klass(
                save_to="%s_patches_%s" % (name, which),
                data_stream=task.get_stream(which, shuffle=False,
                                            num_examples=5),
                every_n_batches=patchmonitor_interval,
                extractor=patch_extractor,
                map_to_input_space=attention.static_map_to_input_space)
            patchmonitor.save_patches("patchmonitor_test.png")
            extensions.append(patchmonitor)

    if plot_url:
        plot_channels = []
        plot_channels.extend(task.plot_channels())
        plot_channels.append(["train_cost"])

        # Imported lazily so the plotting package is only needed when a
        # plot URL is actually given.
        from blocks.extras.extensions.plot import Plot
        extensions.append(
            Plot(name, channels=plot_channels, after_epoch=True,
                 server_url=plot_url))

    return extensions
def main(job_id, params, config_file='params.ec'):
    """Train an MLP classifier and return the Spearmint loss.

    Hyper-parameters are read from the ``hyperparams`` section of
    ``./configs/<config_file>``. Missing options fall back to the defaults
    given below; ``data_file`` is mandatory. When ``params`` is truthy,
    ``base_lr``, ``dropout_ratio``, ``hidden_units`` and ``weight_decay``
    are overridden with the first element of the corresponding entries.

    :param job_id: not used by this function.
    :param params: mapping of hyper-parameter name to a sequence whose first
        element is the value to use, or a falsy value to use the config only.
    :param config_file: file name inside ``./configs``.
    :returns: ``ve + abs(te - ve)`` where ``ve`` and ``te`` are the
        ``validation_error`` and ``error`` entries of the last epoch row of
        the training log.
    """
    section = 'hyperparams'
    config = ConfigParser.ConfigParser()
    # Close the config file deterministically instead of leaking the handle.
    with open('./configs/{}'.format(config_file)) as config_fh:
        config.readfp(config_fh)

    pr = pprint.PrettyPrinter(indent=4)
    pr.pprint(config)

    def hyper(option, default, cast=str):
        """Return ``cast(value)`` of a config option, or ``default``.

        The third positional argument of ``ConfigParser.get`` is the ``raw``
        flag, not a default value, so the fallback is implemented here.
        """
        if config.has_option(section, option):
            return cast(config.get(section, option, raw=True))
        return default

    net_name = hyper('net_name', 'adni')
    struct_name = net_name.split('_')[0]

    max_epoch = hyper('max_iter', 100, int)
    base_lr = hyper('base_lr', 0.01, float)
    train_batch = hyper('train_batch', 256, int)
    valid_batch = hyper('valid_batch', 512, int)
    # Previously read the 'valid_batch' key; a dedicated 'test_batch' option
    # is now honoured and still falls back to the validation batch size.
    test_batch = hyper('test_batch', valid_batch, int)

    W_sd = hyper('W_sd', 0.01, float)
    W_mu = hyper('W_mu', 0.0, float)
    b_sd = hyper('b_sd', 0.01, float)
    b_mu = hyper('b_mu', 0.0, float)

    hidden_units = hyper('hidden_units', 32, int)
    input_dropout_ratio = hyper('input_dropout_ratio', 0.2, float)
    dropout_ratio = hyper('dropout_ratio', 0.2, float)
    weight_decay = hyper('weight_decay', 0.001, float)
    max_norm = hyper('max_norm', 100.0, float)
    solver = hyper('solver_type', 'rmsprop')
    # Mandatory option: raises if it is missing from the config.
    data_file = config.get(section, 'data_file')
    side = hyper('side', 'b')

    input_dim = input_dims[struct_name]

    # Spearmint optimization parameters:
    if params:
        base_lr = float(params['base_lr'][0])
        dropout_ratio = float(params['dropout_ratio'][0])
        hidden_units = params['hidden_units'][0]
        weight_decay = params['weight_decay'][0]

    if 'adagrad' in solver:
        core_rule = AdaGrad(learning_rate=base_lr)
    else:
        core_rule = RMSProp(learning_rate=base_lr)
    solver_type = CompositeRule(
        [core_rule, VariableClipping(threshold=max_norm)])

    if 'b' in side:
        train = H5PYDataset(data_file, which_set='train')
        valid = H5PYDataset(data_file, which_set='valid')
        test = H5PYDataset(data_file, which_set='test')
        x_l = tensor.matrix('l_features')
        x_r = tensor.matrix('r_features')
        x = tensor.concatenate([x_l, x_r], axis=1)
    else:
        sources = ['{}_features'.format(side), 'targets']
        train = H5PYDataset(data_file, which_set='train', sources=sources)
        valid = H5PYDataset(data_file, which_set='valid', sources=sources)
        test = H5PYDataset(data_file, which_set='test', sources=sources)
        x = tensor.matrix('{}_features'.format(side))

    y = tensor.lmatrix('targets')

    # Define a feed-forward net with an input, two hidden layers, and a
    # softmax output:
    model = MLP(
        activations=[
            Rectifier(name='h1'),
            Rectifier(name='h2'),
            Softmax(name='output'),
        ],
        dims=[input_dim[side], hidden_units, hidden_units, 2],
        weights_init=IsotropicGaussian(std=W_sd, mean=W_mu),
        biases_init=IsotropicGaussian(b_sd, b_mu))

    # Don't forget to initialize params:
    model.initialize()

    # y_hat is the output of the neural net with x as its inputs
    y_hat = model.apply(x)

    # Define a cost function to optimize, and a classification error rate.
    # Also apply the outputs from the net and corresponding targets:
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = 'error'

    # This is the model: before applying dropout (and before the L2 term
    # is added to the cost below).
    model = Model(cost)

    # Need to define the computation graph for the cost func:
    cost_graph = ComputationGraph([cost])

    # This returns a list of weight vectors for each layer
    W = VariableFilter(roles=[WEIGHT])(cost_graph.variables)

    # Add some regularization to this model:
    cost += weight_decay * l2_norm(W)
    cost.name = 'entropy'

    # computational graph with l2 reg
    cost_graph = ComputationGraph([cost])

    # Apply dropout to inputs: the first 'linear_*' input gets the input
    # dropout ratio, the remaining ones the hidden dropout ratio.
    inputs = VariableFilter([INPUT])(cost_graph.variables)
    dropout_inputs = [var for var in inputs
                      if var.name.startswith('linear_')]
    dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]],
                                  input_dropout_ratio)
    dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:],
                                  dropout_ratio)
    dropout_cost = dropout_graph.outputs[0]
    dropout_cost.name = 'dropout_entropy'

    # Learning Algorithm (notice: we use the dropout cost for learning):
    # NOTE(review): `params=` is kept as written; confirm that the installed
    # Blocks version still accepts it rather than `parameters=`.
    algo = GradientDescent(
        step_rule=solver_type,
        params=dropout_graph.parameters,
        cost=dropout_cost)

    # Data stream used for training model:
    training_stream = Flatten(
        DataStream.default_stream(
            dataset=train,
            iteration_scheme=ShuffledScheme(
                train.num_examples,
                batch_size=train_batch)))

    training_monitor = TrainingDataMonitoring(
        [dropout_cost,
         aggregation.mean(error),
         aggregation.mean(algo.total_gradient_norm)],
        after_batch=True)

    # Use the 'valid' set for validation during training:
    validation_stream = Flatten(
        DataStream.default_stream(
            dataset=valid,
            iteration_scheme=ShuffledScheme(
                valid.num_examples,
                batch_size=valid_batch)))

    validation_monitor = DataStreamMonitoring(
        variables=[cost, error],
        data_stream=validation_stream,
        prefix='validation',
        after_epoch=True)

    test_stream = Flatten(
        DataStream.default_stream(
            dataset=test,
            iteration_scheme=ShuffledScheme(
                test.num_examples,
                batch_size=test_batch)))

    test_monitor = DataStreamMonitoring(
        variables=[error],
        data_stream=test_stream,
        prefix='test',
        after_training=True)

    plotting = Plot(
        '{}_{}'.format(net_name, side),
        channels=[
            ['dropout_entropy'],
            ['error', 'validation_error'],
        ],
        after_batch=False)

    # Checkpoint class used to save model and log:
    stamp = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y-%m-%d-%H:%M')
    checkpoint = Checkpoint(
        './models/{}/{}/{}'.format(struct_name, side, stamp),
        save_separately=['model', 'log'],
        every_n_epochs=1)

    # Home-brewed class for early stopping when we detect we have started
    # to overfit: if the means of the validation error and training error
    # over the previous 'epochs' differ by more than 'threshold', we are
    # overfitting.
    early_stopper = FinishIfOverfitting(
        error_name='error',
        validation_name='validation_error',
        threshold=0.05,
        epochs=5,
        burn_in=100)

    # The main loop will train the network and output reports, etc
    main_loop = MainLoop(
        data_stream=training_stream,
        model=model,
        algorithm=algo,
        extensions=[
            validation_monitor,
            training_monitor,
            plotting,
            FinishAfter(after_n_epochs=max_epoch),
            early_stopper,
            Printing(),
            ProgressBar(),
            checkpoint,
            test_monitor,
        ])
    main_loop.run()

    ve = float(main_loop.log.last_epoch_row['validation_error'])
    te = float(main_loop.log.last_epoch_row['error'])
    spearmint_loss = ve + abs(te - ve)
    # Single-argument call form: prints the same text under Python 2 and 3.
    print('Spearmint Loss: {}'.format(spearmint_loss))
    return spearmint_loss
def initialize_all(config, save_path, bokeh_name,
                   params, bokeh_server, bokeh, test_tag, use_load_ext,
                   load_log, fast_start):
    """Build the model, training algorithm, data and main-loop extensions.

    The function creates the recognizer, derives the cost graph and a set
    of debugging observables, applies the regularization selected in
    ``config['regularization']``, builds a ``GradientDescent`` algorithm
    with a composite step rule and finally assembles monitoring, plotting,
    checkpointing and early-stopping extensions.

    :param config: configuration mapping; the keys ``data``, ``training``,
        ``monitoring`` and ``net`` are read, ``regularization`` is optional.
    :param save_path: checkpoint path; its root and extension are also used
        to name the ``_best`` and ``_best_ll`` checkpoints.
    :param bokeh_name: plot name; when falsy the base name of ``save_path``
        is used instead.
    :param params: path of a parameter file to load into the model, or a
        falsy value to skip loading.
    :param bokeh_server: server URL handed to ``Plot``.
    :param bokeh: when truthy a ``Plot`` extension is added.
    :param test_tag: forwarded to ``create_model``.
    :param use_load_ext: when truthy (and ``params`` is given) a ``Load``
        extension is added.
    :param load_log: when truthy (and ``params`` is given) a ``LoadLog``
        extension is added.
    :param fast_start: when truthy, validation, search and checkpointing
        before the first epoch are skipped.
    :returns: tuple ``(model, algorithm, data, extensions)``.
    """
    root_path, extension = os.path.splitext(save_path)

    data = Data(**config['data'])
    train_conf = config['training']
    recognizer = create_model(config, data, test_tag)

    # Separate attention_params to be handled differently
    # when regularization is applied
    attention = recognizer.generator.transition.attention
    attention_params = Selector(attention).get_parameters().values()

    logger.info(
        "Initialization schemes for all bricks.\n"
        "Works well only in my branch with __repr__ added to all them,\n"
        "there is an issue #463 in Blocks to do that properly.")

    def show_init_scheme(cur):
        # Recursively collect every attribute ending in `_init` of a brick
        # and of its children into a nested dict keyed by child name.
        result = dict()
        for attr in dir(cur):
            if attr.endswith('_init'):
                result[attr] = getattr(cur, attr)
        for child in cur.children:
            result[child.name] = show_init_scheme(child)
        return result
    logger.info(pprint.pformat(show_init_scheme(recognizer)))

    prediction, prediction_mask = add_exploration(recognizer, data, train_conf)

    #
    # Observables:
    #
    primary_observables = []  # monitored each batch
    secondary_observables = []  # monitored every 10 batches
    validation_observables = []  # monitored on the validation set

    cg = recognizer.get_cost_graph(
        batch=True, prediction=prediction, prediction_mask=prediction_mask)
    # The trailing commas unpack single-element results: each filter is
    # expected to match exactly one variable.
    labels, = VariableFilter(
        applications=[recognizer.cost], name='labels')(cg)
    labels_mask, = VariableFilter(
        applications=[recognizer.cost], name='labels_mask')(cg)

    # The gain matrix is optional; it is monitored only when the filter
    # finds it in the graph.
    gain_matrix = VariableFilter(
        theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg)
    if len(gain_matrix):
        gain_matrix, = gain_matrix
        primary_observables.append(
            rename(gain_matrix.min(), 'min_gain'))
        primary_observables.append(
            rename(gain_matrix.max(), 'max_gain'))

    batch_cost = cg.outputs[0].sum()
    batch_size = rename(recognizer.labels.shape[1], "batch_size")
    # Assumes constant batch size. `aggregation.mean` is not used because
    # of Blocks #514.
    cost = batch_cost / batch_size
    cost.name = "sequence_total_cost"
    logger.info("Cost graph is built")

    # Fetch variables useful for debugging.
    # It is important not to use any aggregation schemes here,
    # as it's currently impossible to spread the effect of
    # regularization on their variables, see Blocks #514.
    cost_cg = ComputationGraph(cost)
    r = recognizer
    energies, = VariableFilter(
        applications=[r.generator.readout.readout], name="output_0")(
            cost_cg)
    bottom_output = VariableFilter(
        # We need name_regex instead of name because LookupTable calls
        # its output output_0; the last match is used.
        applications=[r.bottom.apply], name_regex="output")(
            cost_cg)[-1]
    attended, = VariableFilter(
        applications=[r.generator.transition.apply], name="attended")(
            cost_cg)
    attended_mask, = VariableFilter(
        applications=[r.generator.transition.apply], name="attended_mask")(
            cost_cg)
    weights, = VariableFilter(
        applications=[r.generator.evaluate], name="weights")(
            cost_cg)
    max_recording_length = rename(bottom_output.shape[0],
                                  "max_recording_length")
    # To exclude subsampling related bugs
    max_attended_mask_length = rename(attended_mask.shape[0],
                                      "max_attended_mask_length")
    max_attended_length = rename(attended.shape[0],
                                 "max_attended_length")
    max_num_phonemes = rename(labels.shape[0],
                              "max_num_phonemes")
    min_energy = rename(energies.min(), "min_energy")
    max_energy = rename(energies.max(), "max_energy")
    mean_attended = rename(abs(attended).mean(),
                           "mean_attended")
    mean_bottom_output = rename(abs(bottom_output).mean(),
                                "mean_bottom_output")
    weights_penalty = rename(monotonicity_penalty(weights, labels_mask),
                             "weights_penalty")
    weights_entropy = rename(entropy(weights, labels_mask),
                             "weights_entropy")
    mask_density = rename(labels_mask.mean(),
                          "mask_density")
    # The order of the outputs matters: the regularization code below
    # addresses the cost as outputs[0] and the weights penalty as
    # outputs[1].
    cg = ComputationGraph([
        cost, weights_penalty, weights_entropy,
        min_energy, max_energy,
        mean_attended, mean_bottom_output,
        batch_size, max_num_phonemes,
        mask_density])
    # Regularization. It is applied explicitly to all variables
    # of interest, it could not be applied to the cost only as it
    # would not have effect on auxiliary variables, see Blocks #514.
    reg_config = config.get('regularization', dict())
    regularized_cg = cg
    if reg_config.get('dropout'):
        logger.info('apply dropout')
        regularized_cg = apply_dropout(cg, [bottom_output], 0.5)
    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [p for p in cg.parameters if p not in attention_params]
        # NOTE(review): noise is applied to `cg`, not to `regularized_cg`,
        # so when both 'dropout' and 'noise' are configured the dropout
        # graph built above is discarded. Confirm this is intended.
        regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise'])

    train_cost = regularized_cg.outputs[0]
    if reg_config.get("penalty_coof", .0) > 0:
        # big warning!!!
        # here we assume that:
        # regularized_weights_penalty = regularized_cg.outputs[1]
        train_cost = (train_cost +
                      reg_config.get("penalty_coof", .0) *
                      regularized_cg.outputs[1] / batch_size)
    if reg_config.get("decay", .0) > 0:
        train_cost = (train_cost + reg_config.get("decay", .0) *
                      l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2)

    train_cost = rename(train_cost, 'train_cost')

    # `gradients` stays None unless adaptive noise supplies them; it is
    # forwarded as-is to GradientDescent below.
    gradients = None
    if reg_config.get('adaptive_noise'):
        logger.info('apply adaptive noise')
        if ((reg_config.get("penalty_coof", .0) > 0) or
                (reg_config.get("decay", .0) > 0)):
            logger.error('using adaptive noise with alignment weight panalty '
                         'or weight decay is probably stupid')
        # Note that this replaces both `train_cost` and `regularized_cg`
        # computed above.
        train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise(
            cg, cg.outputs[0],
            variables=cg.parameters,
            num_examples=data.get_dataset('train').num_examples,
            parameters=Model(regularized_cg.outputs[0]).get_parameter_dict().values(),
            **reg_config.get('adaptive_noise')
        )
        train_cost.name = 'train_cost'
        adapt_noise_cg = ComputationGraph(train_cost)
        model_prior_mean = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_mean')(adapt_noise_cg)[0],
            'model_prior_mean')
        model_cost = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_cost')(adapt_noise_cg)[0],
            'model_cost')
        model_prior_variance = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_variance')(adapt_noise_cg)[0],
            'model_prior_variance')
        regularized_cg = ComputationGraph(
            [train_cost, model_cost] +
            regularized_cg.outputs +
            [model_prior_mean, model_prior_variance])
        primary_observables += [
            regularized_cg.outputs[1],  # model cost
            regularized_cg.outputs[2],  # task cost
            regularized_cg.outputs[-2],  # model prior mean
            regularized_cg.outputs[-1]]  # model prior variance

    model = Model(train_cost)
    if params:
        logger.info("Load parameters from " + params)
        # Please note: we cannot use recognizer.load_params
        # as it builds a new computation graph that does not have
        # the shared variables added by adaptive weight noise.
        with open(params, 'r') as src:
            param_values = load_parameters(src)
        model.set_parameter_values(param_values)

    parameters = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, parameters[key].get_value().shape) for key
                     in sorted(parameters.keys())],
                    width=120))

    # Define the training algorithm.
    clipping = StepClipping(train_conf['gradient_threshold'])
    clipping.threshold.name = "gradient_norm_threshold"
    rule_names = train_conf.get('rules', ['momentum'])
    core_rules = []
    if 'momentum' in rule_names:
        logger.info("Using scaling and momentum for training")
        core_rules.append(Momentum(train_conf['scale'], train_conf['momentum']))
    if 'adadelta' in rule_names:
        logger.info("Using AdaDelta for training")
        core_rules.append(AdaDelta(train_conf['decay_rate'], train_conf['epsilon']))
    max_norm_rules = []
    if reg_config.get('max_norm', False) > 0:
        logger.info("Apply MaxNorm")
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters)
        if reg_config.get('max_norm_exclude_lookup', False):
            maxnorm_subjects = [v for v in maxnorm_subjects
                                if not isinstance(get_brick(v), LookupTable)]
        logger.info("Parameters covered by MaxNorm:\n"
                    + pprint.pformat([name for name, p in parameters.items()
                                      if p in maxnorm_subjects]))
        logger.info("Parameters NOT covered by MaxNorm:\n"
                    + pprint.pformat([name for name, p in parameters.items()
                                      if not p in maxnorm_subjects]))
        max_norm_rules = [
            Restrict(VariableClipping(reg_config['max_norm'], axis=0),
                     maxnorm_subjects)]
    burn_in = []
    if train_conf.get('burn_in_steps', 0):
        burn_in.append(
            BurnIn(num_steps=train_conf['burn_in_steps']))
    # Step rules are composed in this order: clipping, core rules,
    # max-norm, non-finite removal, burn-in.
    algorithm = GradientDescent(
        cost=train_cost,
        parameters=parameters.values(),
        gradients=gradients,
        step_rule=CompositeRule(
            [clipping] + core_rules + max_norm_rules +
            # Parameters are not changed at all
            # when nans are encountered.
            [RemoveNotFinite(0.0)] + burn_in),
        on_unused_sources='warn')

    logger.debug("Scan Ops in the gradients")
    gradient_cg = ComputationGraph(algorithm.gradients.values())
    # NOTE(review): `gradient_cg` is wrapped in a second ComputationGraph
    # here; presumably `gradient_cg.scans` was meant -- confirm.
    for op in ComputationGraph(gradient_cg).scans:
        logger.debug(op)

    # More variables for debugging: some of them can be added only
    # after the `algorithm` object is created.
    secondary_observables += list(regularized_cg.outputs)
    if not 'train_cost' in [v.name for v in secondary_observables]:
        secondary_observables += [train_cost]
    secondary_observables += [
        algorithm.total_step_norm, algorithm.total_gradient_norm,
        clipping.threshold]
    # Per-parameter statistics: parameter, gradient and step norms, each
    # divided by sqrt(number of elements), plus the step/gradient ratio.
    for name, param in parameters.items():
        num_elements = numpy.product(param.get_value().shape)
        norm = param.norm(2) / num_elements ** 0.5
        grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5
        step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5
        stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm)
        stats.name = name + '_stats'
        secondary_observables.append(stats)

    primary_observables += [
        train_cost,
        algorithm.total_gradient_norm,
        algorithm.total_step_norm, clipping.threshold,
        max_recording_length,
        max_attended_length, max_attended_mask_length]

    validation_observables += [
        rename(aggregation.mean(batch_cost, batch_size), cost.name),
        rename(aggregation.sum_(batch_size), 'num_utterances'),
        weights_entropy, weights_penalty]

    def attach_aggregation_schemes(variables):
        # Aggregation specification has to be factored out as a separate
        # function as it has to be applied at the very last stage
        # separately to training and validation observables.
        result = []
        for var in variables:
            if var.name == 'weights_penalty':
                result.append(rename(aggregation.mean(var, batch_size),
                                     'weights_penalty_per_recording'))
            elif var.name == 'weights_entropy':
                result.append(rename(aggregation.mean(var, labels_mask.sum()),
                                     'weights_entropy_per_label'))
            else:
                result.append(var)
        return result

    mon_conf = config['monitoring']

    # Build main loop.
    logger.info("Initialize extensions")
    extensions = []
    if use_load_ext and params:
        extensions.append(Load(params, load_iteration_state=True, load_log=True))
    if load_log and params:
        extensions.append(LoadLog(params))
    extensions += [
        Timing(after_batch=True),
        CGStatistics(),
        #CodeVersion(['lvsr']),
    ]
    extensions.append(TrainingDataMonitoring(
        primary_observables, after_batch=True))
    average_monitoring = TrainingDataMonitoring(
        attach_aggregation_schemes(secondary_observables),
        prefix="average", every_n_batches=10)
    extensions.append(average_monitoring)
    validation = DataStreamMonitoring(
        attach_aggregation_schemes(validation_observables),
        data.get_stream("valid", shuffle=False), prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['validate_every_epochs'],
            every_n_batches=mon_conf['validate_every_batches'],
            after_training=False)
    extensions.append(validation)
    per = PhonemeErrorRate(recognizer, data,
                           **config['monitoring']['search'])
    per_monitoring = DataStreamMonitoring(
        [per], data.get_stream("valid", batches=False, shuffle=False),
        prefix="valid").set_conditions(
            before_first_epoch=not fast_start,
            every_n_epochs=mon_conf['search_every_epochs'],
            every_n_batches=mon_conf['search_every_batches'],
            after_training=False)
    extensions.append(per_monitoring)
    track_the_best_per = TrackTheBest(
        per_monitoring.record_name(per)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    track_the_best_cost = TrackTheBest(
        validation.record_name(cost)).set_conditions(
            before_first_epoch=True, after_epoch=True)
    extensions += [track_the_best_cost, track_the_best_per]
    extensions.append(AdaptiveClipping(
        algorithm.total_gradient_norm.name,
        clipping, train_conf['gradient_threshold'],
        decay_rate=0.998, burnin_period=500))
    extensions += [
        SwitchOffLengthFilter(
            data.length_filter,
            after_n_batches=train_conf.get('stop_filtering')),
        FinishAfter(after_n_batches=train_conf.get('num_batches'),
                    after_n_epochs=train_conf.get('num_epochs'))
        .add_condition(["after_batch"], _gradient_norm_is_none),
    ]
    channels = [
        # Plot 1: training and validation costs
        [average_monitoring.record_name(train_cost),
         validation.record_name(cost)],
        # Plot 2: gradient norm,
        [average_monitoring.record_name(algorithm.total_gradient_norm),
         average_monitoring.record_name(clipping.threshold)],
        # Plot 3: phoneme error rate
        [per_monitoring.record_name(per)],
        # Plot 4: training and validation mean weight entropy
        [average_monitoring._record_name('weights_entropy_per_label'),
         validation._record_name('weights_entropy_per_label')],
        # Plot 5: training and validation monotonicity penalty
        [average_monitoring._record_name('weights_penalty_per_recording'),
         validation._record_name('weights_penalty_per_recording')]]
    if bokeh:
        extensions += [
            Plot(bokeh_name if bokeh_name
                 else os.path.basename(save_path),
                 channels,
                 every_n_batches=10,
                 server_url=bokeh_server),]
    # Besides the regular checkpoint, separate copies are written after an
    # epoch whenever the best-PER or best-cost notification is in the log.
    extensions += [
        Checkpoint(save_path,
                   before_first_epoch=not fast_start, after_epoch=True,
                   every_n_batches=train_conf.get('save_every_n_batches'),
                   save_separately=["model", "log"],
                   use_cpickle=True)
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_per.notification_name),
            (root_path + "_best" + extension,))
        .add_condition(
            ['after_epoch'],
            OnLogRecord(track_the_best_cost.notification_name),
            (root_path + "_best_ll" + extension,)),
        ProgressBar()]
    extensions.append(EmbedIPython(use_main_loop_run_caller_env=True))
    if config['net']['criterion']['name'].startswith('mse'):
        extensions.append(
            LogInputsGains(
                labels, cg, recognizer.generator.readout.emitter, data))

    if train_conf.get('patience'):
        patience_conf = train_conf['patience']
        if not patience_conf.get('notification_names'):
            # setdefault will not work for empty list
            patience_conf['notification_names'] = [
                track_the_best_per.notification_name,
                track_the_best_cost.notification_name]
        extensions.append(Patience(**patience_conf))

    extensions.append(Printing(every_n_batches=1,
                               attribute_filter=PrintingFilterList()))

    return model, algorithm, data, extensions
def construct_monitors(algorithm, task, n_patches, x, x_shape, graph, name,
                       ram, model, cost, n_spatial_dims, plot_url,
                       patchmonitor_interval=100, **kwargs):
    """Assemble the monitoring extensions used during training.

    Extensions are returned in this order: per-parameter step-norm monitor
    on the training data, data-independent monitor for the means of
    parameters named ``gamma``/``beta``, one ``DataStreamMonitoring`` for
    each of the train, valid and test streams, then (when supported) one
    patch monitor for each of the train and valid streams, then (when
    ``plot_url`` is set) a ``Plot`` extension.

    :param algorithm: object exposing ``total_gradient_norm`` and ``steps``.
    :param task: provides ``monitor_channels``, ``get_stream`` and
        ``plot_channels``.
    :param n_patches: number of patches cropped per example.
    :param x: input variable given to ``ram.crop`` and the patch extractor.
    :param x_shape: shape variable accompanying ``x``.
    :param graph: computation graph the auxiliary variables are read from.
    :param name: experiment name; used for patch file names and the plot.
    :param ram: model object providing ``crop``.
    :param model: provides ``get_parameter_dict``.
    :param cost: cost variable monitored on every split.
    :param n_spatial_dims: 2 selects ``PatchMonitoring``, 3 selects
        ``VideoPatchMonitoring``; other values turn patch monitoring off.
    :param plot_url: plotting server URL; a falsy value disables plotting.
    :param patchmonitor_interval: patch monitors fire every this many
        batches.
    :param kwargs: accepted and ignored.
    :returns: list of extensions.
    """
    location, scale, savings = util.get_recurrent_auxiliaries(
        "location scale savings".split(), graph, n_patches)

    channels = util.Channels()
    channels.extend(task.monitor_channels(graph))
    channels.append(util.named(savings.mean(), "savings.mean"))

    # Name the monitored auxiliaries explicitly rather than via locals().
    auxiliaries = [("location", location), ("scale", scale)]
    for variable_name, variable in auxiliaries:
        channels.append(variable.mean(axis=0), "%s.mean" % variable_name)
        channels.append(variable.var(axis=0), "%s.variance" % variable_name)

    channels.append(algorithm.total_gradient_norm, "total_gradient_norm")

    # `param_name` rather than `name`: the latter is this function's
    # experiment-name argument and is still needed below.
    step_norms = util.Channels()
    step_norms.extend(
        util.named(l2_norm([algorithm.steps[param]]),
                   "%s.step_norm" % param_name)
        for param_name, param in model.get_parameter_dict().items())
    step_channels = step_norms.get_channels()

    data_independent_channels = util.Channels()
    for parameter in graph.parameters:
        if parameter.name in "gamma beta".split():
            quantity = parameter.mean()
            quantity.name = "%s.mean" % util.get_path(parameter)
            data_independent_channels.append(quantity)

    extensions = []
    extensions.append(TrainingDataMonitoring(
        step_channels, prefix="train", after_epoch=True))
    extensions.append(DataStreamMonitoring(
        data_independent_channels.get_channels(),
        data_stream=None, after_epoch=True))
    extensions.extend(
        DataStreamMonitoring(
            (channels.get_channels() + [cost]),
            data_stream=task.get_stream(which, monitor=True),
            prefix=which, after_epoch=True)
        for which in "train valid test".split())

    # Bind the class unconditionally. Before this fix an n_spatial_dims
    # other than 2 or 3 left the name unbound and the `if` below raised
    # UnboundLocalError instead of simply skipping patch monitoring.
    patchmonitor_klass = None
    if n_spatial_dims == 2:
        patchmonitor_klass = PatchMonitoring
    elif n_spatial_dims == 3:
        patchmonitor_klass = VideoPatchMonitoring

    if patchmonitor_klass:
        patch = T.stack(*[
            ram.crop(x, x_shape, location[:, i, :], scale[:, i, :])
            for i in xrange(n_patches)])
        # Swap the first two axes of the stacked patches.
        patch = patch.dimshuffle(1, 0, *range(2, patch.ndim))
        patch_extractor = theano.function([x, x_shape],
                                          [location, scale, patch])

        for which in "train valid".split():
            patchmonitor = patchmonitor_klass(
                save_to="%s_patches_%s" % (name, which),
                data_stream=task.get_stream(which, shuffle=False,
                                            num_examples=5),
                every_n_batches=patchmonitor_interval,
                extractor=patch_extractor,
                map_to_input_space=attention.static_map_to_input_space)
            patchmonitor.save_patches("patchmonitor_test.png")
            extensions.append(patchmonitor)

    if plot_url:
        plot_channels = []
        plot_channels.extend(task.plot_channels())
        plot_channels.append(["train_cost"])

        # Lazy import: the plotting package is optional and only required
        # when plotting is requested.
        from blocks.extras.extensions.plot import Plot
        extensions.append(Plot(name, channels=plot_channels,
                               after_epoch=True, server_url=plot_url))

    return extensions
def __init__(self, cost=None, parameters=None, step_rule=None,
             gradients=None, known_grads=None, consider_constant=None,
             **kwargs):
    """Set up gradients, steps and parameter updates for gradient descent.

    Either ``gradients`` is given, or the gradients are obtained from
    ``self._compute_gradients(known_grads, consider_constant)``.

    :param cost: cost variable; only used when ``gradients`` is not given.
    :param parameters: parameters to update. May be omitted when
        ``gradients`` is an ``OrderedDict``, whose keys are then used.
    :param step_rule: step rule turning gradients into steps; defaults to
        ``Scale()``.
    :param gradients: optional mapping from parameter to gradient.
    :param known_grads: forwarded to ``_compute_gradients``; must be falsy
        when ``gradients`` is given.
    :param consider_constant: forwarded to ``_compute_gradients``; must be
        ``None`` when ``gradients`` is given.
    :param kwargs: forwarded to the parent constructor after the parameter
        and step-rule updates have been appended to ``kwargs['updates']``.
    :raises ValueError: if a deterministic parameter order cannot be
        established from the arguments, or if ``known_grads`` /
        ``consider_constant`` are combined with explicit ``gradients``.
    """
    # Set initial values for cost, parameters, gradients.
    self.cost = cost
    self.parameters = parameters
    self.gradients = gradients

    # If we don't have gradients, we'll need to infer them from the
    # cost and the parameters, both of which must not be None.
    if not self.gradients:
        self.gradients = self._compute_gradients(known_grads,
                                                 consider_constant)
    else:
        if cost is not None:
            logger.warning(('{}: gradients already specified directly; '
                            'cost is unused.'
                            .format(self.__class__.__name__)))
        if self.parameters is None and isinstance(gradients, OrderedDict):
            # If the dictionary is ordered, it's safe to use the keys
            # as they have a deterministic order.
            self.parameters = list(self.gradients.keys())
        elif self.parameters is not None:
            # If parameters and gradients.keys() don't match we can
            # try to recover if gradients is ordered.
            if set(self.parameters) != set(self.gradients.keys()):
                # `logger.warn` is a deprecated alias of `warning`.
                logger.warning("Specified parameters list does not match "
                               "keys in provided gradient dictionary; "
                               "using parameters inferred from gradients")
                if not isinstance(self.gradients, OrderedDict):
                    raise ValueError(determinism_error)
                self.parameters = list(self.gradients.keys())
        else:
            # self.parameters is None, and gradients isn't
            # an OrderedDict. We can't do anything safe.
            raise ValueError(determinism_error)
        if known_grads:
            raise ValueError("known_grads has no effect when gradients "
                             "are passed in")
        if consider_constant is not None:
            raise ValueError("consider_constant has no effect when "
                             "gradients are passed in")

    # The order in which the different gradient terms appear here
    # matters, as floating point addition is not associative (and
    # Theano's graph optimizations are not order-independent).
    # This is why we do not use .values().
    gradient_values = [self.gradients[p] for p in self.parameters]
    self.total_gradient_norm = l2_norm(gradient_values).copy(
        name="total_gradient_norm")

    self.step_rule = step_rule if step_rule else Scale()
    logger.debug("Computing parameter steps...")
    self.steps, self.step_rule_updates = self.step_rule.compute_steps(
        self.gradients)

    # Same as gradient_values above: the order may influence a
    # bunch of things, so enforce a consistent one (don't use
    # .values()).
    step_values = [self.steps[p] for p in self.parameters]
    self.total_step_norm = l2_norm(step_values).copy(
        name="total_step_norm")

    # Once again, iterating on gradients may not be deterministically
    # ordered if it is not an OrderedDict. We add the updates here in
    # the order specified in self.parameters. Keep it this way to
    # maintain reproducibility.
    parameter_updates = ((parameter, parameter - self.steps[parameter])
                         for parameter in self.parameters)
    kwargs.setdefault('updates', []).extend(
        itertools.chain(parameter_updates, self.step_rule_updates))
    super(GradientDescent, self).__init__(**kwargs)
def training(self, fea2obj, batch_size, learning_rate=0.005,
             steprule='adagrad', wait_epochs=5, kl_weight_init=None,
             klw_ep=50, klw_inc_rate=0, num_epochs=None):
    """Build the model, the training algorithm and run the main loop.

    :param fea2obj: mapping of feature name to feature object; the
        ``'targets'`` entry must expose ``t2idx``.
    :param batch_size: batch size of the training stream.
    :param learning_rate: learning rate handed to the step rule (not used
        by the ``adadelta`` rule).
    :param steprule: name containing one of ``adagrad``, ``adadelta``,
        ``decay`` (RMSProp with step clipping), ``momentum`` or ``adam``;
        the alternatives are tested in that order.
    :param wait_epochs: epochs without ``dev_cost`` improvement before
        training stops.
    :param kl_weight_init: initial KL weight. When ``None`` the config
        value ``kld_weight`` is used if present, otherwise 1.
    :param klw_ep: currently unused (KL-weight annealing is disabled).
    :param klw_inc_rate: currently unused (KL-weight annealing is disabled).
    :param num_epochs: maximum number of epochs; when falsy the config
        value ``nepochs`` is used.
    :raises ValueError: if ``steprule`` matches none of the known rules.
    """
    networkfile = self._config['net']
    n_epochs = num_epochs or int(self._config['nepochs'])
    reg_weight = float(self._config['loss_weight'])
    reg_type = self._config['loss_reg']
    numtrain = (int(self._config['num_train'])
                if 'num_train' in self._config else None)
    train_stream, num_samples_train = get_comb_stream(
        fea2obj, 'train', batch_size, shuffle=True, num_examples=numtrain)
    dev_stream, num_samples_dev = get_comb_stream(
        fea2obj, 'dev', batch_size=None, shuffle=False)
    logger.info('sources: %s -- number of train/dev samples: %d/%d',
                train_stream.sources, num_samples_train, num_samples_dev)

    t2idx = fea2obj['targets'].t2idx

    # Explicit precedence: argument first, then config, then 1. The former
    # one-line conditional expression discarded `kl_weight_init` whenever
    # 'kld_weight' was missing from the config.
    if kl_weight_init is not None:
        klw_init = kl_weight_init
    elif 'kld_weight' in self._config:
        klw_init = float(self._config['kld_weight'])
    else:
        klw_init = 1
    # %s rather than %d so that fractional weights are not logged truncated.
    logger.info('kl_weight_init: %s', klw_init)
    kl_weight = shared_floatx(klw_init, 'kl_weight')
    entropy_weight = shared_floatx(1., 'entropy_weight')

    (cost, p_at_1, _, KLD, logpy_xz, pat1_recog,
     misclassify_rate) = build_model_new(
         fea2obj, len(t2idx), self._config, kl_weight, entropy_weight)

    cg = ComputationGraph(cost)

    weights = VariableFilter(roles=[WEIGHT])(cg.parameters)
    logger.info('Model weights are: %s', weights)
    if 'L2' in reg_type:
        cost += reg_weight * l2_norm(weights)
        logger.info('applying %s with weight: %f ', reg_type, reg_weight)

    # Dropout is switched off by the negative rate; set a positive value
    # to enable it.
    dropout = -0.1
    if dropout > 0:
        cg = apply_dropout(cg, weights, dropout)
        cost = cg.outputs[0]

    cost.name = 'cost'
    logger.info('Our Algorithm is : %s, and learning_rate: %f',
                steprule, learning_rate)
    if 'adagrad' in steprule:
        cnf_step_rule = AdaGrad(learning_rate)
    elif 'adadelta' in steprule:
        cnf_step_rule = AdaDelta(decay_rate=0.95)
    elif 'decay' in steprule:
        cnf_step_rule = RMSProp(learning_rate=learning_rate,
                                decay_rate=0.90)
        cnf_step_rule = CompositeRule([cnf_step_rule, StepClipping(1)])
    elif 'momentum' in steprule:
        cnf_step_rule = Momentum(learning_rate=learning_rate, momentum=0.9)
    elif 'adam' in steprule:
        cnf_step_rule = Adam(learning_rate=learning_rate)
    else:
        # Fail here with a clear message; previously this only logged and
        # then crashed below because `cnf_step_rule` was never bound.
        logger.error('The steprule param is wrong! which is: %s', steprule)
        raise ValueError(
            'The steprule param is wrong! which is: %s' % steprule)

    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=cnf_step_rule,
                                on_unused_sources='warn')

    gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
    step_norm = aggregation.mean(algorithm.total_step_norm)
    monitored_vars = [
        cost, gradient_norm, step_norm, p_at_1, KLD, logpy_xz, kl_weight,
        pat1_recog
    ]
    train_monitor = TrainingDataMonitoring(variables=monitored_vars,
                                           after_batch=True,
                                           before_first_epoch=True,
                                           prefix='tra')

    dev_monitor = DataStreamMonitoring(
        variables=[cost, p_at_1, KLD, logpy_xz, pat1_recog,
                   misclassify_rate],
        after_epoch=True,
        before_first_epoch=True,
        data_stream=dev_stream,
        prefix="dev")

    extensions = [
        dev_monitor,
        train_monitor,
        Timing(),
        TrackTheBest('dev_cost'),
        FinishIfNoImprovementAfter('dev_cost_best_so_far',
                                   epochs=wait_epochs),
        Printing(after_batch=False),
        FinishAfter(after_n_epochs=n_epochs),
        saveload.Load(networkfile + '.toload.pkl'),
    ] + track_best('dev_cost', networkfile + '.best.pkl')

    # NOTE: annealing of `kl_weight` / `entropy_weight` through
    # SharedVariableModifier (driven by klw_ep and klw_inc_rate) is
    # currently disabled, so both weights stay at their initial values.

    logger.info('number of parameters in the model: %d',
                tensor.sum([p.size for p in cg.parameters]).eval())
    logger.info('Lookup table sizes: %s',
                [p.size.eval() for p in cg.parameters if 'lt' in p.name])

    main_loop = MainLoop(data_stream=train_stream,
                         algorithm=algorithm,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()
def _apply_reg(self, params=None, *args, **kwargs): ''' Apply regularization (default L2 norm) on parameters (default user, hashtag and word embedding) to computing graph of self.cg_generator :param params: A list of parameters to which regularization applied ''' if self.norm_type is not None: params = [self.input_lookup.W] if self.norm_type == 'l2_norm': self._train_cg_generator = self._train_cg_generator + self.norm_scale * theano_expressions.l2_norm( tensors=params)**2 elif self.norm_type == 'l1_norm': norm = 0. for param in params: norm += tensor.abs_(param).sum() self._train_cg_generator = self._train_cg_generator + self.norm_scale * norm else: raise ValueError('{0} norm type is not supported!'.format( self.norm_type)) else: pass
# Define a cost function to optimize, and a classification error rate: # Also apply the outputs from the net, and corresponding targets: cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat) error = MisclassificationRate().apply(y.flatten(), y_hat) error.name = 'error' # Need to define the computation graph: graph = ComputationGraph(cost) # This returns a list of weight vectors for each layer W = VariableFilter(roles=[WEIGHT])(graph.variables) # Add some regularization to this model: lam = 0.001 cost += lam * l2_norm(W) cost.name = 'entropy' # This is the model without dropout, but with l2 reg. model = Model(cost) # Apply dropout to inputs: graph = ComputationGraph(y_hat) inputs = VariableFilter([INPUT])(graph.variables) dropout_graph = apply_dropout(graph, inputs, 0.2) dropout_cost = dropout_graph.outputs[0] dropout_cost.name = 'dropout_entropy' # Learning Algorithm: algo = GradientDescent( # step_rule=Scale(learning_rate=0.1),
# Define a cost function to optimize, and a classification error rate: # Also apply the outputs from the net, and corresponding targets: cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat) error = MisclassificationRate().apply(y.flatten(), y_hat) error.name = 'error' # Need to define the computation graph: graph = ComputationGraph(cost) # This returns a list of weight vectors for each layer W = VariableFilter(roles=[WEIGHT])(graph.variables) # Add some regularization to this model: lam = 0.001 cost += lam * l2_norm(W) cost.name = 'entropy' # This is the model without dropout, but with l2 reg. model = Model(cost) # Apply dropout to inputs: graph = ComputationGraph(y_hat) inputs = VariableFilter([INPUT])(graph.variables) dropout_graph = apply_dropout(graph, inputs, 0.2) dropout_cost = dropout_graph.outputs[0] dropout_cost.name = 'dropout_entropy' # Learning Algorithm: algo = GradientDescent( # step_rule=Scale(learning_rate=0.1),
# Define a cost function to optimize, and a classification error rate. # Also apply the outputs from the net and corresponding targets: cost = BinaryCrossEntropy().apply(x, x_hat) # This is the model: before applying dropout autoencoder = Model(cost) # Need to define the computation graph for the cost func: cost_graph = ComputationGraph([cost]) # This returns a list of weight vectors for each layer W = VariableFilter(roles=[WEIGHT])(cost_graph.variables) # Add some regularization to this model: cost += weight_decay * l2_norm(W) cost.name = 'entropy' # computational graph with l2 reg cost_graph = ComputationGraph([cost]) # Apply dropout to inputs: inputs = VariableFilter([INPUT])(cost_graph.variables) dropout_inputs = [input for input in inputs if input.name.startswith('linear_')] dropout_graph = apply_dropout(cost_graph, dropout_inputs, dropout_ratio) dropout_cost = dropout_graph.outputs[0] dropout_cost.name = 'dropout_entropy' # Learning Algorithm: algo = GradientDescent( step_rule=solver_type,