def get_lr_scalers(self):
    """
    Collect the learning-rate scalers declared by the layers.

    Every entry of ``self.layers`` except the last is queried directly.
    The last entry is iterated over, so it must itself be an iterable of
    layers.

    Returns
    -------
    OrderedDict
        Maps each scaled parameter to its float scaling factor.
    """
    known_params = self.get_params()
    scalers = OrderedDict()

    def absorb(sub_layer):
        # Validate one layer's contribution, then fold it into the result.
        piece = sub_layer.get_lr_scalers()
        assert isinstance(piece, OrderedDict)
        # No two layers may contend to scale the same parameter
        assert not any([name in scalers for name in piece])
        # Only actual parameters of this object may be scaled
        assert all([name in known_params for name in piece])
        scalers.update(piece)

    for single in self.layers[:-1]:
        absorb(single)
    for member in self.layers[-1]:
        absorb(member)

    assert all([isinstance(factor, float) for factor in scalers.values()])
    return scalers
def get_gradients(self, model, data, **kwargs):
    """
    Build gradients from a Gibbs chain whose state lives in a shared variable.

    A zero-initialised shared variable of shape
    ``(self.chain_num, model.n_vis)`` is the chain's starting point.
    ``model.gibbs_vhv`` is scanned for ``self.k`` steps and the returned
    updates write the final visible samples back into that variable.

    Returns
    -------
    gradients : OrderedDict
        Maps each model parameter to the gradient of the free-energy
        difference between ``data`` and the end of the chain.
    updates : OrderedDict
        The scan updates, including the chain write-back.
    """
    initial_state = numpy.zeros(shape=(self.chain_num, model.n_vis),
                                dtype=theano.config.floatX)
    chain_start = theano.shared(initial_state, name='chain_start', borrow=True)

    # gibbs_vhv returns six outputs per step; only the last one (visible
    # samples) is recurrent.
    scan_outputs, scan_updates = theano.scan(
        fn=model.gibbs_vhv,
        sequences=None,
        outputs_info=[None, None, None, None, None, chain_start],
        non_sequences=None,
        n_steps=self.k)
    _, _, _, _, _, vis_samples = scan_outputs

    chain_end = vis_samples[-1]
    scan_updates[chain_start] = chain_end

    pos_v = data
    positive_term = model.free_energy(pos_v).mean()
    negative_term = model.free_energy(chain_end).mean()
    cost = -(-positive_term + negative_term)

    params = list(model.get_params())
    # The samples are treated as constants when differentiating.
    grads = T.grad(cost, params,
                   disconnected_inputs='ignore',
                   consider_constant=[pos_v, chain_end])

    gradients = OrderedDict(izip(params, grads))
    updates = OrderedDict(scan_updates)
    return gradients, updates
def get_gradients(self, model, data, **kwargs):
    """
    Build CD-k style gradients with the Gibbs chain started at ``data``.

    ``model.gibbs_vhv`` is scanned for ``self.k`` steps. The last visible
    samples form the negative phase of a free-energy difference.

    Returns
    -------
    gradients : OrderedDict
        Maps each model parameter to its gradient expression.
    updates : OrderedDict
        The updates produced by ``theano.scan``.
    """
    pos_v = data

    # Six outputs per Gibbs step; only the visible samples are recurrent.
    scan_outputs, scan_updates = theano.scan(
        fn=model.gibbs_vhv,
        sequences=None,
        outputs_info=[None, None, None, None, None, pos_v],
        non_sequences=None,
        n_steps=self.k)
    _, _, _, _, _, vis_samples = scan_outputs
    neg_v = vis_samples[-1]

    data_term = model.free_energy(pos_v).mean()
    sample_term = model.free_energy(neg_v).mean()
    cost = -(-data_term + sample_term)

    params = list(model.get_params())
    # Both phases are held constant while differentiating.
    grads = T.grad(cost, params,
                   disconnected_inputs='ignore',
                   consider_constant=[pos_v, neg_v])

    gradients = OrderedDict(izip(params, grads))
    updates = OrderedDict(scan_updates)
    return gradients, updates
def get_layer_monitoring_channels(self, state_below=None, state=None,
                                  targets=None):
    """
    Return monitoring channels for this layer.

    The result holds min/mean/max of the kernel norms, followed by the
    parent class's channels and then the channels reported by
    ``self.nonlin``.

    Returns
    -------
    OrderedDict
        Channel name -> symbolic expression.
    """
    (kernels,) = self.transformer.get_params()
    assert kernels.ndim == 4

    # One L2 norm per slice along the first axis of the 4D weight tensor.
    kernel_norms = T.sqrt(T.sqr(kernels).sum(axis=(1, 2, 3)))

    channels = OrderedDict()
    channels['kernel_norms_min'] = kernel_norms.min()
    channels['kernel_norms_mean'] = kernel_norms.mean()
    channels['kernel_norms_max'] = kernel_norms.max()

    inherited = super(CudNNElemwise, self).get_layer_monitoring_channels(
        state_below, state, targets)
    channels.update(inherited)

    cost_fn = self.cost
    from_nonlin = self.nonlin.get_monitoring_channels_from_state(
        state, targets, cost_fn=cost_fn)
    channels.update(from_nonlin)

    return channels
def get_monitoring_channels(self, model, data, **kwargs):
    """
    Gather the monitoring channels of every cost in the sum.

    ``data`` is validated against the first element of
    ``self.get_data_specs(model)``, then nested so that each cost gets its
    own portion. Besides each cost's own channels, a ``term_<i>`` channel
    (suffixed with ``_<name>`` when the expression has a name) is added for
    every cost whose ``expr`` is not ``None``.

    Returns
    -------
    OrderedDict
        Channel name -> expression.
    """
    self.get_data_specs(model)[0].validate(data)
    _, mapping = self.get_composite_specs_and_mapping(model)
    per_cost_data = mapping.nest(data)

    collected = OrderedDict()
    for index, cost in enumerate(self.costs):
        cost_data = per_cost_data[index]
        try:
            own_channels = cost.get_monitoring_channels(
                model, cost_data, **kwargs)
            collected.update(own_channels)
        except TypeError:
            # Say which cost failed before propagating the error.
            print("SumOfCosts.get_monitoring_channels encountered "
                  "TypeError while calling "
                  + str(type(cost))
                  + ".get_monitoring_channels")
            raise

        value = cost.expr(model, cost_data, **kwargs)
        if value is None:
            continue
        suffix = ""
        if hasattr(value, "name") and value.name is not None:
            suffix = "_" + value.name
        collected["term_" + str(index) + suffix] = value

    return collected
def get_gradients(self, model, data, **kwargs):
    """
    Build gradients from an unrolled Gibbs chain stored in a shared variable.

    A zero-initialised shared variable of shape
    ``(self.chain_num, model.n_vis)`` is the chain's starting point.
    ``model.gibbs_vhv`` is applied ``self.k`` times, feeding the last
    output of each step into the next. The returned updates write the final
    state back into the shared variable.

    Returns
    -------
    gradients : OrderedDict
        Maps each model parameter to the gradient of the free-energy
        difference between ``data`` and the end of the chain.
    updates : OrderedDict
        Maps the chain's shared variable to its new value.
    """
    # The buffer is created with floatX so that its type matches the update
    # expression; a shared variable's update must have the variable's type.
    chain_start = theano.shared(
        numpy.zeros(shape=(self.chain_num, model.n_vis),
                    dtype=theano.config.floatX),
        name=None,
        borrow=True)

    v_samples = chain_start
    for _ in range(self.k):
        # Only the last output of a Gibbs step is carried forward.
        v_samples = model.gibbs_vhv(v_samples)[-1]
    chain_end = v_samples

    pos_v = data
    cost = -(-model.free_energy(pos_v).mean()
             + model.free_energy(chain_end).mean())

    params = list(model.get_params())
    # The end of the chain is treated as a constant sample.
    grads = T.grad(cost, params,
                   disconnected_inputs='ignore',
                   consider_constant=[chain_end])

    gradients = OrderedDict(izip(params, grads))
    updates = OrderedDict([(chain_start, chain_end)])
    return gradients, updates
def get_monitoring_channels(self, model, data, **kwargs):
    """
    Gather the monitoring channels of every cost in the sum.

    ``data`` is validated, then nested so that each cost receives its own
    portion. For each cost, its own channels are merged in, followed by a
    ``term_<i>`` channel (with ``_<name>`` appended when the expression is
    named) unless the cost's ``expr`` is ``None``.

    Returns
    -------
    OrderedDict
        Channel name -> expression.
    """
    self.get_data_specs(model)[0].validate(data)
    nested_data = self.get_composite_specs_and_mapping(model)[1].nest(data)

    channels_out = OrderedDict()
    for position, cost in enumerate(self.costs):
        portion = nested_data[position]
        try:
            channels_out.update(
                cost.get_monitoring_channels(model, portion, **kwargs))
        except TypeError:
            # Log which cost failed, then let the error propagate.
            logger.error('SumOfCosts.get_monitoring_channels encountered '
                         'TypeError while calling {0}'
                         '.get_monitoring_channels'.format(type(cost)))
            raise

        term = cost.expr(model, portion, **kwargs)
        if term is not None:
            has_name = hasattr(term, 'name') and term.name is not None
            label = ('_' + term.name) if has_name else ''
            channels_out['term_' + str(position) + label] = term

    return channels_out
def get_lr_scalers(self):
    """
    Merge the learning-rate scalers of all layers into one mapping.

    Entries of ``self.layers`` before the last are queried directly. The
    last entry is iterated over, so it must be an iterable of layers.

    Returns
    -------
    OrderedDict
        Parameter -> float learning-rate scaling factor.
    """
    all_params = self.get_params()
    merged = OrderedDict()

    def merge_from(layer):
        # Check a single layer's scalers and add them to ``merged``.
        contribution = layer.get_lr_scalers()
        assert isinstance(contribution, OrderedDict)
        # A parameter may be scaled by at most one layer
        assert not any([param in merged for param in contribution])
        # Scaled keys must be parameters of this object
        assert all([param in all_params for param in contribution])
        merged.update(contribution)

    for layer in self.layers[:-1]:
        merge_from(layer)
    for layer in self.layers[-1]:
        merge_from(layer)

    assert all([isinstance(scale, float) for scale in merged.values()])
    return merged
def get_monitoring_channels(self, model, X, Y=None, **kwargs):
    """
    Gather the monitoring channels of every cost in the sum.

    Each cost's own channels are merged in. The cost is then evaluated
    (unsupervised costs get ``None`` for the targets) and, if the result is
    not ``None``, stored as ``term_<i>`` with ``_<name>`` appended when the
    expression is named.

    Parameters
    ----------
    model : object
        Passed through to every cost.
    X : object
        Inputs passed through to every cost.
    Y : object, optional
        Targets; required when ``self.supervised`` is true.

    Returns
    -------
    OrderedDict
        Channel name -> expression.

    Raises
    ------
    ValueError
        If ``Y`` is ``None`` while ``self.supervised`` is true.
    """
    if Y is None and self.supervised:
        raise ValueError("no targets provided while some of the " +
                         "costs in the sum are supervised costs")

    rval = OrderedDict()

    for i, cost in enumerate(self.costs):
        try:
            rval.update(cost.get_monitoring_channels(model, X, Y, **kwargs))
        except TypeError:
            # A single parenthesised argument prints the same text under
            # both Python 2 and Python 3.
            print('SumOfCosts.get_monitoring_channels encountered '
                  'TypeError while calling '
                  + str(type(cost)) + '.get_monitoring_channels')
            raise

        # Unsupervised costs never see the targets.
        Y_to_pass = Y if cost.supervised else None
        value = cost(model, X, Y_to_pass, **kwargs)
        if value is not None:
            name = ''
            if hasattr(value, 'name') and value.name is not None:
                name = '_' + value.name
            rval['term_' + str(i) + name] = value

    return rval
def get_monitoring_channels(self, model, X, Y=None, **kwargs):
    """
    Gather the monitoring channels of every cost in the sum.

    For each cost, its own channels are merged into the result. The cost is
    then called (targets are withheld from unsupervised costs) and a
    non-``None`` result is stored under ``term_<i>``, with ``_<name>``
    appended when the expression is named.

    Parameters
    ----------
    model : object
        Passed through to every cost.
    X : object
        Inputs passed through to every cost.
    Y : object, optional
        Targets; required when ``self.supervised`` is true.

    Returns
    -------
    OrderedDict
        Channel name -> expression.

    Raises
    ------
    ValueError
        If ``Y`` is ``None`` while ``self.supervised`` is true.
    """
    if Y is None and self.supervised:
        raise ValueError("no targets provided while some of the " +
                         "costs in the sum are supervised costs")

    rval = OrderedDict()

    for i, cost in enumerate(self.costs):
        try:
            rval.update(cost.get_monitoring_channels(model, X, Y, **kwargs))
        except TypeError:
            # The message is built first so that print receives one argument
            # and behaves the same under Python 2 and Python 3.
            message = ('SumOfCosts.get_monitoring_channels encountered '
                       'TypeError while calling '
                       + str(type(cost)) + '.get_monitoring_channels')
            print(message)
            raise

        Y_to_pass = Y
        if not cost.supervised:
            Y_to_pass = None

        value = cost(model, X, Y_to_pass, **kwargs)
        if value is not None:
            name = ''
            if hasattr(value, 'name') and value.name is not None:
                name = '_' + value.name
            rval['term_' + str(i) + name] = value

    return rval
def get_gradients(self, model, data, **kwargs):
    """
    Combine the gradients and updates of every cost in the sum.

    ``data`` is nested so that each cost receives its own portion. Each
    cost's gradients are scaled by the matching entry of ``self.coeffs``
    and accumulated per parameter. The costs' update dictionaries are
    merged.

    Returns
    -------
    grads : OrderedDict
        Parameter -> weighted sum of the gradient expressions.
    updates : OrderedDict
        Union of the updates requested by the individual costs.

    Raises
    ------
    ValueError
        If a cost reports a gradient for something that is not one of
        ``model.get_params()``.
    """
    indiv_results = []
    composite_specs, mapping = self.get_composite_specs_and_mapping(model)
    nested_data = mapping.nest(data)
    for cost, cost_data in safe_zip(self.costs, nested_data):
        result = cost.get_gradients(model, cost_data, **kwargs)
        indiv_results.append(result)

    grads = OrderedDict()
    updates = OrderedDict()
    params = model.get_params()

    for coeff, packed in zip(self.coeffs, indiv_results):
        g, u = packed
        for param in g:
            if param not in params:
                raise ValueError("A shared variable (" + str(param) +
                                 ") that is not a parameter appeared "
                                 "in a cost gradient dictionary.")
        for param in g:
            assert param.ndim == g[param].ndim
            v = coeff * g[param]
            if param not in grads:
                grads[param] = v
            else:
                grads[param] = grads[param] + v
            assert grads[param].ndim == param.ndim
        # Costs must neither fight over a state variable nor update a
        # parameter directly.
        assert not any([state in updates for state in u])
        assert not any([state in params for state in u])
        updates.update(u)

    return grads, updates
def get_monitoring_channels_from_state(self, state, target=None):
    """
    Deprecated way of obtaining this layer's monitoring channels.

    Emits a deprecation warning, then returns row/column norm statistics of
    ``self.W`` and statistics of the per-row maximum of ``state``. When
    ``target`` is given, ``misclass`` and ``nll`` are added as well.

    Returns
    -------
    OrderedDict
        Channel name -> symbolic expression.
    """
    warnings.warn("Layer.get_monitoring_channels_from_state is "
                  "deprecated. Use get_layer_monitoring_channels "
                  "instead. Layer.get_monitoring_channels_from_state "
                  "will be removed on or after september 24th 2014",
                  stacklevel=2)

    # Channels that do not depend on the state
    weights = self.W
    assert weights.ndim == 2
    squared = T.sqr(weights)

    channels = OrderedDict()
    for prefix, axis in (('row', 1), ('col', 0)):
        norms = T.sqrt(squared.sum(axis=axis))
        channels[prefix + '_norms_min'] = norms.min()
        channels[prefix + '_norms_mean'] = norms.mean()
        channels[prefix + '_norms_max'] = norms.max()

    # Statistics of the largest entry of each row of the state
    row_max = state.max(axis=1)
    channels['mean_max_class'] = row_max.mean()
    channels['max_max_class'] = row_max.max()
    channels['min_max_class'] = row_max.min()

    if target is None:
        return channels

    predicted = T.argmax(state, axis=1)
    actual = T.argmax(target, axis=1)
    error_rate = T.neq(actual, predicted).mean()
    channels['misclass'] = T.cast(error_rate, config.floatX)
    channels['nll'] = self.cost(Y_hat=state, Y=target)
    return channels
def get_layer_monitoring_channels(self, state_below=None, state=None,
                                  targets=None):
    """
    Return monitoring channels for this layer.

    The result holds min/mean/max of the kernel norms, followed by the
    parent class's channels and then the channels reported by
    ``self.nonlin``.

    Returns
    -------
    OrderedDict
        Channel name -> symbolic expression.
    """
    W, = self.transformer.get_params()

    assert W.ndim == 4

    sq_W = T.sqr(W)

    # One L2 norm per slice along the first axis of the 4D weight tensor.
    row_norms = T.sqrt(sq_W.sum(axis=(1, 2, 3)))

    rval = OrderedDict([
        ('kernel_norms_min', row_norms.min()),
        ('kernel_norms_mean', row_norms.mean()),
        ('kernel_norms_max', row_norms.max()),
    ])

    # Forward to the parent's method of the same name rather than to the
    # deprecated get_monitoring_channels_from_state.
    # NOTE(review): assumes the parent class defines
    # get_layer_monitoring_channels -- confirm against the base Layer.
    orval = super(CorrMMElemwise, self).get_layer_monitoring_channels(
        state_below, state, targets)
    rval.update(orval)

    cst = self.cost
    orval = self.nonlin.get_monitoring_channels_from_state(state,
                                                           targets,
                                                           cost_fn=cst)
    rval.update(orval)

    return rval
def get_layer_monitoring_channels(self, state_below=None, state=None,
                                  targets=None):
    """
    Return monitoring channels for a layer with class and cluster weights.

    When ``self.no_affine`` is set, no channels are reported. Otherwise
    row/column norm statistics of ``self.W_cluster`` and ``self.W_class``
    are returned. If a state (or an input from which it can be computed) is
    available, statistics of the per-row maximum class probability are
    added; if ``targets`` is also given, ``nll``, ``perplexity`` and
    ``entropy`` are added.

    Parameters
    ----------
    state_below : optional
        Layer input; used to compute ``state`` through ``self.fprop`` when
        ``state`` is not supplied.
    state : optional
        A ``(probclass, probcluster)`` pair.
    targets : optional
        Targets. Defaults to ``None`` (the previous default was the
        ``NotImplementedError`` class, which made the ``is not None`` test
        below always succeed).

    Returns
    -------
    OrderedDict
        Channel name -> symbolic expression.
    """
    if self.no_affine:
        return OrderedDict()

    W_class = self.W_class
    W_cluster = self.W_cluster

    assert W_class.ndim == 3
    assert W_cluster.ndim == 2

    sq_W = T.sqr(W_cluster)
    sq_W_class = T.sqr(W_class)

    row_norms = T.sqrt(sq_W.sum(axis=1))
    col_norms = T.sqrt(sq_W.sum(axis=0))

    row_norms_class = T.sqrt(sq_W_class.sum(axis=1))
    col_norms_class = T.sqrt(sq_W_class.sum(axis=0))

    rval = OrderedDict([
        ('row_norms_min', row_norms.min()),
        ('row_norms_mean', row_norms.mean()),
        ('row_norms_max', row_norms.max()),
        ('col_norms_min', col_norms.min()),
        ('col_norms_mean', col_norms.mean()),
        ('col_norms_max', col_norms.max()),
        ('class_row_norms_min', row_norms_class.min()),
        ('class_row_norms_mean', row_norms_class.mean()),
        ('class_row_norms_max', row_norms_class.max()),
        ('class_col_norms_min', col_norms_class.min()),
        ('class_col_norms_mean', col_norms_class.mean()),
        ('class_col_norms_max', col_norms_class.max()),
    ])

    if (state_below is not None) or (state is not None):
        if state is None:
            # NOTE(review): fprop is handed the targets as well; whether it
            # accepts targets=None is not visible here -- confirm.
            state = self.fprop(state_below, targets)
        probclass, probcluster = state
        mx = probclass.max(axis=1)
        rval.update(OrderedDict([('mean_max_class', mx.mean()),
                                 ('max_max_class', mx.max()),
                                 ('min_max_class', mx.min())]))
        if targets is not None:
            rval['nll'] = self.cost(Y=targets, Y_hat=(probclass, probcluster))
            # 10 ** (nll / ln 10) is exp(nll); nll / ln 2 converts to bits.
            rval['perplexity'] = \
                10 ** (rval['nll'] / np.log(10).astype('float32'))
            rval['entropy'] = rval['nll'] / np.log(2).astype('float32')

    return rval
def get_gradients(self, model, data, **kwargs):
    """
    Differentiate ``self._cost`` with the sampler's particles held constant.

    Returns
    -------
    gradients : OrderedDict
        Maps each model parameter to its gradient expression.
    updates : OrderedDict
        The updates reported by ``self.sampler.updates()``.
    """
    objective = self._cost(model, data, **kwargs)

    param_list = list(model.get_params())
    # No gradient flows through the sampler's particles.
    grad_list = T.grad(objective, param_list,
                       disconnected_inputs="ignore",
                       consider_constant=[self.sampler.particles])
    gradients = OrderedDict(izip(param_list, grad_list))

    updates = OrderedDict()
    updates.update(self.sampler.updates())
    return gradients, updates
def get_layer_monitoring_channels(self, state_below=None, state=None,
                                  targets=None):
    """
    Report statistics of ``S = exp(-self.D)``.

    The channels are the standard deviation and mean of ``S`` and the number
    of entries exceeding the mean by more than one and by more than two
    standard deviations. The arguments are not used.

    Returns
    -------
    OrderedDict
        Channel name -> symbolic expression.
    """
    S = T.exp(-self.D)

    channels = OrderedDict()
    channels['S_stddev'] = S.std()
    channels['S_mean'] = S.mean()
    # Multiplying by 1.0 turns the integer counts into floats.
    above_one = S > (S.mean() + S.std())
    channels['S_over_1_stdev'] = 1.0 * above_one.sum()
    above_two = S > (S.mean() + 2 * S.std())
    channels['S_over_2_stdev'] = 1.0 * above_two.sum()
    return channels
def get_gradients(self, model, data, **kwargs):
    """
    Compute gradients of ``self._cost`` while treating the sampler's
    particles as constants.

    Returns
    -------
    gradients : OrderedDict
        Parameter -> gradient expression.
    updates : OrderedDict
        Whatever ``self.sampler.updates()`` returns, copied into a new
        ordered dictionary.
    """
    cost_expr = self._cost(model, data, **kwargs)
    model_params = list(model.get_params())

    constants = [self.sampler.particles]
    param_grads = T.grad(cost_expr, model_params,
                         disconnected_inputs='ignore',
                         consider_constant=constants)

    gradients = OrderedDict(izip(model_params, param_grads))

    # The sampler must be advanced alongside the parameter step.
    updates = OrderedDict()
    updates.update(self.sampler.updates())

    return gradients, updates
def get_layer_monitoring_channels(self, state_below=None, state=None,
                                  targets=None):
    """
    Return state-dependent monitoring channels.

    Nothing is reported unless ``state`` or ``state_below`` is given (weight
    norm channels are disabled in this version). With a state available,
    statistics of the per-row maximum are reported; with ``targets`` as
    well, ``misclass`` and ``nll`` are added.

    Returns
    -------
    OrderedDict
        Channel name -> symbolic expression.
    """
    channels = OrderedDict()

    if state_below is None and state is None:
        return channels

    if state is None:
        state = self.fprop(state_below)

    row_max = state.max(axis=1)
    channels.update(OrderedDict([('mean_max_class', row_max.mean()),
                                 ('max_max_class', row_max.max()),
                                 ('min_max_class', row_max.min())]))

    if targets is not None:
        predicted = self.target_convert(T.argmax(state, axis=1))
        # Assumes the targets are one-hot with entries in [0, 1]
        actual = self.target_convert(T.argmax(targets, axis=1))
        error_rate = T.neq(actual, predicted).mean()
        channels['misclass'] = T.cast(error_rate, config.floatX)
        channels['nll'] = self.cost(Y_hat=state, Y=targets)

    return channels
def get_gradients(self, model, data, **kwargs):
    """
    Combine the gradients of the two costs returned by ``model.cost_from_X``.

    The two costs (named ``cost_cd`` and ``cost_ci`` here) are each
    differentiated with respect to all model parameters, scaled by
    ``self.coeff_cd`` and ``self.coeff_ci`` respectively, and summed per
    parameter. When ``self.zero_ci_grad_for_cd`` is set, the ``M`` and ``m``
    attributes of the last layer are held constant while differentiating
    the first cost.

    Returns
    -------
    grads : OrderedDict
        Parameter -> weighted sum of the two gradient expressions.
    updates : OrderedDict
        Always empty; neither cost contributes updates.
    """
    cost_cd, cost_ci = model.cost_from_X(data)

    params_dict = model.get_params()
    params = list(params_dict)

    zero_grads = []
    if self.zero_ci_grad_for_cd:
        # TODO: find a less explicit way to select these, e.g. from the
        # parameter dictionary alone.
        top_layer = model.layers[-1]
        assert top_layer.M in params_dict
        assert top_layer.m in params_dict
        zero_grads = [top_layer.M, top_layer.m]

    grads_cd = T.grad(cost_cd, params, disconnected_inputs='ignore',
                      consider_constant=zero_grads)
    grads_ci = T.grad(cost_ci, params, disconnected_inputs='ignore')

    gradients_cd = OrderedDict(izip(params, grads_cd))
    gradients_ci = OrderedDict(izip(params, grads_ci))

    indiv_results = [(gradients_cd, OrderedDict()),
                     (gradients_ci, OrderedDict())]

    grads = OrderedDict()
    updates = OrderedDict()

    for coeff, packed in zip([self.coeff_cd, self.coeff_ci], indiv_results):
        g, u = packed
        for param in g:
            if param not in params_dict:
                raise ValueError("A shared variable ("+str(param)+") that is not a parameter appeared in a cost gradient dictionary.")
        for param in g:
            assert param.ndim == g[param].ndim
            v = coeff * g[param]
            if param not in grads:
                grads[param] = v
            else:
                grads[param] = grads[param] + v
            assert grads[param].ndim == param.ndim
        assert not any([state in updates for state in u])
        assert not any([state in params_dict for state in u])
        updates.update(u)

    return grads, updates
def get_layer_monitoring_channels(self, state_below=None, state=None,
                                  targets=None):
    """
    Return the monitoring channels that depend on the layer's state.

    Weight-norm channels are disabled in this version, so the result is
    empty unless ``state`` or ``state_below`` is supplied. Given a state,
    mean/max/min of the per-row maximum are reported; given ``targets`` too,
    ``misclass`` and ``nll`` are added.

    Returns
    -------
    OrderedDict
        Channel name -> symbolic expression.
    """
    rval = OrderedDict()

    have_state = (state_below is not None) or (state is not None)
    if have_state:
        output = self.fprop(state_below) if state is None else state

        peak = output.max(axis=1)
        rval['mean_max_class'] = peak.mean()
        rval['max_max_class'] = peak.max()
        rval['min_max_class'] = peak.min()

        if targets is not None:
            y_hat = self.target_convert(T.argmax(output, axis=1))
            # Assumes the targets are one-hot with entries in [0, 1]
            y = self.target_convert(T.argmax(targets, axis=1))
            mismatch = T.neq(y, y_hat).mean()
            rval['misclass'] = T.cast(mismatch, config.floatX)
            rval['nll'] = self.cost(Y_hat=output, Y=targets)

    return rval
def get_layer_monitoring_channels(self, state_below=None, state=None,
                                  targets=None):
    """
    Return monitoring channels for this layer.

    Unless ``self.no_affine`` is set, row/column norm statistics of
    ``self.W`` are reported. (Previously the ``no_affine`` check was a no-op
    and ``self.W`` was read unconditionally.) If a state, or an input from
    which it can be computed, is available, statistics of the per-row
    maximum are added; with ``targets`` as well, ``misclass``, ``nll`` and
    ``perplexity`` are added.

    Returns
    -------
    OrderedDict
        Channel name -> symbolic expression.
    """
    rval = OrderedDict()

    # Channels that do not require state information
    if not self.no_affine:
        W = self.W

        assert W.ndim == 2

        sq_W = T.sqr(W)

        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))

        rval.update(OrderedDict([('row_norms_min', row_norms.min()),
                                 ('row_norms_mean', row_norms.mean()),
                                 ('row_norms_max', row_norms.max()),
                                 ('col_norms_min', col_norms.min()),
                                 ('col_norms_mean', col_norms.mean()),
                                 ('col_norms_max', col_norms.max()), ]))

    if (state_below is not None) or (state is not None):
        if state is None:
            state = self.fprop(state_below)

        mx = state.max(axis=1)

        rval.update(OrderedDict([('mean_max_class', mx.mean()),
                                 ('max_max_class', mx.max()),
                                 ('min_max_class', mx.min())]))

        if targets is not None:
            y_hat = T.argmax(state, axis=1)
            y = T.argmax(targets, axis=1)
            misclass = T.neq(y, y_hat).mean()
            misclass = T.cast(misclass, config.floatX)
            rval['misclass'] = misclass
            rval['nll'] = self.cost(Y_hat=state, Y=targets)
            # 2 ** (nll / ln 2) is exp(nll)
            rval['perplexity'] = 2 ** (rval['nll'] / T.log(2))

    return rval
def get_gradients(self, model, data, **kwargs):
    """
    Build CD-k style gradients with the Gibbs chain started at ``data``.

    ``model.gibbs_vhv`` (four outputs per step here) is scanned for
    ``self.k`` steps. The last visible sample forms the negative phase of a
    free-energy difference.

    Returns
    -------
    gradients : OrderedDict
        Maps each model parameter to its gradient expression.
    updates : OrderedDict
        The updates produced by ``theano.scan``.
    """
    pos_v = data

    # Only the fourth output (the visible sample) is fed back into the chain.
    scan_outputs, scan_updates = theano.scan(
        fn=model.gibbs_vhv,
        sequences=None,
        outputs_info=[None, None, None, pos_v],
        non_sequences=None,
        n_steps=self.k)
    _, _, _, v_sample = scan_outputs
    neg_v = v_sample[-1]

    data_energy = model.free_energy(pos_v).mean()
    sample_energy = model.free_energy(neg_v).mean()
    cost = -(-data_energy + sample_energy)

    params = list(model.get_params())
    grads = T.grad(cost, params,
                   disconnected_inputs='ignore',
                   consider_constant=[pos_v, neg_v])

    gradients = OrderedDict(izip(params, grads))
    updates = OrderedDict(scan_updates)
    return gradients, updates
def get_gradients(self, model, X, Y=None, **kwargs):
    """
    Combine the gradients and updates of every cost in the sum.

    Each cost is asked for its gradients (targets are withheld from
    unsupervised costs). The gradients are scaled by the matching entry of
    ``self.coeffs`` and accumulated per parameter; the update dictionaries
    are merged.

    Returns
    -------
    grads : OrderedDict
        Parameter -> weighted sum of the gradient expressions.
    updates : OrderedDict
        Union of the updates requested by the individual costs.

    Raises
    ------
    ValueError
        If ``Y`` is ``None`` while ``self.supervised`` is true, or if a cost
        reports a gradient for something that is not a model parameter.
    """
    if Y is None and self.supervised:
        raise ValueError("no targets provided while some of the " +
                         "costs in the sum are supervised costs")

    indiv_results = [
        cost.get_gradients(model, X, Y if cost.supervised else None, **kwargs)
        for cost in self.costs
    ]

    grads = OrderedDict()
    updates = OrderedDict()
    params = model.get_params()

    for coeff, (cost_grads, cost_updates) in zip(self.coeffs, indiv_results):
        # Validate every key before accumulating anything from this cost.
        for param in cost_grads:
            if param not in params:
                raise ValueError(
                    "A shared variable ("
                    + str(param)
                    + ") that is not a parameter appeared in a cost gradient dictionary."
                )

        for param in cost_grads:
            assert param.ndim == cost_grads[param].ndim
            weighted = coeff * cost_grads[param]
            if param in grads:
                grads[param] = grads[param] + weighted
            else:
                grads[param] = weighted
            assert grads[param].ndim == param.ndim

        assert not any([state in updates for state in cost_updates])
        assert not any([state in params for state in cost_updates])
        updates.update(cost_updates)

    return grads, updates
def get_gradients(self, model, X, Y=None, **kwargs):
    """
    Combine the gradients and updates of every cost in the sum.

    Supervised costs receive ``Y``; unsupervised ones receive ``None``. The
    per-cost gradients are multiplied by the matching coefficient from
    ``self.coeffs`` and summed per parameter, and the per-cost updates are
    merged into one dictionary.

    Returns
    -------
    grads : OrderedDict
        Parameter -> weighted sum of the gradient expressions.
    updates : OrderedDict
        Union of the updates requested by the individual costs.

    Raises
    ------
    ValueError
        If ``Y`` is ``None`` while ``self.supervised`` is true, or if a cost
        reports a gradient for something that is not a model parameter.
    """
    if Y is None and self.supervised:
        raise ValueError("no targets provided while some of the " +
                         "costs in the sum are supervised costs")

    per_cost = []
    for cost in self.costs:
        targets_for_cost = Y if cost.supervised else None
        per_cost.append(
            cost.get_gradients(model, X, targets_for_cost, **kwargs))

    total_grads = OrderedDict()
    total_updates = OrderedDict()
    params = model.get_params()

    for coeff, result in zip(self.coeffs, per_cost):
        g, u = result

        for param in g:
            if param not in params:
                raise ValueError("A shared variable ("+str(param)+") that is not a parameter appeared in a cost gradient dictionary.")

        for param in g:
            assert param.ndim == g[param].ndim
            scaled = coeff * g[param]
            previous = total_grads.get(param) if param in total_grads else None
            if previous is None:
                total_grads[param] = scaled
            else:
                total_grads[param] = previous + scaled
            assert total_grads[param].ndim == param.ndim

        # A state may be updated by one cost only, and never be a parameter.
        assert not any([state in total_updates for state in u])
        assert not any([state in params for state in u])
        total_updates.update(u)

    return total_grads, total_updates
def get_gradients(self, model, data, **kwargs):
    """
    Build CD-k style gradients from an energy difference.

    ``model.gibbs_vhv`` (six outputs per step) is scanned for ``self.k``
    steps starting at ``data``. The positive phase pairs ``data`` with the
    first step's first output. The negative phase pairs the last visible
    samples with the first result of ``model.sample_hp_given_v`` called with
    ``sample=False``.

    Returns
    -------
    gradients : OrderedDict
        Maps each model parameter to its gradient expression.
    updates : OrderedDict
        The updates produced by ``theano.scan``.
    """
    pos_v = data

    scan_outputs, scan_updates = theano.scan(
        fn=model.gibbs_vhv,
        sequences=None,
        outputs_info=[None, None, None, None, None, pos_v],
        non_sequences=None,
        n_steps=self.k)
    h_mean, _, _, _, _, vis_samples = scan_outputs

    pos_h = h_mean[0]
    neg_v = vis_samples[-1]
    neg_h = model.sample_hp_given_v(v=neg_v, sample=False)[0]

    positive_energy = model.energy(pos_v, pos_h).mean()
    negative_energy = model.energy(neg_v, neg_h).mean()
    cost = -(-positive_energy + negative_energy)

    params = list(model.get_params())
    # All four phase variables are treated as constants.
    grads = T.grad(cost, params,
                   disconnected_inputs='ignore',
                   consider_constant=[pos_v, pos_h, neg_v, neg_h])

    gradients = OrderedDict(izip(params, grads))
    updates = OrderedDict(scan_updates)
    return gradients, updates
def get_gradients(self, model, data, **kwargs):
    """
    The CD algorithm approximates the gradient instead of differentiating
    the objective directly, which is why get_gradients() is overridden.

    ``model.gibbs_vhv`` (four outputs per step) is scanned for ``self.k``
    steps starting at ``data``. The positive phase pairs ``data`` with the
    first step's first output. The negative phase pairs the last visible
    sample with ``model.propup`` of it.

    Returns
    -------
    gradients : OrderedDict
        Maps each model parameter to its gradient expression.
    updates : OrderedDict
        The updates produced by ``theano.scan``.
    """
    pos_v = data

    scan_outputs, scan_updates = theano.scan(
        fn=model.gibbs_vhv,
        sequences=None,
        outputs_info=[None, None, None, pos_v],
        non_sequences=None,
        n_steps=self.k)
    h_mean, _, _, v_sample = scan_outputs

    pos_h = h_mean[0]
    neg_v = v_sample[-1]
    neg_h = model.propup(neg_v)

    data_energy = model.energy(pos_v, pos_h).mean()
    model_energy = model.energy(neg_v, neg_h).mean()
    cost = -(-data_energy + model_energy)

    params = list(model.get_params())
    grads = T.grad(cost, params,
                   disconnected_inputs='ignore',
                   consider_constant=[pos_v, pos_h, neg_v, neg_h])

    gradients = OrderedDict(izip(params, grads))
    updates = OrderedDict(scan_updates)
    return gradients, updates
def get_layer_monitoring_channels(self, state_below=None, state=None,
                                  targets=None):
    """
    Return monitoring channels for this layer.

    Row/column norm statistics of ``self.W`` are always reported. If a
    state, or an input from which it can be computed, is available,
    statistics of the per-row maximum are added; with ``targets`` as well,
    ``misclass`` and ``nll`` are added.

    Returns
    -------
    OrderedDict
        Channel name -> symbolic expression.
    """
    # Channels that do not require state information
    weights = self.W
    assert weights.ndim == 2
    squared = T.sqr(weights)

    channels = OrderedDict()
    for prefix, axis in (('row', 1), ('col', 0)):
        norms = T.sqrt(squared.sum(axis=axis))
        channels[prefix + '_norms_min'] = norms.min()
        channels[prefix + '_norms_mean'] = norms.mean()
        channels[prefix + '_norms_max'] = norms.max()

    if state_below is None and state is None:
        return channels

    if state is None:
        state = self.fprop(state_below)

    row_max = state.max(axis=1)
    channels['mean_max_class'] = row_max.mean()
    channels['max_max_class'] = row_max.max()
    channels['min_max_class'] = row_max.min()

    if targets is not None:
        predicted = T.argmax(state, axis=1)
        actual = T.argmax(targets, axis=1)
        error_rate = T.neq(actual, predicted).mean()
        channels['misclass'] = T.cast(error_rate, config.floatX)
        channels['nll'] = self.cost(Y_hat=state, Y=targets)

    return channels
def _get_givens_subset(self, subset, batch_slice):
    """
    Translate a batch slice of start and end indices into the actual data
    from the given subset.

    Parameters
    ----------
    subset : int
        The subset to use - determined in opendeep.data.datasets as TRAIN,
        VALID, or TEST attributes.
    batch_slice : symbolic slice
        The symbolic slice to grab from the data.

    Returns
    -------
    OrderedDict or None
        The givens to provide to a function, mapping each input variable to
        the batch of data from the dataset: (input_variable: data[batch]).
        ``None`` if the dataset has no data for the subset.

    Raises
    ------
    AssertionError
        If the model requires targets but the subset has no labels.
    """
    # First get the lists of input variables the model requires - inputs
    # and targets.
    model_inputs = raise_to_list(self.model.get_inputs())
    model_targets = raise_to_list(self.model.get_targets())

    # Query the dataset once and reuse the result.
    subset_contents = self.dataset.getSubset(subset)
    if subset_contents[0] is None:
        log.warning("Dataset doesn't have subset %s" % get_subset_strings(subset))
        return None

    data, labels = subset_contents
    # Create the givens as pairs of (input_variable: sliced_data).
    givens = OrderedDict(zip(model_inputs, [data[batch_slice]]))
    # Include labels as well if they are required by the model.
    if model_targets is not None and len(model_targets) > 0:
        if labels is None:
            log.error("No labels in the dataset!")
            # Call form of raise: valid on both Python 2 and Python 3.
            raise AssertionError("No labels in the dataset!")
        givens.update(OrderedDict(zip(model_targets, [labels[batch_slice]])))

    return givens
class SGD_Optimizer(object):
    """
    Minibatch stochastic gradient descent, optionally with momentum, built
    on Theano.

    The constructor compiles two functions: ``self.f`` (one training step;
    returns the costs and applies the parameter updates) and
    ``self.calc_cost`` (returns the costs without updating anything).
    """

    def __init__(self, params, inputs, costs, updates_old=None,
                 consider_constant=None, momentum=True):
        """
        params: parameters of the model
        inputs: list of symbolic inputs to the graph
        costs: list of costs to be evaluated. The first element MUST be the
            objective.
        updates_old: OrderedDict from previous graphs that need to be
            accounted for by SGD, typically when scan is used.
        consider_constant: list of theano variables that are passed on to
            the grad method. Typically RBM. Defaults to an empty list.
        momentum: whether to build the momentum variant of the update.
        """
        assert (isinstance(costs, list)), "The costs given to the SGD class must be a list, even for one element."
        self.inputs = inputs
        self.params = params
        self.momentum = momentum
        if self.momentum:
            # One zero-initialised velocity buffer per parameter.
            self.params_mom = []
            for param in self.params:
                mom_name = None if param.name is None else param.name + '_mom'
                param_init = theano.shared(
                    value=numpy.zeros(param.get_value().shape,
                                      dtype=theano.config.floatX),
                    name=mom_name)
                self.params_mom.append(param_init)
        self.costs = costs
        self.num_costs = len(costs)
        self.updates_old = updates_old
        # A fresh list per instance; a mutable default would be shared.
        self.consider_constant = \
            [] if consider_constant is None else consider_constant
        self.build_train_fn()

    def build_train_fn(self,):
        """Build the update rules and compile the training/cost functions."""
        self.lr_theano = T.scalar('lr')
        self.grad_inputs = self.inputs + [self.lr_theano]
        if self.momentum:
            self.mom_theano = T.scalar('mom')
            self.grad_inputs = self.grad_inputs + [self.mom_theano]

        self.gparams = T.grad(self.costs[0], self.params,
                              consider_constant=self.consider_constant)
        if not self.momentum:
            print('Building SGD optimization graph without momentum')
            updates = OrderedDict(
                (param, param - self.lr_theano * gparam)
                for param, gparam in zip(self.params, self.gparams))
        else:
            print('Building SGD optimization graph with momentum')
            updates = OrderedDict()
            for param, param_mom, gparam in zip(self.params, self.params_mom,
                                                self.gparams):
                param_inc = self.mom_theano * param_mom - self.lr_theano * gparam
                updates[param_mom] = param_inc
                updates[param] = param + param_inc

        self.calc_cost = theano.function(self.inputs, self.costs)

        if self.updates_old:
            # Work on a copy so the caller's dictionary is not modified in
            # case it belongs to the model.
            merged = copy.copy(self.updates_old)
        else:
            merged = OrderedDict()
        merged.update(updates)
        self.updates_old = merged

        self.f = theano.function(self.grad_inputs, self.costs,
                                 updates=self.updates_old)

    def train(self, train_set, valid_set=None, learning_rate=0.1,
              num_epochs=500, save=False, output_folder=None, lr_update=None,
              mom_rate=0.9):
        """
        Run up to ``num_epochs`` passes over ``train_set``.

        After each epoch the mean training costs are printed. Without a
        validation set the absolute mean of the first cost (the objective)
        is tracked as the best cost; otherwise ``perform_validation`` is
        called. When ``lr_update`` is truthy the learning rate is annealed
        after every epoch. A ``KeyboardInterrupt`` stops training cleanly.
        """
        self.best_cost = numpy.inf
        self.init_lr = learning_rate
        self.lr = numpy.array(learning_rate)
        self.mom_rate = mom_rate
        self.output_folder = output_folder
        self.train_set = train_set
        self.valid_set = valid_set
        self.save = save
        self.lr_update = lr_update
        try:
            for u in range(num_epochs):
                cost = []
                for i in self.train_set.iterate(True):
                    inputs = i + [self.lr]
                    if self.momentum:
                        inputs = inputs + [self.mom_rate]
                    cost.append(self.f(*inputs))
                mean_costs = numpy.mean(cost, axis=0)
                print(' Epoch %i ' % (u + 1))
                print('***Train Results***')
                for i in range(self.num_costs):
                    print("Cost %i: %f" % (i, mean_costs[i]))

                if not valid_set:
                    # Track the objective (first cost) on the training set.
                    this_cost = numpy.absolute(mean_costs)[0]
                    if this_cost < self.best_cost:
                        self.best_cost = this_cost
                        print('Best Params!')
                        if save:
                            self.save_model()
                else:
                    self.perform_validation()
                sys.stdout.flush()

                if lr_update:
                    self.update_lr(u + 1, begin_anneal=1)

        except KeyboardInterrupt:
            print('Training interrupted.')

    def perform_validation(self,):
        """Evaluate the costs on the validation set and track the best one."""
        cost = []
        for i in self.valid_set.iterate(True):
            cost.append(self.calc_cost(*i))
        mean_costs = numpy.mean(cost, axis=0)
        print('***Validation Results***')
        for i in range(self.num_costs):
            print("Cost %i: %f" % (i, mean_costs[i]))

        # The second cost is used as the metric (accuracy, per the original
        # author); fall back to the objective when it is the only cost.
        metric_index = 1 if self.num_costs > 1 else 0
        this_cost = numpy.absolute(mean_costs)[metric_index]
        if this_cost < self.best_cost:
            self.best_cost = this_cost
            print('Best Params!')
            if self.save:
                self.save_model()

    def save_model(self,):
        """Pickle copies of the current parameter values to disk."""
        best_params = [param.get_value().copy() for param in self.params]

        if not self.output_folder:
            save_path = 'best_params.pickle'
        else:
            if not os.path.exists(self.output_folder):
                os.makedirs(self.output_folder)
            save_path = os.path.join(self.output_folder, 'best_params.pickle')
        # The context manager makes sure the file is flushed and closed.
        with open(save_path, 'wb') as handle:
            cPickle.dump(best_params, handle)

    def update_lr(self, count, update_type='annealed', begin_anneal=500.,
                  min_lr=0.01, decay_factor=1.2):
        """
        Recompute ``self.lr`` from ``self.init_lr`` for epoch ``count``.

        'annealed': ``init_lr * min(1, begin_anneal / count)``.
        'exponential': ``init_lr / decay_factor ** count``, floored at
        ``min_lr``.
        """
        if update_type == 'annealed':
            scale_factor = float(begin_anneal) / count
            self.lr = self.init_lr * min(1., scale_factor)
        if update_type == 'exponential':
            new_lr = float(self.init_lr) / (decay_factor ** count)
            if new_lr < min_lr:
                self.lr = min_lr
            else:
                self.lr = new_lr
def setup_training(self):
    """
    Compile the Theano functions used for training.

    The following attributes are set:

    * ``fprop_func`` - forward pass of ``self.cnn`` on the first input.
    * ``grad_update_func`` - stores the current gradients in
      ``self.grad_vals`` so that they are computed only once per step.
    * ``training`` - applies the RMS-scaled gradient step plus the momentum
      term to the parameters, using the stored gradients.
    * ``rmsprop_update`` - updates ``self.rms_vals`` as a 0.9/0.1 running
      average of the squared stored gradients.
    * ``momentum_update`` - stores the step that was applied in
      ``self.momentum_vals``.
    * ``dimshuf_func`` - moves the first axis of a 4D tensor to the end.
    * ``grads_func`` / ``cost_function`` - gradients and cost, for
      debugging.

    All update mappings are built directly as ``OrderedDict`` so that the
    order of updates is deterministic.
    """
    training_batch_size = self.mini_batch_size

    cost = self.cnn.get_default_cost()

    data_specs = cost.get_data_specs(self.cnn)
    mapping = DataSpecsMapping(data_specs)
    space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
    source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

    # One symbolic float32 batch per (space, source) pair of the data specs
    theano_args = []
    for space, source in safe_zip(space_tuple, source_tuple):
        name = '%s[%s]' % (self.__class__.__name__, source)
        arg = space.make_theano_batch(name=name,
                                      batch_size=training_batch_size).astype("float32")
        theano_args.append(arg)
    theano_args = tuple(theano_args)

    y_hat = self.cnn.fprop(theano_args[0])

    # Function used for faster fprop
    self.fprop_func = theano.function([theano_args[0]], y_hat)

    cost = self.cnn.cost(theano_args[1], y_hat)

    params = list(self.cnn.get_params())
    grads = T.grad(cost, params, disconnected_inputs='ignore')

    gradients = OrderedDict(izip(params, grads))

    rms_vals_dict = OrderedDict(izip(params, self.rms_vals))
    momentum_vals_dict = OrderedDict(izip(params, self.momentum_vals))
    grad_vals_dict = OrderedDict(izip(params, self.grad_vals))

    def scaled_grad(param):
        # Stored gradient divided by the root of its running mean square.
        return grad_vals_dict[param] / T.sqrt(rms_vals_dict[param] + 1e-8)

    # Gradients are calculated once and stored; the same values are then
    # used for the momentum, rmsprop and parameter updates.
    grad_update = OrderedDict(
        safe_zip(self.grad_vals, [gradients[param] for param in params]))
    self.grad_update_func = theano.function(theano_args,
                                            updates=grad_update,
                                            on_unused_input='ignore')

    updates = OrderedDict(safe_zip(
        params,
        [param - self.learning_rate * scaled_grad(param)
         + (self.momentum_step_size * momentum_vals_dict[param])
         for param in params]))

    # rmsprop update function
    rmsprop_updates = OrderedDict(safe_zip(
        self.rms_vals,
        [(rms_vals_dict[param] * .9) + (T.sqr(grad_vals_dict[param]) * .1)
         for param in params]))

    self.training = theano.function([], updates=updates,
                                    on_unused_input='ignore')
    self.rmsprop_update = theano.function([], updates=rmsprop_updates,
                                          on_unused_input='ignore')

    # momentum update function
    momentum_updates = OrderedDict(safe_zip(
        self.momentum_vals,
        [-self.learning_rate * scaled_grad(param)
         + (self.momentum_step_size * momentum_vals_dict[param])
         for param in params]))
    self.momentum_update = theano.function([], updates=momentum_updates,
                                           on_unused_input='ignore')

    # Function used for shuffling dimensions (the original author describes
    # the target layout as c01b)
    temp = T.tensor4()
    self.dimshuf_func = theano.function([temp], temp.dimshuffle(1, 2, 3, 0))

    # Functions to get grads and costs for debugging
    self.grads_func = theano.function(theano_args, grads)
    self.cost_function = theano.function(theano_args, cost)
def __init__(self, objective, params, inputs = None,
        param_constrainers = None, max_iter = -1,
        lr_scalers = None, verbose = 0, tol = None,
        init_alpha = None, min_init_alpha = 1e-3,
        reset_alpha = True, conjugate = False,
        reset_conjugate = True, gradients = None,
        gradient_updates = None, line_search_mode = None,
        accumulate = False, theano_function_mode=None):
    """
    Build and compile the Theano functions used by batch gradient descent.

    Every constructor argument is stored as an attribute of the same name
    (through ``self.__dict__.update(locals())``).  The constructor then
    compiles ``_compute_grad``, ``obj``, ``_cache_values``, ``_goto_alpha``
    and ``_normalize_grad`` and, when ``conjugate`` is true,
    ``_store_old_grad`` and ``_make_conjugate``.

    Parameters
    ----------
    objective : tensor_like
        A theano expression to be minimized. Should be a function of
        params and, if provided, inputs.
    params : list
        A list of theano shared variables. These are the optimization
        variables.
    inputs : list, optional
        A list of theano variables to serve as inputs to the graph.
        Defaults to an empty list when compiling.
    param_constrainers : list, optional
        A list of callables to be called on all updates dictionaries to
        be applied to params. This is how you implement constrained
        optimization.
    max_iter : int, optional
        Only stored on the instance here; not otherwise used by this
        constructor.
    lr_scalers : dict, optional
        Maps a parameter to a factor that multiplies the step size
        ``alpha`` for that parameter.
    verbose : int, optional
        When true, progress messages are printed while compiling.
    tol : float, optional
        Stored as ``self.tol``. When None, 1e-6 is used if
        ``objective.dtype`` is "float32" and 3e-7 otherwise.
    init_alpha : sequence of float, optional
        Stored, converted to a tuple of floats, as ``self.init_alpha``.
        When None it defaults to (.001, .005, .01, .05, .1), or to
        (.5, 1.) when ``line_search_mode`` is 'exhaustive'.
    min_init_alpha : float, optional
        Only stored on the instance here; not otherwise used by this
        constructor.
    reset_alpha : bool
        If True, reverts to using init_alpha after each call. If False,
        the final set of alphas is used at the start of the next call to
        minimize.
    conjugate : bool
        If True, tries to pick conjugate gradient directions. For the
        directions to be truly conjugate, you must use
        line_search_mode = 'exhaustive' and the objective function must
        be quadratic. Using line_search_mode = 'exhaustive' on a
        non-quadratic objective function implements nonlinear conjugate
        gradient descent.
    reset_conjugate : bool
        Has no effect unless conjugate == True. If
        reset_conjugate == True, reverts to direction of steepest descent
        for the first step in each call to minimize. Otherwise, tries to
        make the new search direction conjugate to the last one (even
        though the objective function might be totally different on each
        call to minimize).
    gradients : dict, optional
        If None, compute the gradients of obj using T.grad. Otherwise, a
        dictionary mapping from params to expressions for their gradients
        (this allows you to use approximate gradients computed with
        something other than T.grad). Parameters missing from the
        dictionary fall back to ``grad(objective, param)``.
    gradient_updates : dict, optional
        A dictionary of shared variable updates to run each time the
        gradient is computed.
    line_search_mode : str, optional
        Must be None or 'exhaustive' (asserted).
    accumulate : bool, optional
        When true, ``_compute_grad`` and ``obj`` are built with
        ``Accumulator`` instead of ``function``.
    theano_function_mode : optional
        Passed as ``mode`` to every ``function`` call made here.

    Notes
    -----
    Calling the ``minimize`` method with values for ``inputs`` will
    update ``params`` to minimize ``objective``.
    """

    # Stores every argument under its own name.  The set of attributes
    # therefore depends on the local names that exist at this exact point;
    # do not introduce locals above this line.
    self.__dict__.update(locals())
    del self.self

    if line_search_mode is None:
        if init_alpha is None:
            init_alpha = (.001, .005, .01, .05, .1)
    else:
        assert line_search_mode == 'exhaustive'
        if init_alpha is None:
            init_alpha = (.5, 1.)

    self.init_alpha = tuple([float(elem) for elem in init_alpha])

    # Only the local names are rebound below; self.inputs and
    # self.param_constrainers keep the values that were passed in.
    if inputs is None:
        inputs = []

    if param_constrainers is None:
        param_constrainers = []

    obj = objective

    self.verbose = verbose

    param_to_grad_sym = OrderedDict()
    param_to_grad_shared = OrderedDict()
    updates = OrderedDict()
    if self.gradient_updates is not None:
        updates.update(self.gradient_updates)

    self.params = [ param for param in params ]

    # For each parameter, allocate a shared buffer of the same shape
    # (initialised to zeros) that _compute_grad fills with the gradient.
    for param in params:
        if self.gradients is not None and param in self.gradients:
            g = self.gradients[param]
        else:
            g = grad(objective, param)
        param_to_grad_sym[param] = g
        if param.name is not None:
            param_name = param.name
        else:
            param_name = 'anon_param'
        grad_name = 'BatchGradientDescent.grad_' + param_name
        grad_shared = sharedX( param.get_value() * 0., name=grad_name)
        param_to_grad_shared[param] = grad_shared
        updates[grad_shared] = g

    self.param_to_grad_shared = param_to_grad_shared

    if self.verbose:
        print 'batch gradient class compiling gradient function'
    t1 = time.time()
    if self.accumulate:
        self._compute_grad = Accumulator(inputs, updates = updates)
    else:
        self._compute_grad = function(inputs, updates = updates, mode=self.theano_function_mode, name='BatchGradientDescent._compute_grad')
    if self.verbose:
        t2 = time.time()
        print 'done. Took ',t2-t1

    if self.verbose:
        print 'batch gradient class compiling objective function'
    if self.accumulate:
        self.obj = Accumulator(inputs, obj)
    else:
        self.obj = function(inputs, obj, mode=self.theano_function_mode, name='BatchGradientDescent.obj')

    if self.verbose:
        print 'done'

    # _cache_values snapshots each parameter; _goto_alpha then sets each
    # parameter to (snapshot - alpha * stored gradient), optionally scaling
    # alpha per parameter with lr_scalers.
    self.param_to_cache = OrderedDict()
    alpha = T.scalar(name = 'alpha')
    alpha.tag.test_value = np.cast[alpha.dtype](.01)
    cache_updates = OrderedDict()
    goto_updates = OrderedDict()
    for param in params:
        if param.name is None:
            param_name = 'anon_param'
        else:
            param_name = param.name
        cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
        self.param_to_cache[param] = sharedX(param.get_value(borrow=False), name=cache_name)
        cache_updates[self.param_to_cache[param]] = param
        cached = self.param_to_cache[param]
        g = self.param_to_grad_shared[param]
        if lr_scalers is not None and param in lr_scalers:
            scaled_alpha = alpha * lr_scalers[param]
        else:
            scaled_alpha = alpha
        mul = scaled_alpha * g
        diff = cached - mul
        goto_updates[param] = diff
    self._cache_values = function([], updates = cache_updates, mode=self.theano_function_mode, name='BatchGradientDescent._cache_values')
    assert isinstance(param_constrainers, (list, tuple))
    # Each constrainer is called with the updates dictionary before it is
    # compiled into _goto_alpha.
    for param_constrainer in param_constrainers:
        param_constrainer(goto_updates)
    self._goto_alpha = function([alpha], updates=goto_updates, mode=self.theano_function_mode, name='BatchGradientDescent._goto_alpha')

    # L2 norm taken jointly over all stored gradients; _normalize_grad
    # divides every stored gradient by it and returns the norm.
    norm = T.sqrt(sum([T.sqr(elem).sum() for elem in self.param_to_grad_shared.values()]))
    norm.name = 'BatchGradientDescent.norm'
    normalize_grad_updates = OrderedDict()
    for grad_shared in self.param_to_grad_shared.values():
        normalize_grad_updates[grad_shared] = grad_shared / norm

    # useful for monitoring
    self.ave_grad_size = sharedX(0.)
    self.new_weight = sharedX(1.)
    normalize_grad_updates[self.ave_grad_size] = self.new_weight * norm + (1.-self.new_weight) * self.ave_grad_size

    self._normalize_grad = function([], norm, updates=normalize_grad_updates, mode=self.theano_function_mode, name='BatchGradientDescent._normalize_grad')

    if self.conjugate:
        grad_shared = self.param_to_grad_shared.values()

        # One "old gradient" buffer per gradient buffer.
        grad_to_old_grad = OrderedDict()
        for elem in grad_shared:
            grad_to_old_grad[elem] = sharedX(elem.get_value(), 'old_'+elem.name)

        # Takes the norm as input and stores (stored gradient * norm).
        self._store_old_grad = function([norm], updates = OrderedDict([(grad_to_old_grad[g], g * norm)
            for g in grad_to_old_grad]), mode=self.theano_function_mode, name='BatchGradientDescent._store_old_grad')

        grad_ordered = list(grad_to_old_grad.keys())
        old_grad_ordered = [ grad_to_old_grad[g] for g in grad_ordered]

        def dot_product(x, y):
            # Sum of element-wise products over two aligned lists of tensors.
            return sum([ (x_elem * y_elem).sum() for x_elem, y_elem in safe_zip(x, y) ])

        # The 1e-7 term keeps the denominator away from zero.
        beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \
                (1e-7+dot_product(old_grad_ordered, old_grad_ordered))
        assert beta_pr.ndim == 0

        beta = T.maximum(beta_pr, 0.)

        """
        beta_pr is the Polak-Ribiere formula for beta.
        According to wikipedia, the beta to use for NCG is "a matter of heuristics or taste"
        but max(0, beta_pr) is "a popular choice... which provides direction reset automatically."
        (ie, it is meant to revert to steepest descent when you have traveled far enough that
        the objective function is behaving non-quadratically enough that the conjugate gradient
        formulas aren't working anymore)

        http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method
        """

        # NOTE(review): ``grad`` here is the name used above as the gradient
        # function, not one of the gradient buffers, so this check looks
        # vacuous -- confirm what was intended.
        assert grad not in grad_to_old_grad

        make_conjugate_updates = [(g, g + beta * grad_to_old_grad[g]) for g in grad_ordered]

        # When the compilation mode exposes a ``record`` attribute, a
        # description of each update is written to it.
        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            for v, u in make_conjugate_updates:
                mode.record.handle_line('BatchGradientDescent._make_conjugate var ' \
                        + var_descriptor(v) + '\n')
                mode.record.handle_line('BatchGradientDescent._make_conjugate update ' \
                        + var_descriptor(u) + '\n')

        self._make_conjugate = function([], updates=make_conjugate_updates,
                mode=self.theano_function_mode, name='BatchGradientDescent._make_conjugate')

        if mode is not None and hasattr(mode, 'record'):
            for output in self._make_conjugate.maker.fgraph.outputs:
                mode.record.handle_line('BatchGradientDescent._make_conjugate output ' \
                        + var_descriptor(output) + '\n')

    # Default tolerance depends on the dtype of the objective.
    if tol is None:
        if objective.dtype == "float32":
            self.tol = 1e-6
        else:
            self.tol = 3e-7
    else:
        self.tol = tol

    self.ave_step_size = sharedX(0.)
    self.ave_grad_mult = sharedX(0.)
def train(self, continue_training=False):
    """
    Compile the training and monitoring functions, then run training.

    One training function is compiled per cost returned by the model's
    ``get_train_cost`` (a list of costs allows layer-wise pretraining), and
    each of them is trained in turn by repeatedly calling
    ``self._perform_one_epoch`` until it reports that training should stop
    or a KeyboardInterrupt is received.  After each training function the
    best parameters (if any were recorded) are restored and the model
    parameters are saved.

    :param continue_training: when False, the learning rate decay (if
        present) and the model's decay parameters are reset before each
        training function is run.
    :type continue_training: bool
    :return: nothing
    :rtype: None
    """
    # the parameters being optimized come straight from the model
    self.params = self.model.get_params()
    log.info("%s params: %s", str(type(self.model)), str(self.params))

    # symbolic start/end indices that select one [mini]batch of a dataset
    data_idx = T.iscalar('data_index')
    data_end_idx = T.iscalar('data_end_index')
    batch_slice = slice(data_idx, data_end_idx)

    # shapes are lists: one entry per dataset handed to the optimizer
    train_data_shapes = raise_to_list(self.dataset.getDataShape(TRAIN))
    valid_data_shapes = raise_to_list(self.dataset.getDataShape(VALID))
    test_data_shapes = raise_to_list(self.dataset.getDataShape(TEST))

    # batch index lists are derived from the first dimension of every shape
    self.train_batches = self.get_batch_indices(
        [shape[0] for shape in train_data_shapes])
    self.valid_batches = (
        self.get_batch_indices([shape[0] for shape in valid_data_shapes])
        if valid_data_shapes is not None else None)
    self.test_batches = (
        self.get_batch_indices([shape[0] for shape in test_data_shapes])
        if test_data_shapes is not None else None)

    model_inputs = raise_to_list(self.model.get_inputs())
    model_targets = raise_to_list(self.model.get_targets())

    def build_givens(subset):
        # map the model's symbolic inputs (and targets, when it has any)
        # onto the requested slice of the given dataset subset
        data, labels = self.dataset.getSubset(subset)
        givens = OrderedDict(zip(model_inputs, [data[batch_slice]]))
        if model_targets is not None and len(model_targets) > 0:
            givens.update(OrderedDict(zip(model_targets, [labels[batch_slice]])))
        return givens

    train_givens = build_givens(TRAIN)
    valid_givens = build_givens(VALID)
    test_givens = build_givens(TEST)

    # one compiled training function per training cost
    train_costs = raise_to_list(self.model.get_train_cost())
    self.train_functions = []
    n_costs = len(train_costs)
    for i, train_cost in enumerate(train_costs):
        gradients, _ = self.model.get_gradient(cost=train_cost)

        # the optimizer-specific rule that turns gradients into updates
        gradient_updates = self.get_updates(gradients)

        # the model's own updates (if any) are extended with the gradient
        # updates; otherwise the gradient updates are used on their own
        model_updates = self.model.get_updates()
        if not model_updates:
            train_updates = gradient_updates
        else:
            model_updates.update(gradient_updates)
            train_updates = model_updates

        log.info('Compiling f_learn %d/%d function for model %s...',
                 i + 1, n_costs, str(type(self.model)))
        compile_start = time.time()
        f_learn = function(inputs=[data_idx, data_end_idx],
                           updates=train_updates,
                           outputs=train_cost,
                           givens=train_givens,
                           name='f_learn_%d' % i)
        log.info('f_learn compilation took %s',
                 make_time_units_string(time.time() - compile_start))
        self.train_functions.append(f_learn)

    # monitor functions: same outputs, evaluated on each data subset
    log.debug("Compiling monitor functions...")
    monitor_t = time.time()
    self.monitors = OrderedDict(self.model.get_monitors())
    self.monitor_names = self.monitors.keys()
    if len(self.monitors.keys()) > 0:
        monitor_specs = [("train_monitor_function", train_givens),
                         ("valid_monitor_function", valid_givens),
                         ("test_monitor_function", test_givens)]
        for fn_name, fn_givens in monitor_specs:
            setattr(self, fn_name, function(
                inputs=[data_idx, data_end_idx],
                updates=self.model.get_updates(),
                outputs=self.monitors.values(),
                givens=fn_givens,
                name=fn_name
            ))
    log.debug("Compilation done. Took %s",
              make_time_units_string(time.time() - monitor_t))

    self.noise_switches = raise_to_list(self.model.get_noise_switch())

    ##################
    # start training #
    ##################
    # run every training function compiled above, one after the other
    start_time = time.time()
    n_functions = len(self.train_functions)
    for func_i, train_function in enumerate(self.train_functions):
        log.info("-----------TRAINING %s function %d/%d FOR %d EPOCHS (continue_training=%s)-----------",
                 str(type(self.model)), func_i + 1, n_functions, self.n_epoch, str(continue_training))

        log.debug("Train dataset size is: %s", self.dataset.getDataShape(TRAIN))
        if self.dataset.hasSubset(VALID):
            log.debug("Valid dataset size is: %s", self.dataset.getDataShape(VALID))
        if self.dataset.hasSubset(TEST):
            log.debug("Test dataset size is: %s", self.dataset.getDataShape(TEST))

        self.STOP = False
        self.epoch_counter = 0
        if not continue_training:
            # start the learning rate schedule from scratch
            if hasattr(self, 'learning_rate_decay') and self.learning_rate_decay:
                self.learning_rate_decay.reset()
            # and likewise every decaying parameter owned by the model
            for decay_param in self.model.get_decay_params():
                decay_param.reset()

        self.times = []
        self.best_cost = numpy.inf
        self.best_params = None
        self.patience = 0

        run_start = time.time()

        while not self.STOP:
            try:
                self.STOP = self._perform_one_epoch(train_function)
            except KeyboardInterrupt:
                log.info("STOPPING EARLY FROM KEYBOARDINTERRUPT")
                self.STOP = True

        # put the best parameters back before saving, when some were kept
        if self.best_params is not None:
            log.debug("Restoring best model parameters...")
            set_shared_values(self.params, self.best_params)
        log.debug("Saving model parameters...")
        self.model.save_params('trained_epoch_' + str(self.epoch_counter) + '.pkl')

        log.info("------------TRAIN TIME TOOK %s---------",
                 make_time_units_string(time.time() - run_start))

    log.info("------------TOTAL %s TRAIN TIME TOOK %s---------",
             str(type(self.model)), make_time_units_string(time.time() - start_time))
def __init__(self, objective, params, inputs=None,
             param_constrainers=None, max_iter=-1,
             lr_scalers=None, verbose=0, tol=None,
             init_alpha=None, min_init_alpha=1e-3,
             reset_alpha=True, conjugate=False,
             reset_conjugate=True, gradients=None,
             gradient_updates=None, line_search_mode=None,
             accumulate=False, theano_function_mode=None):
    """
    Build and compile the Theano functions used by batch gradient descent.

    Every argument is stored as an attribute of the same name (through
    ``self.__dict__.update(locals())``).  The constructor compiles
    ``_compute_grad``, ``obj``, ``_cache_values``, ``_goto_alpha`` and
    ``_normalize_grad`` and, when ``conjugate`` is true,
    ``_store_old_grad`` and ``_make_conjugate``.

    objective: a theano expression to be minimized. Should be a function
        of params and, if provided, inputs.
    params: a list of theano shared variables. These are the optimization
        variables.
    inputs: (Optional) a list of theano variables to serve as inputs to
        the graph. An empty list is used for compilation when None.
    param_constrainers: (Optional) a list of callables to be called on all
        updates dictionaries to be applied to params. This is how you
        implement constrained optimization.
    max_iter: only stored on the instance here; not otherwise used by
        this constructor.
    lr_scalers: (Optional) dictionary mapping a parameter to a factor
        that multiplies the step size alpha for that parameter.
    verbose: when true, progress messages are printed while compiling.
    tol: stored as self.tol. When None, 1e-6 is used if objective.dtype
        is "float32" and 3e-7 otherwise.
    init_alpha: (Optional) sequence of step sizes, stored as a tuple of
        floats in self.init_alpha. When None it defaults to
        (.001, .005, .01, .05, .1), or to (.5, 1.) when line_search_mode
        is 'exhaustive'.
    min_init_alpha: only stored on the instance here; not otherwise used
        by this constructor.
    reset_alpha: if True, reverts to using init_alpha after each call.
        If False, the final set of alphas is used at the start of the
        next call to minimize.
    conjugate: if True, tries to pick conjugate gradient directions.
        For the directions to be truly conjugate, you must use
        line_search_mode = 'exhaustive' and the objective function must
        be quadratic. Using line_search_mode = 'exhaustive' on a
        non-quadratic objective function implements nonlinear conjugate
        gradient descent.
    reset_conjugate: has no effect unless conjugate == True.
        If reset_conjugate == True, reverts to direction of steepest
        descent for the first step in each call to minimize. Otherwise,
        tries to make the new search direction conjugate to the last one
        (even though the objective function might be totally different
        on each call to minimize).
    gradients: if None, compute the gradients of obj using T.grad.
        Otherwise, a dictionary mapping from params to expressions for
        their gradients (this allows you to use approximate gradients
        computed with something other than T.grad). Parameters missing
        from the dictionary fall back to grad(objective, param).
    gradient_updates: a dictionary of shared variable updates to run each
        time the gradient is computed.
    line_search_mode: must be None or 'exhaustive' (asserted).
    accumulate: when true, _compute_grad and obj are built with
        Accumulator instead of function.
    theano_function_mode: passed as ``mode`` to every ``function`` call
        made here.

    Calling the ``minimize`` method with values for ``inputs`` will
    update ``params`` to minimize ``objective``.
    """

    # Stores every argument under its own name.  The resulting attributes
    # depend on the local names that exist at this exact point; do not
    # introduce locals above this line.
    self.__dict__.update(locals())
    del self.self

    if line_search_mode is None:
        if init_alpha is None:
            init_alpha = (.001, .005, .01, .05, .1)
    else:
        assert line_search_mode == 'exhaustive'
        if init_alpha is None:
            init_alpha = (.5, 1.)

    self.init_alpha = tuple([float(elem) for elem in init_alpha])

    # Only the local names are rebound below; self.inputs and
    # self.param_constrainers keep the values that were passed in.
    if inputs is None:
        inputs = []

    if param_constrainers is None:
        param_constrainers = []

    obj = objective

    self.verbose = verbose

    param_to_grad_sym = OrderedDict()
    param_to_grad_shared = OrderedDict()
    updates = OrderedDict()
    if self.gradient_updates is not None:
        updates.update(self.gradient_updates)

    self.params = [param for param in params]

    # For each parameter, allocate a shared buffer of the same shape
    # (initialised to zeros) that _compute_grad fills with the gradient.
    for param in params:
        if self.gradients is not None and param in self.gradients:
            g = self.gradients[param]
        else:
            g = grad(objective, param)
        param_to_grad_sym[param] = g
        if param.name is not None:
            param_name = param.name
        else:
            param_name = 'anon_param'
        grad_name = 'BatchGradientDescent.grad_' + param_name
        grad_shared = sharedX(param.get_value() * 0., name=grad_name)
        param_to_grad_shared[param] = grad_shared
        updates[grad_shared] = g

    self.param_to_grad_shared = param_to_grad_shared

    if self.verbose:
        print 'batch gradient class compiling gradient function'
    t1 = time.time()
    if self.accumulate:
        self._compute_grad = Accumulator(inputs, updates=updates)
    else:
        self._compute_grad = function(
            inputs,
            updates=updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._compute_grad')
    if self.verbose:
        t2 = time.time()
        print 'done. Took ', t2 - t1

    if self.verbose:
        print 'batch gradient class compiling objective function'
    if self.accumulate:
        self.obj = Accumulator(inputs, obj)
    else:
        self.obj = function(inputs, obj, mode=self.theano_function_mode,
                            name='BatchGradientDescent.obj')

    if self.verbose:
        print 'done'

    # _cache_values snapshots each parameter; _goto_alpha then sets each
    # parameter to (snapshot - alpha * stored gradient), optionally scaling
    # alpha per parameter with lr_scalers.
    self.param_to_cache = OrderedDict()
    alpha = T.scalar(name='alpha')
    alpha.tag.test_value = np.cast[alpha.dtype](.01)
    cache_updates = OrderedDict()
    goto_updates = OrderedDict()
    for param in params:
        if param.name is None:
            param_name = 'anon_param'
        else:
            param_name = param.name
        cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
        self.param_to_cache[param] = sharedX(param.get_value(borrow=False),
                                             name=cache_name)
        cache_updates[self.param_to_cache[param]] = param
        cached = self.param_to_cache[param]
        g = self.param_to_grad_shared[param]
        if lr_scalers is not None and param in lr_scalers:
            scaled_alpha = alpha * lr_scalers[param]
        else:
            scaled_alpha = alpha
        mul = scaled_alpha * g
        diff = cached - mul
        goto_updates[param] = diff
    self._cache_values = function(
        [],
        updates=cache_updates,
        mode=self.theano_function_mode,
        name='BatchGradientDescent._cache_values')
    assert isinstance(param_constrainers, (list, tuple))
    # Each constrainer is called with the updates dictionary before it is
    # compiled into _goto_alpha.
    for param_constrainer in param_constrainers:
        param_constrainer(goto_updates)
    self._goto_alpha = function([alpha], updates=goto_updates,
                                mode=self.theano_function_mode,
                                name='BatchGradientDescent._goto_alpha')

    # L2 norm taken jointly over all stored gradients; _normalize_grad
    # divides every stored gradient by it and returns the norm.
    norm = T.sqrt(
        sum([
            T.sqr(elem).sum() for elem in self.param_to_grad_shared.values()
        ]))
    norm.name = 'BatchGradientDescent.norm'
    normalize_grad_updates = OrderedDict()
    for grad_shared in self.param_to_grad_shared.values():
        normalize_grad_updates[grad_shared] = grad_shared / norm

    # useful for monitoring
    self.ave_grad_size = sharedX(0.)
    self.new_weight = sharedX(1.)
    normalize_grad_updates[self.ave_grad_size] = self.new_weight * norm + (
        1. - self.new_weight) * self.ave_grad_size

    self._normalize_grad = function(
        [],
        norm,
        updates=normalize_grad_updates,
        mode=self.theano_function_mode,
        name='BatchGradientDescent._normalize_grad')

    if self.conjugate:
        grad_shared = self.param_to_grad_shared.values()

        # One "old gradient" buffer per gradient buffer.
        grad_to_old_grad = OrderedDict()
        for elem in grad_shared:
            grad_to_old_grad[elem] = sharedX(elem.get_value(),
                                             'old_' + elem.name)

        # Takes the norm as input and stores (stored gradient * norm).
        self._store_old_grad = function(
            [norm],
            updates=OrderedDict([(grad_to_old_grad[g], g * norm)
                                 for g in grad_to_old_grad]),
            mode=self.theano_function_mode,
            name='BatchGradientDescent._store_old_grad')

        grad_ordered = list(grad_to_old_grad.keys())
        old_grad_ordered = [grad_to_old_grad[g] for g in grad_ordered]

        def dot_product(x, y):
            # Sum of element-wise products over two aligned lists of tensors.
            return sum([(x_elem * y_elem).sum()
                        for x_elem, y_elem in safe_zip(x, y)])

        # The 1e-7 term keeps the denominator away from zero.
        beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \
                (1e-7+dot_product(old_grad_ordered, old_grad_ordered))
        assert beta_pr.ndim == 0

        beta = T.maximum(beta_pr, 0.)

        """
        beta_pr is the Polak-Ribiere formula for beta.
        According to wikipedia, the beta to use for NCG is "a matter of heuristics or taste"
        but max(0, beta_pr) is "a popular choice... which provides direction reset automatically."
        (ie, it is meant to revert to steepest descent when you have traveled far enough that
        the objective function is behaving non-quadratically enough that the conjugate gradient
        formulas aren't working anymore)

        http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method
        """

        # NOTE(review): ``grad`` here is the name used above as the gradient
        # function, not one of the gradient buffers, so this check looks
        # vacuous -- confirm what was intended.
        assert grad not in grad_to_old_grad

        make_conjugate_updates = [(g, g + beta * grad_to_old_grad[g])
                                  for g in grad_ordered]

        # When the compilation mode exposes a ``record`` attribute, a
        # description of each update is written to it.
        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            for v, u in make_conjugate_updates:
                mode.record.handle_line('BatchGradientDescent._make_conjugate var ' \
                        + var_descriptor(v) + '\n')
                mode.record.handle_line('BatchGradientDescent._make_conjugate update ' \
                        + var_descriptor(u) + '\n')

        self._make_conjugate = function(
            [],
            updates=make_conjugate_updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._make_conjugate')

        if mode is not None and hasattr(mode, 'record'):
            for output in self._make_conjugate.maker.fgraph.outputs:
                mode.record.handle_line('BatchGradientDescent._make_conjugate output ' \
                        + var_descriptor(output) + '\n')

    # Default tolerance depends on the dtype of the objective.
    if tol is None:
        if objective.dtype == "float32":
            self.tol = 1e-6
        else:
            self.tol = 3e-7
    else:
        self.tol = tol

    self.ave_step_size = sharedX(0.)
    self.ave_grad_mult = sharedX(0.)
class Recurrent(Layer):
    """
    A recurrent neural network layer using the hyperbolic tangent
    activation function (by default), passing on all hidden states or a
    selection of them to the next layer.

    The hidden state is initialized to zeros.

    Parameters
    ----------
    dim : int
        The number of elements in the hidden layer
    layer_name : str
        The name of the layer. All layers in an MLP must have a unique name.
    irange : float
        The input-to-hidden weight matrix is initialized with weights in
        the uniform interval (-irange, irange). The hidden-to-hidden
        matrix weights are sampled in the same manner, unless the argument
        svd is set to True (see below).
    indices : list of int, optional
        If specified this layer will return only the given hidden states
        instead of a SequenceSpace. For now only ``[-1]`` (the last hidden
        state) is supported; anything else is rejected in
        `set_input_space`.
    init_bias : float, optional
        Set an initial bias to be added at each time step. Defaults to 0.
    svd : bool, optional
        When True, the hidden-to-hidden matrix is replaced by the
        orthogonal factor ``U`` of the singular value decomposition
        ``U*s*V`` of a square matrix drawn with ``rng.randn``.
        Defaults to True.
    nonlinearity : theano function, optional
        Defaults to tensor.tanh, the non-linearity to be applied to the
        hidden state after each update
    """
    def __init__(self, dim, layer_name, irange, indices=None,
                 init_bias=0., svd=True, nonlinearity=tensor.tanh):
        self.rnn_friendly = True
        self._scan_updates = OrderedDict()
        # Stores every constructor argument as an attribute of the same
        # name; depends on the exact set of locals at this point.
        self.__dict__.update(locals())
        del self.self
        super(Recurrent, self).__init__()

    @wraps(Layer.set_input_space)
    def set_input_space(self, space):
        if (not isinstance(space, SequenceSpace) or
                not isinstance(space.space, VectorSpace)):
            raise ValueError("Recurrent layer needs a SequenceSpace("
                             "VectorSpace) as input but received %s instead"
                             % (space))
        self.input_space = space

        if self.indices is not None:
            # Only the last hidden state can be selected for now.  (The
            # CompositeSpace assignment that used to follow the raise was
            # unreachable and has been removed.)
            if len(self.indices) > 1:
                raise ValueError("Only indices = [-1] is supported right now")
            assert self.indices == [-1], "Only indices = [-1] works now"
            self.output_space = VectorSpace(dim=self.dim)
        else:
            self.output_space = SequenceSpace(VectorSpace(dim=self.dim))

        # Initialize the parameters
        rng = self.mlp.rng
        if self.irange is None:
            raise ValueError("Recurrent layer requires an irange value in "
                             "order to initialize its weight matrices")

        # U is the hidden-to-hidden transition matrix.  The uniform draw is
        # kept even when svd is set so that the random stream (and hence W
        # below) is the same as before.
        U = rng.uniform(-self.irange, self.irange, (self.dim, self.dim))
        if self.svd:
            U = self.mlp.rng.randn(self.dim, self.dim)
            U, s, V = np.linalg.svd(U, full_matrices=True, compute_uv=True)

        # W is the input-to-hidden matrix
        W = rng.uniform(-self.irange, self.irange,
                        (self.input_space.dim, self.dim))

        self._params = [sharedX(W, name=(self.layer_name + '_W')),
                        sharedX(U, name=(self.layer_name + '_U')),
                        sharedX(np.zeros(self.dim) + self.init_bias,
                                name=self.layer_name + '_b')]

    @wraps(Layer.get_layer_monitoring_channels)
    def get_layer_monitoring_channels(self, state_below=None, state=None,
                                      targets=None):
        W, U, b = self._params
        sq_W = tensor.sqr(W)
        sq_U = tensor.sqr(U)
        row_norms = tensor.sqrt(sq_W.sum(axis=1))
        col_norms = tensor.sqrt(sq_W.sum(axis=0))
        u_row_norms = tensor.sqrt(sq_U.sum(axis=1))
        u_col_norms = tensor.sqrt(sq_U.sum(axis=0))

        rval = OrderedDict([('W_row_norms_min',  row_norms.min()),
                            ('W_row_norms_mean', row_norms.mean()),
                            ('W_row_norms_max',  row_norms.max()),
                            ('W_col_norms_min',  col_norms.min()),
                            ('W_col_norms_mean', col_norms.mean()),
                            ('W_col_norms_max',  col_norms.max()),
                            ('U_row_norms_min', u_row_norms.min()),
                            ('U_row_norms_mean', u_row_norms.mean()),
                            ('U_row_norms_max', u_row_norms.max()),
                            ('U_col_norms_min', u_col_norms.min()),
                            ('U_col_norms_mean', u_col_norms.mean()),
                            ('U_col_norms_max', u_col_norms.max())])

        if (state is not None) or (state_below is not None):
            if state is None:
                state = self.fprop(state_below)
            # fprop returns a (states, mask) pair only when no indices are
            # selected; with indices it returns the hidden state tensor
            # itself, which must not be unpacked.  state_below is not
            # needed for the statistics below, so it is left untouched
            # (it may legitimately be None when `state` was supplied).
            if self.indices is None:
                state, _ = state

            mx = state.max(axis=0)
            mean = state.mean(axis=0)
            mn = state.min(axis=0)
            rg = mx - mn

            rval['range_x_max_u'] = rg.max()
            rval['range_x_mean_u'] = rg.mean()
            rval['range_x_min_u'] = rg.min()

            rval['max_x_max_u'] = mx.max()
            rval['max_x_mean_u'] = mx.mean()
            rval['max_x_min_u'] = mx.min()

            rval['mean_x_max_u'] = mean.max()
            rval['mean_x_mean_u'] = mean.mean()
            rval['mean_x_min_u'] = mean.min()

            rval['min_x_max_u'] = mn.max()
            rval['min_x_mean_u'] = mn.mean()
            rval['min_x_min_u'] = mn.min()

        return rval

    @wraps(Layer._modify_updates)
    def _modify_updates(self, updates):
        # When random variables are used in the scan function the updates
        # dictionary returned by scan might not be empty, and needs to be
        # added to the updates dictionary before compiling the training
        # function
        if any(key in updates for key in self._scan_updates):
            # Don't think this is possible, but let's check anyway
            raise ValueError("A single shared variable is being updated by "
                             "multiple scan functions")
        updates.update(self._scan_updates)

    @wraps(Layer.fprop)
    def fprop(self, state_below):
        # The input is a (data, mask) pair.
        state_below, mask = state_below

        # z0 is the initial hidden state which is (batch size, output dim)
        z0 = tensor.alloc(np.cast[config.floatX](0), state_below.shape[1],
                          self.dim)
        if self.dim == 1:
            # This should fix the bug described in Theano issue #1772
            z0 = tensor.unbroadcast(z0, 1)

        # Later we will add a noise function
        W, U, b = self._params

        # It is faster to do the input-to-hidden matrix multiplications
        # outside of scan
        state_below = tensor.dot(state_below, W) + b

        def fprop_step(state_below, mask, state_before, U):
            z = self.nonlinearity(state_below +
                                  tensor.dot(state_before, U))

            # Only update the state for non-masked data, otherwise
            # just carry on the previous state until the end
            z = mask[:, None] * z + (1 - mask[:, None]) * state_before
            return z

        z, updates = scan(fn=fprop_step, sequences=[state_below, mask],
                          outputs_info=[z0], non_sequences=[U])
        # Remember scan's updates so _modify_updates can hand them on.
        self._scan_updates.update(updates)

        if self.indices is not None:
            if len(self.indices) > 1:
                return [z[i] for i in self.indices]
            else:
                return z[self.indices[0]]
        else:
            return (z, mask)
class SGD_Optimiser:
    """
    Stochastic gradient descent (optionally with momentum) for a Theano
    graph, with early stopping on a validation set and a simple learning
    rate schedule.
    """

    def __init__(self, params, inputs, costs, updates_old=None, consider_constant=[],
                 momentum=False, patience=20, custom_grads=False, custom_grad_dict=None):
        """
        params: list containing the parameters of the model
        inputs: list of symbolic inputs to the graph
        costs: list of costs to be evaluated. The first element MUST be the objective.
        updates_old: OrderedDict from previous graphs that need to be accounted for by SGD, typically when
        scan is used. It is copied, never modified in place.
        consider_constant: list of theano variables that are passed on to the grad method. Typically RBM.
        momentum: when True, one zero-initialised momentum buffer is kept per parameter.
        patience: number of consecutive validation rounds without improvement after which training stops.
        custom_grads: when True, gradients are looked up in custom_grad_dict (keyed by parameter name)
        instead of being derived with T.grad.
        custom_grad_dict: dictionary of gradient expressions, used only when custom_grads is True.
        """
        # Validate before anything touches `costs` as a list.
        assert (isinstance(costs, list)), "The costs given to the SGD class must be a list, even for one element."
        self.inputs = inputs
        self.params = params
        self.momentum = momentum
        self.max_patience = patience
        self.patience = 0
        if self.momentum:
            self.params_mom = []
            for param in self.params:
                param_init = theano.shared(
                    value=numpy.zeros(param.get_value().shape, dtype=theano.config.floatX),)
                self.params_mom.append(param_init)
        self.costs = costs
        self.custom_grads = custom_grads
        self.custom_grad_dict = custom_grad_dict
        self.num_costs = len(costs)
        self.updates_old = updates_old
        # Copy so the shared default list in the signature is never aliased.
        self.consider_constant = list(consider_constant)
        self.build_train_fn()

    def build_train_fn(self,):
        """
        Compile `calc_cost` (costs only, no update) and `f` (costs plus the
        SGD updates, taking the learning rate and, with momentum, the
        momentum rate as extra trailing inputs).
        """
        self.lr_theano = T.scalar('lr')
        self.grad_inputs = self.inputs + [self.lr_theano]
        if self.momentum:
            self.mom_theano = T.scalar('mom')
            self.grad_inputs = self.grad_inputs + [self.mom_theano]

        if self.custom_grads:
            self.gparams = [self.custom_grad_dict[param.name] for param in self.params]
        else:
            self.gparams = T.grad(self.costs[0], self.params,
                                  consider_constant=self.consider_constant)

        if not self.momentum:
            print('Building SGD optimization graph without momentum')
            updates = OrderedDict((i, i - self.lr_theano * j)
                                  for i, j in zip(self.params, self.gparams))
        else:
            print('Building SGD optimization graph with momentum')
            updates = OrderedDict()
            for param, param_mom, gparam in zip(self.params, self.params_mom, self.gparams):
                param_inc = self.mom_theano * param_mom - self.lr_theano * gparam
                updates[param_mom] = param_inc
                updates[param] = param + param_inc

        self.calc_cost = theano.function(self.inputs, self.costs)

        # Merge into a fresh dictionary so that a dictionary owned by the
        # caller (e.g. the model) is not modified.  The previous code read
        # an unassigned local here and raised UnboundLocalError whenever
        # updates_old was supplied.
        merged_updates = OrderedDict()
        if self.updates_old:
            merged_updates.update(self.updates_old)
        merged_updates.update(updates)
        self.updates_old = merged_updates

        self.f = theano.function(self.grad_inputs, self.costs, updates=self.updates_old)

    def _plot_train_costs(self):
        """Plot the recorded per-epoch training costs against the epoch index."""
        epochs = list(range(len(self.train_costs)))
        costs = numpy.array(self.train_costs).reshape(-1)
        plot(epochs, costs)
        xlabel('epoch')
        ylabel('negative log-likelihood')
        title('Training on red wine dataset')

    def train(self, train_set, valid_set=None, learning_rate=0.1, num_epochs=500, save=False,
              output_folder=None, lr_update=True, mom_rate=0.9, update_type='linear',
              begin_anneal=50, start=2):
        """
        Run up to `num_epochs` epochs of SGD over `train_set`.

        train_set / valid_set: objects whose `iterate(True)` yields, per batch, a list of input
        values matching `inputs`.
        learning_rate: initial learning rate.
        save: when True, parameters are pickled whenever the tracked cost improves.
        output_folder: folder for the pickled parameters (current directory when None).
        lr_update: when True, `update_lr` is called after every epoch.
        mom_rate: momentum rate fed to the training function when momentum is enabled.
        update_type, begin_anneal: forwarded to `update_lr`.
        start: 1-based epoch at which the learning rate schedule starts.

        Training stops early on a NaN cost, when validation patience runs out, or on
        KeyboardInterrupt.
        """
        print('Initializing training.')
        self.best_cost = numpy.inf
        self.init_lr = learning_rate
        self.lr = numpy.array(learning_rate)
        self.mom_rate = mom_rate
        self.output_folder = output_folder
        self.train_set = train_set
        self.valid_set = valid_set
        self.save = save
        self.lr_update = lr_update
        self.stop_train = False
        self.train_costs = []
        self.valid_costs = []
        self.num_epochs = num_epochs
        self.start = start - 1  # subtracting one for zero index.
        try:
            for u in range(num_epochs):
                cost = []
                nan_batch = False
                for batch in self.train_set.iterate(True):
                    inputs = batch + [self.lr]
                    if self.momentum:
                        inputs = inputs + [self.mom_rate]
                    # Evaluate the costs without updating first, so that a
                    # NaN batch never reaches the parameters.
                    cost_no_update = self.calc_cost(*batch)
                    # calc_cost returns one value per cost; reduce with any()
                    # so the test is well defined for several costs.
                    if numpy.any(numpy.isnan(cost_no_update)):
                        print('Cost was NaN for a particular batch!')
                        nan_batch = True
                        break
                    cost.append(self.f(*inputs))
                if nan_batch:
                    self._plot_train_costs()
                    if self.custom_grads:
                        savefig('cost_custom.png')
                    else:
                        savefig('cost_theano.png')
                    break
                mean_costs = numpy.mean(cost, axis=0)
                if numpy.isnan(mean_costs[0]):
                    print('Training cost is NaN.')
                    print('Breaking from training early, the last saved set of parameters is still usable!')
                    break
                print(' Epoch %i ' % (u + 1))
                print('***Train Results***')
                for i in range(self.num_costs):
                    print("Cost %i: %f" % (i, mean_costs[i]))
                self.train_costs.append(mean_costs)

                if not valid_set:
                    # Track the objective (first cost), as perform_validation does.
                    this_cost = numpy.absolute(mean_costs)[0]
                    if this_cost < self.best_cost:
                        self.best_cost = this_cost
                        print('Best Params!')
                        if save:
                            self.save_model()
                    sys.stdout.flush()
                else:
                    self.perform_validation()
                    if self.stop_train:
                        print('Stopping training early.')
                        break

                if lr_update:
                    # Forward the schedule requested by the caller (it used
                    # to be hard-coded to 'linear').
                    self.update_lr(u + 1, update_type=update_type, begin_anneal=begin_anneal,
                                   start=self.start, num_iterations=self.num_epochs)

            print('Training completed!')
            self._plot_train_costs()

        except KeyboardInterrupt:
            print('Training interrupted.')

    def perform_validation(self,):
        """
        Evaluate the costs on the validation set, record them, and update
        the best cost / patience counter.  Sets `stop_train` once patience
        reaches `max_patience`.
        """
        cost = []
        for batch in self.valid_set.iterate(True):
            cost.append(self.calc_cost(*batch))
        mean_costs = numpy.mean(cost, axis=0)
        self.valid_costs.append(mean_costs)
        print('***Validation Results***')
        for i in range(self.num_costs):
            print("Cost %i: %f" % (i, mean_costs[i]))

        # The objective (first cost) is the early-stopping metric.
        this_cost = numpy.absolute(mean_costs)[0]
        if this_cost < self.best_cost:
            self.patience = 0
            self.best_cost = this_cost
            print('Best Params!')
            if self.save:
                self.save_model()
        else:
            self.patience += 1
            print('Patience: %d/%d' % (self.patience, self.max_patience))
        if self.patience >= self.max_patience:
            self.stop_train = True

    def save_model(self,):
        """Pickle copies of the current parameter values to `best_params.pickle`."""
        best_params = [param.get_value().copy() for param in self.params]
        if not self.output_folder:
            save_path = 'best_params.pickle'
        else:
            if not os.path.exists(self.output_folder):
                os.makedirs(self.output_folder)
            save_path = os.path.join(self.output_folder, 'best_params.pickle')
        # Context manager so the file handle is closed (and flushed) promptly.
        with open(save_path, 'w') as handle:
            cPickle.dump(best_params, handle)

    def update_lr(self, count, update_type='annealed', begin_anneal=500., min_lr=0.01,
                  decay_factor=1.2, start=2, num_iterations=1000):
        """
        Recompute `self.lr` for iteration `count`.

        'annealed':    init_lr * min(1, begin_anneal / count)
        'exponential': init_lr / decay_factor ** count, floored at min_lr
        'linear':      unchanged before `start`, then decays linearly from
                       init_lr at `start` to 0 at `num_iterations` (never
                       negative).  Left unchanged when
                       num_iterations <= start.
        """
        if update_type == 'annealed':
            scale_factor = float(begin_anneal) / count
            self.lr = self.init_lr * min(1., scale_factor)
        elif update_type == 'exponential':
            new_lr = float(self.init_lr) / (decay_factor ** count)
            self.lr = max(new_lr, min_lr)
        elif update_type == 'linear':
            span = num_iterations - start
            # span <= 0 would divide by zero (e.g. a single-epoch run).
            if span > 0 and count >= start:
                slope = float(self.init_lr) / span
                # Measure the decay from `start`; subtracting count * slope
                # drove the rate below zero before the last iteration.
                self.lr = max(self.init_lr - (count - start) * slope, 0.)
        print('Updated lr:  ' + str(self.lr))
class SGD_Optimiser:
    """
    Mini-batch SGD trainer for a Theano graph.

    Supports optional momentum, optional per-parameter gradient-norm
    clipping, user supplied gradients, a learning-rate schedule and
    patience-based early stopping on a validation set.
    """

    def __init__(self, params, inputs, costs, updates_old=None,
                 consider_constant=None, momentum=False, patience=20,
                 custom_grads=False, custom_grad_dict=None, state=None,
                 clip_gradients=False, grad_threshold=50.):
        """
        params: list containing the parameters of the model
        inputs: list of symbolic inputs to the graph
        costs: list of costs to be evaluated. The first element MUST be the
               objective.
        updates_old: OrderedDict from previous graphs that need to be
               accounted for by SGD, typically when scan is used.
        consider_constant: list of theano variables that are passed on to
               the grad method. Typically RBM. None means an empty list.
        momentum: if True, a momentum buffer is kept for every parameter.
        patience: number of consecutive non-improving validations tolerated
               before training is stopped.
        custom_grads: if True, gradients are read from custom_grad_dict
               (keyed by parameter name) instead of calling T.grad.
        clip_gradients: if True, each gradient is rescaled so that its L2
               norm does not exceed grad_threshold.
        state: stored on the instance; not used by this class itself.
        """
        assert (
            isinstance(costs, list)
        ), "The costs given to the SGD class must be a list, even for one element."
        self.inputs = inputs
        self.params = params
        self.momentum = momentum
        self.max_patience = patience
        self.patience = 0
        if self.momentum:
            # One zero-initialised velocity buffer per parameter.
            self.params_mom = []
            for param in self.params:
                param_init = theano.shared(value=numpy.zeros(
                    param.get_value().shape,
                    dtype=theano.config.floatX), )
                self.params_mom.append(param_init)
        self.costs = costs
        self.custom_grads = custom_grads
        self.custom_grad_dict = custom_grad_dict
        self.num_costs = len(costs)
        self.updates_old = updates_old
        # A fresh list per instance; avoids the shared mutable default.
        if consider_constant is None:
            consider_constant = []
        self.consider_constant = consider_constant
        self.clip_gradients = clip_gradients
        self.grad_threshold = grad_threshold
        self.build_train_fn()
        self.state = state

    def build_train_fn(self, ):
        """
        Build the symbolic updates and compile two functions:
        self.calc_cost (costs only, no update) and self.f (costs + update).
        """
        self.lr_theano = T.scalar('lr')
        self.grad_inputs = self.inputs + [self.lr_theano]
        if self.momentum:
            self.mom_theano = T.scalar('mom')
            self.grad_inputs = self.grad_inputs + [self.mom_theano]

        print('Calculating gradients. This might take a while depending on the model...')
        if self.custom_grads:
            self.gparams = [self.custom_grad_dict[param.name]
                            for param in self.params]
        elif self.clip_gradients:
            self.gradient_clipping()
        else:
            self.gparams = T.grad(self.costs[0],
                                  self.params,
                                  consider_constant=self.consider_constant)
        print('Done calculating gradients.')

        if not self.momentum:
            print('Building SGD optimization graph without momentum')
            updates = OrderedDict((i, i - self.lr_theano * j)
                                  for i, j in zip(self.params, self.gparams))
        else:
            print('Building SGD optimization graph with momentum')
            updates = OrderedDict()
            for param, param_mom, gparam in zip(self.params, self.params_mom,
                                                self.gparams):
                param_inc = self.mom_theano * param_mom - self.lr_theano * gparam
                updates[param_mom] = param_inc
                updates[param] = param + param_inc

        self.calc_cost = theano.function(self.inputs, self.costs)

        # Merge the SGD updates into a *copy* of the caller supplied updates
        # so that a dict owned by the model is never mutated.  (The previous
        # code copied an unbound local name and crashed whenever updates_old
        # was given.)
        if self.updates_old:
            merged_updates = copy.copy(self.updates_old)
        else:
            merged_updates = OrderedDict()
        merged_updates.update(updates)
        self.updates_old = merged_updates

        self.f = theano.function(self.grad_inputs,
                                 self.costs,
                                 updates=self.updates_old)

    def gradient_clipping(self, threshold=1.):
        """
        Compute gradients and rescale each one whose L2 norm exceeds
        self.grad_threshold so that its norm equals that threshold.

        Note: the `threshold` argument is kept for interface compatibility
        but is not used; the limit always comes from self.grad_threshold.
        """
        print('Including Gradient clipping')
        gparams = T.grad(self.costs[0],
                         self.params,
                         consider_constant=self.consider_constant)
        self.gparams = []
        for gparam in gparams:
            norm_gparam = T.sqrt((gparam**2).sum())
            clipped_gparam = T.switch(
                norm_gparam > self.grad_threshold,
                (self.grad_threshold / norm_gparam) * gparam, gparam)
            self.gparams.append(clipped_gparam)

    def train(self, train_set, valid_set=None, learning_rate=0.1,
              num_epochs=500, save=False, output_folder=None,
              lr_update=True, mom_rate=0.9, update_type='linear',
              begin_anneal=50, start=2, filename=None):
        """
        Run up to num_epochs epochs of SGD over train_set.

        train_set / valid_set: objects exposing iterate(True), which yields
            lists of input values matching self.inputs.
        learning_rate: initial learning rate.
        save: if True, parameters are pickled whenever the objective improves.
        output_folder: directory used by save_model (created on demand).
        lr_update: if True, the learning rate is rescheduled after each epoch.
        update_type, begin_anneal, start: forwarded to update_lr
            (start is given 1-indexed).
        filename: stored on self.filename; not otherwise used here.

        Training stops early on a NaN objective, on exhausted patience, or
        on KeyboardInterrupt. Raises ValueError if train_set yields no batch.
        """
        print('Initializing training.')
        self.best_cost = numpy.inf
        self.init_lr = learning_rate
        self.lr = numpy.array(learning_rate)
        self.mom_rate = mom_rate
        self.output_folder = output_folder
        self.train_set = train_set
        self.valid_set = valid_set
        self.save = save
        self.lr_update = lr_update
        self.stop_train = False
        self.train_costs = []
        self.valid_costs = []
        self.num_epochs = num_epochs
        self.start = start - 1  # subtracting one for zero index.
        self.filename = filename
        try:
            for u in xrange(num_epochs):
                cost = []
                nan_batch = False
                for batch in self.train_set.iterate(True):
                    inputs = batch + [self.lr]
                    if self.momentum:
                        inputs = inputs + [self.mom_rate]
                    # Evaluate the objective (first cost) before updating so
                    # a NaN batch never touches the parameters.
                    cost_no_update = self.calc_cost(*batch)[0]
                    if numpy.isnan(cost_no_update):
                        print('Cost was NaN for a particular batch!')
                        nan_batch = True
                        break
                    cost.append(self.f(*inputs))
                if nan_batch:
                    break
                if not cost:
                    raise ValueError('train_set produced no batches.')

                mean_costs = numpy.mean(cost, axis=0)
                if numpy.isnan(mean_costs[0]):
                    print('Training cost is NaN.')
                    print('Breaking from training early, the last saved set of parameters is still usable!')
                    break
                print(' Epoch %i ' % (u + 1))
                print('***Train Results***')
                for i in xrange(self.num_costs):
                    print("Cost %i: %f" % (i, mean_costs[i]))
                self.train_costs.append(mean_costs)

                if not valid_set:
                    # Model selection on the training objective (first cost),
                    # mirroring what perform_validation does.
                    this_cost = numpy.absolute(mean_costs[0])
                    if this_cost < self.best_cost:
                        self.best_cost = this_cost
                        print('Best Params!')
                        if save:
                            self.save_model()
                    sys.stdout.flush()
                else:
                    self.perform_validation()
                    if self.stop_train:
                        print('Stopping training early.')
                        break

                if lr_update:
                    # Honour the schedule requested by the caller instead of
                    # always falling back to the linear one.
                    self.update_lr(u + 1,
                                   update_type=update_type,
                                   begin_anneal=begin_anneal,
                                   start=self.start,
                                   num_iterations=self.num_epochs)

            print('Training completed!')

        except KeyboardInterrupt:
            print('Training interrupted.')

    def perform_validation(self, ):
        """
        Evaluate all costs on the validation set, record their means, and
        update best_cost / patience based on the first cost (the objective).
        Sets self.stop_train once patience reaches max_patience.
        """
        cost = [self.calc_cost(*batch)
                for batch in self.valid_set.iterate(True)]
        mean_costs = numpy.mean(cost, axis=0)
        self.valid_costs.append(mean_costs)
        print('***Validation Results***')
        for i in xrange(self.num_costs):
            print("Cost %i: %f" % (i, mean_costs[i]))

        # The first cost is the objective function
        this_cost = numpy.absolute(mean_costs)[0]
        if this_cost < self.best_cost:
            self.patience = 0
            self.best_cost = this_cost
            print('Best Params!')
            if self.save:
                self.save_model()
        else:
            self.patience += 1
            print('Patience: %d/%d' % (self.patience, self.max_patience))

        if self.patience >= self.max_patience:
            self.stop_train = True

    def save_model(self, filename=None):
        """
        Pickle a copy of the current parameter values.

        The file is named `filename` (default 'best_params.pickle') and is
        placed in self.output_folder when one is set (created if missing),
        otherwise in the current working directory.
        """
        print('Saving model parameters.')
        best_params = [param.get_value().copy() for param in self.params]
        save_path = filename if filename else 'best_params.pickle'
        # output_folder is only assigned by train(); tolerate earlier calls.
        output_folder = getattr(self, 'output_folder', None)
        if output_folder:
            if not os.path.exists(output_folder):
                os.makedirs(output_folder)
            save_path = os.path.join(output_folder, save_path)
        # Binary mode + context manager: pickles are byte streams and the
        # handle must be closed deterministically.
        with open(save_path, 'wb') as handle:
            cPickle.dump(best_params, handle)

    def update_lr(self, count, update_type='annealed', begin_anneal=500.,
                  min_lr=0.01, decay_factor=1.2, start=2,
                  num_iterations=1000):
        """
        Recompute self.lr from self.init_lr for iteration `count`.

        'annealed':    init_lr * min(1, begin_anneal / count)
        'exponential': init_lr / decay_factor**count, floored at min_lr
        'linear':      init_lr - count * init_lr / (num_iterations - start),
                       applied once count >= start and never below zero.
                       Left unchanged when num_iterations <= start (the
                       slope would be undefined).
        Any other update_type leaves self.lr untouched.
        """
        if update_type == 'annealed':
            scale_factor = float(begin_anneal) / count
            self.lr = self.init_lr * min(1., scale_factor)
        elif update_type == 'exponential':
            new_lr = float(self.init_lr) / (decay_factor**count)
            self.lr = min_lr if new_lr < min_lr else new_lr
        elif update_type == 'linear':
            span = num_iterations - start
            if span > 0 and count >= start:
                slope = self.init_lr / float(span)
                # Clamp: a negative learning rate would ascend the objective.
                self.lr = max(0., self.init_lr - count * slope)
        print('Updated lr:  %s' % (self.lr,))
def compile(self):
    """
    Build the parameter updates for the configured optimisation method and
    compile two Theano functions:

    self.f -- returns the cost and applies the updates (training step)
    self.g -- returns the cost without updating anything (evaluation)

    Returns self so that calls can be chained.
    Raises NotImplementedError when self.method is not one of
    'sgd', 'rmsprop', 'adam' or 'adagrad' (case-insensitive).
    """
    print("$> Compiling optimizer.")
    #TODO: automate "install" of new methods.
    # instead of this switch, should check if the input string is
    # the name of a valid method
    method = self.method.lower()
    if method == 'sgd':
        # Keyword was previously misspelled `consider_cosntant`; the rmsprop
        # call below uses the correctly spelled name.
        updates = top.up.sgd(self.p, cost=self.cost, lr=self.lr,
                             momentum=self.m, lr_rate=self.lr_rate,
                             m_rate=self.m_rate,
                             consider_constant=self.cc)
    elif method == 'rmsprop':
        updates = top.up.rmsprop(self.p, self.cost, lr=self.lr,
                                 momentum=self.m, lr_rate=self.lr_rate,
                                 m_rate=self.m_rate,
                                 consider_constant=self.cc,
                                 grad_clip=self.grad_clip)
    elif method == 'adam':
        updates = top.up.adam(self.p, self.cost, lr=self.lr,
                              grad_clip=self.grad_clip)
    elif method == 'adagrad':
        updates = top.up.adagrad(self.p, self.cost, lr=self.lr,
                                 lr_rate=self.lr_rate)
    else:
        raise NotImplementedError("Optimization method not implemented!")

    updates = OrderedDict(updates)
    if self.extra_updates is not None:
        updates.update(self.extra_updates)

    # This may seem weird, but I was getting bugs without this if-else:
    # an empty input is passed as a fresh literal list, anything else is
    # normalised to a list and stored back on self.input.
    if self.input == []:
        inputs = []
    else:
        if not isinstance(self.input, list):
            self.input = [self.input]
        inputs = self.input

    # Return cost and update params
    self.f = theano.function(inputs, self.cost, updates=updates,
                             givens=self.givens,
                             allow_input_downcast=True)
    # Return cost without updating params, use this for testing
    self.g = theano.function(inputs, self.cost,
                             givens=self.givens,
                             allow_input_downcast=True)
    return self
def setup_training(self):
    """
    Sets up training function.

    Compiles, in this order: the forward pass (self.fprop_func), the
    parameter step (self.training), the running squared-gradient update
    (self.rmsprop_update), a dimshuffle helper (self.dimshuf_func) and the
    cost evaluation (self.cost_function).
    """
    batch_size = self.mini_batch_size
    default_cost = self.cnn.get_default_cost()
    data_specs = default_cost.get_data_specs(self.cnn)
    mapping = DataSpecsMapping(data_specs)
    spaces = mapping.flatten(data_specs[0], return_tuple=True)
    sources = mapping.flatten(data_specs[1], return_tuple=True)

    # One float32 symbolic batch per (space, source) pair.
    theano_args = tuple(
        space.make_theano_batch(
            name='%s[%s]' % (self.__class__.__name__, source),
            batch_size=batch_size).astype("float32")
        for space, source in safe_zip(spaces, sources))

    y_hat = self.cnn.fprop(theano_args[0])
    self.fprop_func = theano.function([theano_args[0]], y_hat)
    cost = self.cnn.cost(theano_args[1], y_hat)

    # Result is unused; the call is kept as in the original.
    self.cnn.get_lr_scalers()

    params = list(self.cnn.get_params())
    gradients = OrderedDict(
        izip(params, T.grad(cost, params, disconnected_inputs='ignore')))
    rms_vals_dict = OrderedDict(izip(params, self.rms_vals))

    # Parameter step: gradient divided by the root of the running average.
    stepped_params = []
    for param in params:
        scaled_grad = gradients[param] / T.sqrt(rms_vals_dict[param] + 1e-8)
        stepped_params.append(param - self.learning_rate * scaled_grad)
    updates = OrderedDict()
    updates.update(dict(safe_zip(params, stepped_params)))

    # Running average of squared gradients (decay .9).
    new_rms_vals = []
    for param in params:
        new_rms_vals.append((rms_vals_dict[param] * .9) +
                            (T.sqr(gradients[param]) * .1))
    rmsprop_updates = OrderedDict()
    rmsprop_updates.update(dict(safe_zip(self.rms_vals, new_rms_vals)))

    self.training = theano.function(theano_args, updates=updates,
                                    on_unused_input='ignore')
    self.rmsprop_update = theano.function(theano_args,
                                          updates=rmsprop_updates,
                                          on_unused_input='ignore')

    temp = T.tensor4()
    self.dimshuf_func = theano.function([temp],
                                        temp.dimshuffle(1, 2, 3, 0))

    self.cost_function = theano.function(theano_args, cost)
n_nit_sot += 1 # Step 5.5 all other arguments including extra inputs other_scan_args = [] other_inner_args = [] other_scan_args += [arg for arg in non_seqs if (not isinstance(arg, SharedVariable) and not isinstance(arg, tensor.Constant))] # Step 5.6 all shared variables with no update rules other_inner_args += [safe_new(arg, '_copy') for arg in non_seqs if (not isinstance(arg, SharedVariable) and not isinstance(arg, tensor.Constant))] givens.update(OrderedDict(zip(other_scan_args, other_inner_args))) if strict: non_seqs_set = set(non_sequences if non_sequences != None else []) other_shared_scan_args = [arg.variable for arg in dummy_f.maker.expanded_inputs if (isinstance(arg.variable, SharedVariable) and not arg.update and arg.variable in non_seqs_set)] other_shared_inner_args = [safe_new(arg.variable, '_copy') for arg in dummy_f.maker.expanded_inputs if (isinstance(arg.variable, SharedVariable) and not arg.update and arg.variable in non_seqs_set)] else:
class ConvSparseCoding(ConvElemwise):
    '''
    Convolutional sparse-coding layer: the hidden code X is optimised with a
    few RMSprop steps (run inside theano.scan) so that the transformer
    applied to X reconstructs the layer input under an L1-like penalty.

    Parameters for the optimization/feedforward operation:
    lr : learning rate
    n_steps : number of steps or updates of the hidden code
    truncate: truncate the gradient after this number (default -1 which
              means do not truncate)
    batch_size: used to size the code when the parent MLP has no batch size
    fprop_code: if True the layer outputs the (pooled) code, otherwise the
              code pushed through the transformer
    x_axes: accepted for compatibility; not used by this class
    '''

    def __init__(self, batch_size, x_axes=('b', 'c', 0, 1), fprop_code=True,
                 lr=.01, n_steps=10, truncate=-1, *args, **kwargs):
        super(ConvSparseCoding, self).__init__(*args, **kwargs)
        self.batch_size = batch_size
        self.fprop_code = fprop_code
        self.n_steps = n_steps
        self.truncate = truncate
        self.lr = lr
        # Updates returned by theano.scan; merged in _modify_updates.
        self._scan_updates = OrderedDict()

    def initialize_x_space(self, rng):
        """
        This function initializes the coding space and dimensions.
        X is how I generally call the sparse code variables. Thus, X_space
        has its dimensions.

        Sets self.x_space (shape obtained by evaluating the pooling op on a
        dummy detector batch) and self.X (shared code variable, drawn from
        rng.normal(0, .001) with the detector-space spatial shape).
        """
        dummy_batch_size = self.mlp.batch_size
        if dummy_batch_size is None:
            dummy_batch_size = self.batch_size
        dummy_detector = \
            sharedX(self.detector_space.get_origin_batch(dummy_batch_size))

        if self.pool_type is not None:
            assert self.pool_type in ['max', 'mean']
            # pool_stride / image_shape are deliberately not passed here.
            if self.pool_type == 'max':
                dummy_p = max_pool(dummy_detector, self.pool_shape)
            elif self.pool_type == 'mean':
                dummy_p = mean_pool(dummy_detector, self.pool_shape)
            dummy_p = dummy_p.eval()
            self.x_space = Conv2DSpace(shape=[dummy_p.shape[2],
                                              dummy_p.shape[3]],
                                       num_channels=self.output_channels,
                                       axes=('b', 'c', 0, 1))
        else:
            dummy_detector = dummy_detector.eval()
            self.x_space = Conv2DSpace(shape=[dummy_detector.shape[2],
                                              dummy_detector.shape[3]],
                                       num_channels=self.output_channels,
                                       axes=('b', 'c', 0, 1))

        X = rng.normal(0, .001, size=(dummy_batch_size,
                                      self.output_channels,
                                      self.detector_space.shape[0],
                                      self.detector_space.shape[1]))
        self.X = sharedX(X, self.layer_name + '_X')
        logger.info('Code space: {0}'.format(self.x_space.shape))

    @wraps(ConvElemwise.initialize_transformer)
    def initialize_transformer(self, rng):
        """
        This function initializes the transformer of the class. Re-running
        this function will reset the transformer.

        X is how I generally call the sparse code variables. Thus, X_space
        has its dimensions.

        Parameters
        ----------
        rng : object
            random number generator object.
        """
        if self.irange is not None:
            assert self.sparse_init is None
            self.transformer = conv2d.make_random_conv2D(
                irange=self.irange,
                input_space=self.x_space,
                output_space=self.input_space,
                kernel_shape=self.kernel_shape,
                subsample=self.kernel_stride,
                border_mode=self.border_mode,
                rng=rng)
        elif self.sparse_init is not None:
            # Previously referenced the nonexistent attribute `X_space`
            # (AttributeError).  Spaces now match the irange branch: the
            # transformer maps the code space back to the input space, which
            # is how get_local_cost / get_sparse_code use it.
            self.transformer = conv2d.make_sparse_random_conv2D(
                num_nonzero=self.sparse_init,
                input_space=self.x_space,
                output_space=self.input_space,
                kernel_shape=self.kernel_shape,
                subsample=self.kernel_stride,
                border_mode=self.border_mode,
                rng=rng)

    def get_local_cost(self, state_below):
        """
        Squared reconstruction error of state_below from the shared code
        self.X plus .1 times a smoothed L1 penalty on self.X.
        """
        er = T.sqr(state_below - self.transformer.lmul(self.X)).sum()
        l1 = T.sqrt(T.sqr(self.X) + 1e-6).sum()
        return er + .1 * l1

    @wraps(ConvElemwise.initialize_output_space)
    def initialize_output_space(self):
        if self.fprop_code is True:
            self.output_space = self.x_space
        else:
            self.output_space = self.input_space
        logger.info('Output space: {0}'.format(self.output_space.shape))

    @wraps(Layer.set_input_space)
    def set_input_space(self, space):
        """
        Note: this function will reset the parameters!
        """
        self.input_space = space

        if not isinstance(space, Conv2DSpace):
            raise BadInputSpaceError(self.__class__.__name__ +
                                     ".set_input_space "
                                     "expected a Conv2DSpace, got " +
                                     str(space) + " of type " +
                                     str(type(space)))

        rng = self.mlp.rng

        output_shape = [(self.input_space.shape[0] + self.kernel_shape[0])
                        / self.kernel_stride[0] - 1,
                        (self.input_space.shape[1] + self.kernel_shape[1])
                        / self.kernel_stride[1] - 1]

        self.detector_space = Conv2DSpace(shape=output_shape,
                                          num_channels=self.output_channels,
                                          axes=('b', 'c', 0, 1))

        self.initialize_x_space(rng)
        self.initialize_transformer(rng)

        W, = self.transformer.get_params()
        W.name = self.layer_name + '_W'

        if self.tied_b:
            self.b = sharedX(np.zeros((self.detector_space.num_channels)) +
                             self.init_bias)
        else:
            self.b = sharedX(self.detector_space.get_origin() + self.init_bias)
        self.b.name = self.layer_name + '_b'

        logger.info('Input shape: {0}'.format(self.input_space.shape))
        logger.info('Detector space: {0}'.format(self.detector_space.shape))

        self.initialize_output_space()

    def _renormW(self):
        """
        Rescale the filters in place so that every row of the weight tensor,
        flattened to (dim0*dim1, dim2*dim3), has unit L2 norm.
        """
        W = self.transformer.get_params()[0]
        A = W.get_value(borrow=True)
        Ashape = A.shape
        A = A.reshape((Ashape[0] * Ashape[1], Ashape[2] * Ashape[3]))
        # Row-wise scaling by broadcasting; same result as multiplying by a
        # dense diagonal matrix, without building that matrix.
        inv_norms = 1. / np.sqrt(np.sum(A ** 2, axis=1))
        A = A * inv_norms[:, np.newaxis]
        W.set_value(A.reshape(Ashape))

    def get_sparse_code(self, state_below):
        """
        Renormalise the filters, then run self.n_steps RMSprop-with-momentum
        steps on the code (starting from self.X) inside theano.scan.

        Stores the scan updates in self._scan_updates, the final code in
        self.Xout and the input in self.state_below; returns self.Xout.
        """

        def _optimization_step(Xt, accum, vt, S):
            '''
            Note that this is the RMSprop update.
            Thus, we are running gradient updates inside scan (the dream)

            TODO: put this in a better place. I tried to make it a method
            of self, but I'm not sure how to tell theano.scan that the
            first argument of the function is a non_sequence
            '''
            rho = .9
            momentum = .9
            lr = self.lr
            Y = self.transformer.lmul(Xt)
            err = (S - Y) ** 2
            l1 = T.sqrt(Xt**2 + 1e-6)
            cost = err.sum() + .1 * l1.sum()
            gX = T.grad(cost, Xt)
            new_accum = rho * accum + (1-rho) * gX**2
            v = momentum * vt - lr * gX / T.sqrt(new_accum + 1e-8)
            X = Xt + momentum * v - lr * gX / T.sqrt(new_accum + 1e-8)
            return [X, new_accum, v]

        # Renorm W
        self._renormW()

        accum = T.zeros_like(self.X)
        vt = T.zeros_like(self.X)
        [Xfinal, _, _], updates = theano.scan(
            fn=_optimization_step,
            outputs_info=[self.X, accum, vt],
            non_sequences=[state_below],
            n_steps=self.n_steps,
            truncate_gradient=self.truncate)

        self._scan_updates.update(updates)

        self.Xout = Xfinal[-1]
        self.state_below = state_below

        return self.Xout

    @wraps(Layer._modify_updates)
    def _modify_updates(self, updates):
        updates.update(self._scan_updates)

    def get_nonlin_output(self, state_below):
        """
        Max-pool the shared code self.X and apply the layer non-linearity.

        NOTE(review): state_below is ignored and the shared variable self.X
        is used rather than the optimised code self.Xout -- confirm this is
        intended.
        """
        rval = max_pool(self.X, self.pool_shape)
        rval = self.nonlin.apply(rval)
        return rval

    @wraps(Layer.fprop)
    def fprop(self, state_below):
        self.input_space.validate(state_below)
        # Always build the code optimisation graph (also fills
        # self._scan_updates).
        rval = self.get_sparse_code(state_below)
        if self.fprop_code == True:
            rval = self.get_nonlin_output(state_below)
        else:
            # Fprops the filtered input instead
            rval = self.transformer.lmul(rval)

        self.output_space.validate(rval)

        return rval
class Training(PickleMixin, TheanoMixin):
    """
    Main training loop: builds the update graph from the model, cost and
    optimizer, then iterates over the data, invoking named extension hooks
    at fixed points.

    Parameters
    ----------
    name : stored as self.name
    data : iterable yielding batches that are unpacked into the compiled
        cost function
    model : object exposing inputs, params (dict-like), updates and nodes
    optimizer : object exposing get_updates(grads); receives lr_scalers
    cost : symbolic cost differentiated w.r.t. the model parameters
    outputs : output(s) of the compiled function (wrapped with tolist)
    debug_print : if truthy, the compiled graph is printed
    trainlog : existing log to continue from; a new TrainLog if None
    extension : iterable of extension objects with .name and .exe(trainer),
        or None
    """
    def __init__(self, name, data, model, optimizer, cost, outputs,
                 debug_print=0, trainlog=None, extension=None):
        self.name = name
        self.data = data
        self.model = model
        self.optimizer = optimizer
        self.inputs = model.inputs
        self.cost = cost
        self.outputs = tolist(outputs)
        self.updates = OrderedDict()
        self.updates.update(model.updates)
        self.extension = extension
        self.debug_print = debug_print

        # Set up logging state before the graph is built: extensions are
        # already invoked from build_training_graph and receive this object.
        if trainlog is None:
            self.trainlog = TrainLog()
        else:
            self.trainlog = trainlog
        self.endloop = 0

        lr_scalers = OrderedDict()
        for node in self.model.nodes:
            lr_scalers[node.name] = node.lr_scaler
        self.optimizer.lr_scalers = lr_scalers

        t0 = time.time()
        self.cost_fn = self.build_training_graph()
        print("Elapsed compilation time: %f" % (time.time() - t0))
        if self.debug_print:
            from theano.printing import debugprint
            debugprint(self.cost_fn)

    def build_training_graph(self):
        """
        Compute gradients, let the optimizer turn them into updates, merge
        those into self.updates and return the compiled training function.
        Extension hooks run before the gradient, after it, and after the
        optimizer updates are merged.
        """
        self.run_extension('ext_regularize_pre_grad')
        params = self.model.params.values()
        self.grads = OrderedDict(izip(params, T.grad(self.cost, params)))
        self.run_extension('ext_grad')
        param_updates = self.optimizer.get_updates(self.grads)
        for key, val in param_updates.items():
            self.updates[key] = val
        self.run_extension('ext_regularize_post_grad')
        return self.build_theano_graph(self.inputs, self.outputs, self.updates)

    def run(self):
        """Run epochs until run_epoch reports that training has ended."""
        logger.info("Entering main loop")
        while self.run_epoch():
            pass
        logger.info("Terminating main loop")

    def run_epoch(self):
        """
        Process every batch once, recording the time and output of each
        update in the train log.

        Returns False when end_training() is truthy after the epoch (after a
        final monitor/save pass), True otherwise.
        """
        for batch in self.data:
            self.run_extension('ext_monitor')
            self.run_extension('ext_save')
            batch_t0 = time.time()
            this_cost = self.cost_fn(*batch)
            self.trainlog.monitor['time'].append(time.time() - batch_t0)
            self.trainlog.monitor['update'].append(this_cost)
            self.trainlog.batch_seen += 1
            self.run_extension('ext_schedule')
        self.trainlog.epoch_seen += 1
        self.run_extension('ext_term')
        if self.end_training():
            self.run_extension('ext_monitor')
            self.run_extension('ext_save')
            return False
        return True

    def find_extension(self, name):
        """
        Return (found, extensions): found is 1 and extensions the list of
        extensions whose .name equals `name`, or found is 0 when there is no
        match. When no usable extension collection is configured the result
        is (0, None).
        """
        if self.extension is None:
            return (0, None)
        try:
            exts = [extension for extension in self.extension
                    if extension.name == name]
        except Exception:
            # Best effort, as before -- but no longer swallowing
            # KeyboardInterrupt / SystemExit like the bare `except:` did.
            return (0, None)
        return_val = 1 if len(exts) > 0 else 0
        return return_val, exts

    def run_extension(self, name):
        """Execute every extension registered under `name`, passing self."""
        tok, exts = self.find_extension(name)
        if tok:
            for ext in exts:
                ext.exe(self)

    def end_training(self):
        """Return the termination flag (self.endloop)."""
        return self.endloop
class Training(PickleMixin, TheanoMixin):
    """
    Main training loop: builds the update graph from the model, cost and
    optimizer, then iterates over the data, invoking named extension hooks
    at fixed points.

    Parameters
    ----------
    name : stored as self.name
    data : iterable yielding batches that are unpacked into the compiled
        cost function
    model : object exposing inputs, params (dict-like), updates and nodes
    optimizer : object exposing get_updates(grads); receives lr_scalers
    cost : symbolic cost differentiated w.r.t. the model parameters
    outputs : output(s) of the compiled function (wrapped with tolist)
    debug_print : if truthy, the compiled graph is printed
    trainlog : existing log to continue from; a new TrainLog if None
    extension : iterable of extension objects with .name and .exe(trainer),
        or None
    """
    def __init__(self, name, data, model, optimizer, cost, outputs,
                 debug_print=0, trainlog=None, extension=None):
        self.name = name
        self.data = data
        self.model = model
        self.optimizer = optimizer
        self.inputs = model.inputs
        self.cost = cost
        self.outputs = tolist(outputs)
        self.updates = OrderedDict()
        self.updates.update(model.updates)
        self.extension = extension
        self.debug_print = debug_print

        # Set up logging state before the graph is built: extensions are
        # already invoked from build_training_graph and receive this object.
        if trainlog is None:
            self.trainlog = TrainLog()
        else:
            self.trainlog = trainlog
        self.endloop = 0

        lr_scalers = OrderedDict()
        for node in self.model.nodes:
            lr_scalers[node.name] = node.lr_scaler
        self.optimizer.lr_scalers = lr_scalers

        t0 = time.time()
        self.cost_fn = self.build_training_graph()
        print("Elapsed compilation time: %f" % (time.time() - t0))
        if self.debug_print:
            from theano.printing import debugprint
            debugprint(self.cost_fn)

    def build_training_graph(self):
        """
        Compute gradients, let the optimizer turn them into updates, merge
        those into self.updates and return the compiled training function.
        Extension hooks run before the gradient, after it, and after the
        optimizer updates are merged.
        """
        self.run_extension('ext_regularize_pre_grad')
        params = self.model.params.values()
        self.grads = OrderedDict(izip(params, T.grad(self.cost, params)))
        self.run_extension('ext_grad')
        param_updates = self.optimizer.get_updates(self.grads)
        for key, val in param_updates.items():
            self.updates[key] = val
        self.run_extension('ext_regularize_post_grad')
        return self.build_theano_graph(self.inputs, self.outputs, self.updates)

    def run(self):
        """Run epochs until run_epoch reports that training has ended."""
        logger.info("Entering main loop")
        while self.run_epoch():
            pass
        logger.info("Terminating main loop")

    def run_epoch(self):
        """
        Process every batch once, recording the time and output of each
        update in the train log.

        Returns False when end_training() is truthy after the epoch (after a
        final monitor/save pass), True otherwise.
        """
        for batch in self.data:
            self.run_extension('ext_monitor')
            self.run_extension('ext_save')
            batch_t0 = time.time()
            this_cost = self.cost_fn(*batch)
            self.trainlog.monitor['time'].append(time.time() - batch_t0)
            self.trainlog.monitor['update'].append(this_cost)
            self.trainlog.batch_seen += 1
            self.run_extension('ext_schedule')
        self.trainlog.epoch_seen += 1
        self.run_extension('ext_term')
        if self.end_training():
            self.run_extension('ext_monitor')
            self.run_extension('ext_save')
            return False
        return True

    def find_extension(self, name):
        """
        Return (found, extensions): found is 1 and extensions the list of
        extensions whose .name equals `name`, or found is 0 when there is no
        match. When no usable extension collection is configured the result
        is (0, None).
        """
        if self.extension is None:
            return (0, None)
        try:
            exts = [extension for extension in self.extension
                    if extension.name == name]
        except Exception:
            # Best effort, as before -- but no longer swallowing
            # KeyboardInterrupt / SystemExit like the bare `except:` did.
            return (0, None)
        return_val = 1 if len(exts) > 0 else 0
        return return_val, exts

    def run_extension(self, name):
        """Execute every extension registered under `name`, passing self."""
        tok, exts = self.find_extension(name)
        if tok:
            for ext in exts:
                ext.exe(self)

    def end_training(self):
        """Return the termination flag (self.endloop)."""
        return self.endloop
def get_func(learn_discriminator, learn_generator, dont_you_fucking_dare_touch_the_generator=False):
    """
    Compile and return the update function for exactly one of the two
    sub-models (discriminator or generator).

    Exactly one of learn_discriminator / learn_generator must be true.
    When the third flag is set, it is additionally asserted that no
    generator parameter appears in the compiled updates.

    Relies on names from the enclosing scope (model, params, grads,
    lr_scalers, learning_rate, cost_value, theano_args, self, log).
    """
    updates = OrderedDict()

    assert (learn_discriminator or learn_generator) and not (learn_discriminator and learn_generator)

    if learn_discriminator:
        cur_params = model.discriminator.get_params()
    else:
        cur_params = model.generator.get_params()

    def check():
        # Parameters of the sub-model that is NOT being trained must never
        # receive an update.
        for param in params:
            if param not in cur_params:
                assert param not in updates

    cur_grads = OrderedDict((param, grads[param]) for param in cur_params)

    # Give anonymous gradients a readable name and verify dtypes.
    for param in grads:
        grad_var = grads[param]
        if grad_var.name is None and cost_value is not None:
            grad_var.name = ('grad(%(costname)s, %(paramname)s)' %
                             {'costname': cost_value.name,
                              'paramname': param.name})
        assert grads[param].dtype == param.dtype

    cur_lr_scalers = OrderedDict()
    for param in cur_params:
        if param in lr_scalers:
            cur_lr_scalers[param] = lr_scalers[param]

    log.info('Parameter and initial learning rate summary:')
    for param in cur_params:
        param_name = 'anon_param' if param.name is None else param.name
        lr = learning_rate.get_value() * cur_lr_scalers.get(param,1.)
        log.info('\t' + param_name + ': ' + str(lr))

    rule_updates = self.learning_rule.get_updates(
        learning_rate, cur_grads, cur_lr_scalers)
    updates.update(rule_updates)
    check()

    for param in cur_params:
        if updates[param].name is None:
            updates[param].name = 'sgd_update(' + param.name + ')'
    check()

    model.modify_updates(updates)
    check()

    for param in cur_params:
        update = updates[param]
        if update.name is None:
            update.name = 'censor(sgd_update(' + param.name + '))'
        for update_val in get_debug_values(update):
            if np.any(np.isinf(update_val)):
                raise ValueError("debug value of %s contains infs" %
                        update.name)
            if np.any(np.isnan(update_val)):
                raise ValueError("debug value of %s contains nans" %
                        update.name)
    check()

    if dont_you_fucking_dare_touch_the_generator:
        for param in model.generator.get_params():
            assert param not in updates

    with log_timing(log, 'Compiling sgd_update'):
        return function(theano_args,
                        updates=updates,
                        name='sgd_update',
                        on_unused_input='ignore',
                        mode=self.theano_function_mode)
def __init__(self, objective, params, inputs = None,
        param_constrainers = None, max_iter = -1,
        lr_scalers = None, verbose = 0, tol = None,
        init_alpha = None, min_init_alpha = 1e-3,
        reset_alpha = True, conjugate = False,
        reset_conjugate = True, gradients = None,
        gradient_updates = None, line_search_mode = None,
        accumulate = False, theano_function_mode=None):
    """
    Compile every function needed for batch gradient descent on
    `objective` with respect to `params`.

    Functions built here (all stored on self):
      _compute_grad   -- writes the gradient of each param into a shared
                         buffer (plus any `gradient_updates`)
      obj             -- evaluates the objective
      _cache_values   -- copies every param into its cache
      _goto_alpha     -- sets each param to cache - alpha * scale * grad
      _normalize_grad -- divides the gradient buffers by their global norm
                         and returns that norm
      _store_old_grad, _make_conjugate -- only when `conjugate` is true

    Parameters read in this constructor: `gradients` (optional dict of
    precomputed symbolic gradients), `lr_scalers` (optional per-param
    multipliers for alpha), `param_constrainers` (callables applied to the
    goto updates), `line_search_mode` (None or 'exhaustive'; selects the
    default `init_alpha`), `accumulate` (use Accumulator instead of a
    plain compiled function), `tol` (default depends on objective dtype).
    All arguments are also stored as attributes of the same name.
    """

    # Store every constructor argument as an attribute, then drop the
    # accidental `self.self` entry that locals() contains.
    self.__dict__.update(locals())
    del self.self

    # Default step sizes depend on the line search mode.
    if line_search_mode is None:
        if init_alpha is None:
            init_alpha = (.001, .005, .01, .05, .1)
    else:
        assert line_search_mode == 'exhaustive'
        if init_alpha is None:
            init_alpha = (.5, 1.)
    self.init_alpha = tuple([float(elem) for elem in init_alpha])

    # NOTE: these defaults only rebind the local names; self.inputs and
    # self.param_constrainers keep the value that was passed in (possibly
    # None) because __dict__ was filled above.
    if inputs is None:
        inputs = []

    if param_constrainers is None:
        param_constrainers = []

    obj = objective

    self.verbose = verbose

    param_to_grad_sym = OrderedDict()
    param_to_grad_shared = OrderedDict()
    updates = OrderedDict()
    if self.gradient_updates is not None:
        updates.update(self.gradient_updates)

    self.params = [ param for param in params ]

    # For each parameter: take the supplied gradient or differentiate the
    # objective, and create a zero-filled shared buffer of the same shape
    # that _compute_grad will overwrite with that gradient.
    for param in params:
        if self.gradients is not None and param in self.gradients:
            g = self.gradients[param]
        else:
            g = grad(objective, param)
        param_to_grad_sym[param] = g
        if param.name is not None:
            param_name = param.name
        else:
            param_name = 'anon_param'
        grad_name = 'BatchGradientDescent.grad_' + param_name
        grad_shared = sharedX( param.get_value() * 0., name=grad_name)
        param_to_grad_shared[param] = grad_shared
        updates[grad_shared] = g

    self.param_to_grad_shared = param_to_grad_shared

    if self.verbose:
        logger.info('batch gradient class compiling gradient function')
    t1 = time.time()
    if self.accumulate:
        self._compute_grad = Accumulator(inputs, updates = updates)
    else:
        self._compute_grad = function(inputs, updates = updates, mode=self.theano_function_mode,
                name='BatchGradientDescent._compute_grad')
    if self.verbose:
        t2 = time.time()
        logger.info('done. Took {0}'.format(t2-t1))

    if self.verbose:
        logger.info('batch gradient class compiling objective function')
    if self.accumulate:
        self.obj = Accumulator(inputs, obj)
    else:
        self.obj = function(inputs, obj, mode=self.theano_function_mode,
                name='BatchGradientDescent.obj')

    if self.verbose:
        logger.info('done')

    self.param_to_cache = OrderedDict()
    # Symbolic step size used by _goto_alpha.
    alpha = T.scalar(name = 'alpha')
    alpha.tag.test_value = np.cast[alpha.dtype](.01)
    cache_updates = OrderedDict()
    goto_updates = OrderedDict()
    # Build, per parameter, the "save to cache" update and the
    # "move to cache - alpha * grad" update.
    for param in params:
        if param.name is None:
            param_name = 'anon_param'
        else:
            param_name = param.name
        cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
        self.param_to_cache[param] = sharedX(param.get_value(borrow=False), name=cache_name)
        cache_updates[self.param_to_cache[param]] = param
        cached = self.param_to_cache[param]
        g = self.param_to_grad_shared[param]
        if lr_scalers is not None and param in lr_scalers:
            scaled_alpha = alpha * lr_scalers[param]
        else:
            scaled_alpha = alpha
        mul = scaled_alpha * g
        diff = cached - mul
        goto_updates[param] = diff
    self._cache_values = function([], updates = cache_updates, mode=self.theano_function_mode, name='BatchGradientDescent._cache_values')
    assert isinstance(param_constrainers, (list, tuple))
    # Constrainers receive the update dictionary itself (their return
    # value is not used here).
    for param_constrainer in param_constrainers:
        param_constrainer(goto_updates)
    self._goto_alpha = function([alpha], updates=goto_updates,
            mode=self.theano_function_mode, name='BatchGradientDescent._goto_alpha')

    # Global L2 norm over all gradient buffers.
    norm = T.sqrt(sum([T.sqr(elem).sum() for elem in self.param_to_grad_shared.values()]))
    norm.name = 'BatchGradientDescent.norm'
    normalize_grad_updates = OrderedDict()
    for grad_shared in self.param_to_grad_shared.values():
        normalize_grad_updates[grad_shared] = grad_shared / norm

    # useful for monitoring
    self.ave_grad_size = sharedX(0.)
    self.new_weight = sharedX(1.)
    # Weighted running average of the gradient norm.
    normalize_grad_updates[self.ave_grad_size] = self.new_weight * norm + (1.-self.new_weight) * self.ave_grad_size

    self._normalize_grad = function([], norm, updates=normalize_grad_updates, mode=self.theano_function_mode,
            name='BatchGradientDescent._normalize_grad')

    if self.conjugate:
        grad_shared = self.param_to_grad_shared.values()

        # One "previous gradient" buffer per gradient buffer.
        grad_to_old_grad = OrderedDict()
        for elem in grad_shared:
            grad_to_old_grad[elem] = sharedX(elem.get_value(), 'old_'+elem.name)

        # Stores gradient * norm; `norm` is used directly as the function
        # input here.
        self._store_old_grad = function([norm], updates = OrderedDict([(grad_to_old_grad[g_], g_ * norm)
            for g_ in grad_to_old_grad]), mode=self.theano_function_mode,
            name='BatchGradientDescent._store_old_grad')

        grad_ordered = list(grad_to_old_grad.keys())
        old_grad_ordered = [grad_to_old_grad[g_] for g_ in grad_ordered]

        def dot_product(x, y):
            # Sum of elementwise products over two equally long lists of
            # tensors.
            return sum([ (x_elem * y_elem).sum() for x_elem, y_elem in safe_zip(x, y) ])

        beta_pr = (dot_product(grad_ordered, grad_ordered) - dot_product(grad_ordered, old_grad_ordered)) / \
                (1e-7+dot_product(old_grad_ordered, old_grad_ordered))
        assert beta_pr.ndim == 0

        beta = T.maximum(beta_pr, 0.)

        #beta_pr is the Polak-Ribiere formula for beta.
        #According to wikipedia, the beta to use for NCG is "a matter of heuristics or taste"
        #but max(0, beta_pr) is "a popular choice... which provides direction reset automatically."
        #(ie, it is meant to revert to steepest descent when you have traveled far enough that
        #the objective function is behaving non-quadratically enough that the conjugate gradient
        #formulas aren't working anymore)

        #http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method

        # NOTE(review): `grad` here is the name used above to differentiate
        # the objective, not a gradient buffer -- presumably a sanity check
        # against name shadowing; confirm.
        assert grad not in grad_to_old_grad

        make_conjugate_updates = [(g_, g_ + beta * grad_to_old_grad[g_]) for g_ in grad_ordered]

        # When the compilation mode carries a `record`, log the variables
        # and updates before compiling.
        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            for v, u in make_conjugate_updates:
                mode.record.handle_line('BatchGradientDescent._make_conjugate var ' \
                        + var_descriptor(v) + '\n')
                mode.record.handle_line('BatchGradientDescent._make_conjugate update ' \
                        + var_descriptor(u) + '\n')

        self._make_conjugate = function([], updates=make_conjugate_updates,
                mode=self.theano_function_mode, name='BatchGradientDescent._make_conjugate')

        # ... and the compiled outputs afterwards.
        if mode is not None and hasattr(mode, 'record'):
            for output in self._make_conjugate.maker.fgraph.outputs:
                mode.record.handle_line('BatchGradientDescent._make_conjugate output ' \
                        + var_descriptor(output) + '\n')

    # Default tolerance depends on the precision of the objective.
    if tol is None:
        if objective.dtype == "float32":
            self.tol = 1e-6
        else:
            self.tol = 3e-7
    else:
        self.tol = tol

    self.ave_step_size = sharedX(0.)
    self.ave_grad_mult = sharedX(0.)
class SGD_Optimizer():
    """Minibatch SGD driver (optionally with momentum) built on theano.

    Construction compiles two theano functions:

    * ``calc_cost`` evaluates ``costs`` on ``inputs`` without updating
      anything (used for validation).
    * ``f`` evaluates ``costs`` and applies the parameter updates.  It takes
      the model inputs followed by the learning rate and, when momentum is
      enabled, the momentum rate.
    """

    def __init__(self, params, inputs, costs, updates_old=None,
                 consider_constant=None, momentum=True):
        """
        params: parameters of the model
        inputs: list of symbolic inputs to the graph
        costs: list of costs to be evaluated. The first element MUST be the
            objective.
        updates_old: OrderedDict from previous graphs that need to be
            accounted for by SGD, typically when scan is used.
        consider_constant: list of theano variables that are passed on to
            the grad method. Typically RBM.  Defaults to an empty list.
        momentum: when true, one momentum buffer is allocated per parameter
            and the compiled update function takes a momentum rate.
        """
        assert isinstance(costs, list), (
            "The costs given to the SGD class must be a list, even for one element.")

        self.inputs = inputs
        self.params = params
        self.momentum = momentum
        if self.momentum:
            # One zero-initialised buffer per parameter, same shape.
            self.params_mom = [
                theano.shared(
                    value=numpy.zeros(param.get_value().shape,
                                      dtype=theano.config.floatX),
                    name=param.name + '_mom')
                for param in self.params
            ]
        self.costs = costs
        self.num_costs = len(costs)
        self.updates_old = updates_old
        # A fresh list per instance instead of a shared mutable default.
        if consider_constant is None:
            consider_constant = []
        self.consider_constant = consider_constant
        self.build_train_fn()

    def build_train_fn(self, ):
        """Build the symbolic updates and compile ``calc_cost`` and ``f``."""
        self.lr_theano = T.scalar('lr')
        self.grad_inputs = self.inputs + [self.lr_theano]
        if self.momentum:
            self.mom_theano = T.scalar('mom')
            self.grad_inputs = self.grad_inputs + [self.mom_theano]

        # Gradients are taken with respect to the first cost only.
        self.gparams = T.grad(self.costs[0], self.params,
                              consider_constant=self.consider_constant)

        if not self.momentum:
            print('Building SGD optimization graph without momentum')
            updates = OrderedDict(
                (param, param - self.lr_theano * gparam)
                for param, gparam in zip(self.params, self.gparams))
        else:
            print('Building SGD optimization graph with momentum')
            updates = OrderedDict()
            for param, param_mom, gparam in zip(self.params, self.params_mom,
                                                self.gparams):
                param_inc = (self.mom_theano * param_mom
                             - self.lr_theano * gparam)
                updates[param_mom] = param_inc
                updates[param] = param + param_inc

        self.calc_cost = theano.function(self.inputs, self.costs)

        if self.updates_old:
            # Work on a copy so that the caller's dict (possibly owned by the
            # model) is not modified when the SGD updates are merged in.
            merged = copy.copy(self.updates_old)
        else:
            merged = OrderedDict()
        merged.update(updates)
        self.updates_old = merged

        self.f = theano.function(self.grad_inputs, self.costs,
                                 updates=self.updates_old)

    def train(self, train_set, valid_set=None, learning_rate=0.1,
              num_epochs=500, save=False, output_folder=None,
              lr_update=None, mom_rate=0.9):
        """Run up to ``num_epochs`` passes over ``train_set``.

        train_set / valid_set: objects whose ``iterate(True)`` yields lists
            of input values for the compiled functions.
        learning_rate: initial learning rate.
        save: when true, parameters are pickled each time a new best cost is
            reached.
        output_folder: directory used by ``save_model`` (current directory
            when not given).
        lr_update: when truthy, the learning rate is annealed after every
            epoch through ``update_lr``.
        mom_rate: momentum rate passed to the update function when momentum
            is enabled.

        A KeyboardInterrupt stops training without propagating.
        """
        self.best_cost = numpy.inf
        self.init_lr = learning_rate
        self.lr = numpy.array(learning_rate)
        self.mom_rate = mom_rate
        self.output_folder = output_folder
        self.train_set = train_set
        self.valid_set = valid_set
        self.save = save
        self.lr_update = lr_update
        try:
            for u in range(num_epochs):
                cost = []
                for i in self.train_set.iterate(True):
                    inputs = i + [self.lr]
                    if self.momentum:
                        inputs = inputs + [self.mom_rate]
                    cost.append(self.f(*inputs))
                mean_costs = numpy.mean(cost, axis=0)
                print(' Epoch %i ' % (u + 1))
                print('***Train Results***')
                for i in range(self.num_costs):
                    print("Cost %i: %f" % (i, mean_costs[i]))
                if not valid_set:
                    # Without a validation set, model selection uses the
                    # training objective, i.e. the first cost.
                    this_cost = numpy.absolute(mean_costs)[0]
                    if this_cost < self.best_cost:
                        self.best_cost = this_cost
                        print('Best Params!')
                        if save:
                            self.save_model()
                    sys.stdout.flush()
                else:
                    self.perform_validation()
                if lr_update:
                    self.update_lr(u + 1, begin_anneal=1)
        except KeyboardInterrupt:
            print('Training interrupted.')

    def perform_validation(self, ):
        """Evaluate the costs on the validation set and track the best one.

        The second cost (index 1) is the selection metric, so at least two
        costs are required when a validation set is used.
        """
        cost = []
        for i in self.valid_set.iterate(True):
            cost.append(self.calc_cost(*i))
        mean_costs = numpy.mean(cost, axis=0)
        print('***Validation Results***')
        for i in range(self.num_costs):
            print("Cost %i: %f" % (i, mean_costs[i]))

        this_cost = numpy.absolute(mean_costs)[1]  # Using accuracy as metric
        if this_cost < self.best_cost:
            self.best_cost = this_cost
            print('Best Params!')
            if self.save:
                self.save_model()

    def save_model(self, ):
        """Pickle a copy of the current parameter values.

        The file is ``best_params.pickle`` inside ``self.output_folder``
        (created if missing), or in the current directory when no folder was
        given.
        """
        best_params = [param.get_value().copy() for param in self.params]

        if not self.output_folder:
            save_path = 'best_params.pickle'
        else:
            if not os.path.exists(self.output_folder):
                os.makedirs(self.output_folder)
            save_path = os.path.join(self.output_folder, 'best_params.pickle')

        # The context manager guarantees the handle is flushed and closed.
        with open(save_path, 'w') as handle:
            cPickle.dump(best_params, handle)

    def update_lr(self, count, update_type='annealed', begin_anneal=500.,
                  min_lr=0.01, decay_factor=1.2):
        """Recompute ``self.lr`` from ``self.init_lr`` for step ``count``.

        'annealed': lr = init_lr * min(1, begin_anneal / count).
        'exponential': lr = init_lr / decay_factor ** count, floored at
            ``min_lr``.
        Any other ``update_type`` leaves the learning rate unchanged.
        """
        if update_type == 'annealed':
            scale_factor = float(begin_anneal) / count
            self.lr = self.init_lr * min(1., scale_factor)
        elif update_type == 'exponential':
            new_lr = float(self.init_lr) / (decay_factor ** count)
            self.lr = max(new_lr, min_lr)
def get_func(learn_discriminator, learn_generator):
    """Compile the SGD update function for exactly one of the two networks.

    Exactly one of `learn_discriminator` / `learn_generator` must be true;
    it selects whether the parameters of `model.discriminator` or of
    `model.generator` are updated.

    Uses `model`, `grads`, `cost_value`, `lr_scalers`, `learning_rate`,
    `theano_args`, `log` and `self` from the enclosing scope.

    Returns the compiled theano function taking `theano_args` and applying
    the updates of the selected network.

    Raises ValueError if a debug value of an update contains inf or nan.
    """
    updates = OrderedDict()

    # Exactly one network is trained per compiled function.
    assert (learn_discriminator or learn_generator) and \
        not (learn_discriminator and learn_generator)

    if learn_discriminator:
        cur_params = model.discriminator.get_params()
    else:
        cur_params = model.generator.get_params()

    # Restrict the gradients to the selected network.
    cur_grads = OrderedDict()
    for param in cur_params:
        cur_grads[param] = grads[param]

    for param in grads:
        if grads[param].name is None and cost_value is not None:
            grads[param].name = ('grad(%(costname)s, %(paramname)s)' % {
                'costname': cost_value.name,
                'paramname': param.name
            })
        assert grads[param].dtype == param.dtype

    # Restrict the learning-rate scalers to the selected network.
    cur_lr_scalers = OrderedDict()
    for param in cur_params:
        if param in lr_scalers:
            cur_lr_scalers[param] = lr_scalers[param]

    log.info('Parameter and initial learning rate summary:')
    for param in cur_params:
        param_name = param.name
        if param_name is None:
            param_name = 'anon_param'
        lr = learning_rate.get_value() * cur_lr_scalers.get(param, 1.)
        log.info('\t' + param_name + ': ' + str(lr))

    if self.learning_rule:
        updates.update(
            self.learning_rule.get_updates(learning_rate, cur_grads,
                                           cur_lr_scalers))
    else:
        # Use standard SGD updates with fixed learning rate.  Only the
        # selected network's parameters are updated, mirroring the
        # learning-rule branch above; updating every parameter here would
        # train both networks at once.
        for param in cur_params:
            updates[param] = (param - learning_rate *
                              cur_lr_scalers.get(param, 1.) *
                              cur_grads[param])

    for param in cur_params:
        if updates[param].name is None:
            updates[param].name = 'sgd_update(' + param.name + ')'

    # Let the model post-process the updates.
    model.modify_updates(updates)

    for param in cur_params:
        update = updates[param]
        if update.name is None:
            update.name = 'censor(sgd_update(' + param.name + '))'
        for update_val in get_debug_values(update):
            if np.any(np.isinf(update_val)):
                raise ValueError("debug value of %s contains infs" %
                                 update.name)
            if np.any(np.isnan(update_val)):
                raise ValueError("debug value of %s contains nans" %
                                 update.name)

    with log_timing(log, 'Compiling sgd_update'):
        return function(theano_args,
                        updates=updates,
                        name='sgd_update',
                        on_unused_input='ignore',
                        mode=self.theano_function_mode)
def rescale_dropout_fprop(self, state_below, default_input_include_prob=0.5,
                          input_include_probs=None, default_input_scale=2.,
                          input_scales=None, per_example=True):
    """
    Forward-propagate through every layer, applying dropout to each
    layer's input, and collect the layers' dynamic scales along the way.

    For each layer the inclusion probability and the scale factor are
    looked up by layer name in `input_include_probs` / `input_scales`;
    layers that are not listed use the corresponding default.

    Parameters
    ----------
    state_below : theano variable
        The input to the MLP
    default_input_include_prob : float
        Inclusion probability for layers absent from
        `input_include_probs`.
    input_include_probs : dict, optional
        Maps layer names to inclusion probabilities.
    default_input_scale : float
        Scale factor for layers absent from `input_scales`.
    input_scales : dict, optional
        Maps layer names to scale factors.
    per_example : bool, optional
        Sample a different mask value for every example in a batch.
        Defaults to `True`. If `False`, sample one mask per mini-batch.

    Returns
    -------
    state_below
        Output of the last layer.
    dynamic_scale : OrderedDict
        Union of `layer.dynamic_scale(...)` over the layers that define
        `dynamic_scale`, each called on its dropped-out input.
    """
    warnings.warn("dropout doesn't use fixed_var_descr so it won't work "
                  "with algorithms that make more than one theano "
                  "function call per batch, such as BGD. Implementing "
                  "fixed_var descr could increase the memory usage "
                  "though.")

    include_probs = {} if input_include_probs is None else input_include_probs
    scales = {} if input_scales is None else input_scales

    # Reject names that do not correspond to a layer of this MLP.
    self._validate_layer_names(list(include_probs.keys()))
    self._validate_layer_names(list(scales.keys()))

    theano_rng = MRG_RandomStreams(max(self.rng.randint(2**15), 1))

    dynamic_scale = OrderedDict()
    current = state_below
    for layer in self.layers:
        name = layer.layer_name

        current = self.apply_dropout(
            state=current,
            include_prob=include_probs.get(name, default_input_include_prob),
            theano_rng=theano_rng,
            scale=scales.get(name, default_input_scale),
            mask_value=layer.dropout_input_mask_value,
            input_space=layer.get_input_space(),
            per_example=per_example)

        if not hasattr(layer, 'dynamic_scale'):
            print('%s %s' % ('skipping', layer.layer_name))
        else:
            dynamic_scale.update(layer.dynamic_scale(current))

        current = layer.fprop(current)

    return current, dynamic_scale
class Training(PickleMixin, TheanoMixin):
    """
    Main training loop: compiles the update function for `model` and runs
    it over `data`, epoch after epoch, until an extension requests
    termination by setting `endloop`.

    For every batch a 0/1 vector of length `n_steps` is sampled from a
    Bernoulli distribution with probability `schedRate` and appended to the
    batch before calling the compiled function.  After each epoch
    `schedRate` is set to ``k / (k + exp(epoch_seen / k))``.

    Parameters
    ----------
    name : name of this training run
    data : iterable of batches (tuples of arrays)
    model : object exposing `inputs`, `updates`, `nodes` and `params`
    optimizer : object exposing `get_updates(grads)` and `lr`
    cost : theano scalar to differentiate
    outputs : output variable(s) of the compiled function
    n_steps : length of the per-batch Bernoulli vector
    debug_print : when truthy, debug-print the compiled function
    trainlog : existing TrainLog to continue from; a new one by default
    extension : list of extensions, or None
    lr_iterations : mapping ``epoch limit -> learning rate``, or None
    decay_schedule : stored on the instance; not used by this class
    k_speedOfconvergence : the constant `k` of the schedule above
    """

    def __init__(self, name, data, model, optimizer, cost, outputs, n_steps,
                 debug_print=0, trainlog=None, extension=None,
                 lr_iterations=None, decay_schedule=2,
                 k_speedOfconvergence=40):
        self.name = name
        self.model = model
        # Start from a private copy of the model's updates.
        updates = OrderedDict()
        updates.update(model.updates)
        self._configure(data, optimizer, cost, outputs, n_steps, updates,
                        debug_print, trainlog, extension, lr_iterations,
                        decay_schedule, k_speedOfconvergence)

    def restore(self, data, optimizer, cost, outputs, n_steps, debug_print=0,
                trainlog=None, extension=None, lr_iterations=None,
                decay_schedule=2, k_speedOfconvergence=40):
        """Rebuild the non-pickled state of an instance and recompile.

        `name` and `model` are expected to be set already.  Takes the same
        arguments as the constructor apart from those two.
        """
        # NOTE: unlike the constructor, the model's own update dictionary is
        # used directly, so the optimizer updates added while building the
        # graph are written into `self.model.updates`.
        self._configure(data, optimizer, cost, outputs, n_steps,
                        self.model.updates, debug_print, trainlog, extension,
                        lr_iterations, decay_schedule, k_speedOfconvergence)

    def _configure(self, data, optimizer, cost, outputs, n_steps, updates,
                   debug_print, trainlog, extension, lr_iterations,
                   decay_schedule, k_speedOfconvergence):
        # Shared setup of __init__ and restore: store settings, compile.
        self.data = data
        self.optimizer = optimizer
        self.inputs = self.model.inputs
        self.cost = cost
        self.outputs = tolist(outputs)
        self.updates = updates
        self.extension = extension
        self.debug_print = debug_print

        lr_scalers = OrderedDict()
        for node in self.model.nodes:
            lr_scalers[node.name] = node.lr_scaler
        self.optimizer.lr_scalers = lr_scalers
        self.nBernoulli = np.ones((n_steps, ))

        t0 = time.time()
        self.cost_fn = self.build_training_graph()
        print("Elapsed compilation time: %f" % (time.time() - t0))
        if self.debug_print:
            from theano.printing import debugprint
            debugprint(self.cost_fn)

        if trainlog is None:
            self.trainlog = TrainLog()
        else:
            self.trainlog = trainlog
        self.endloop = 0
        self.lr_iterations = lr_iterations
        self.lastBatchlastPoch = 0
        self.decay_schedule = decay_schedule
        self.k = k_speedOfconvergence
        self.schedRate = 1
        self.n_steps = n_steps

    def build_training_graph(self):
        """Compute gradients, merge optimizer updates, compile the function.

        Extensions named 'ext_regularize_pre_grad', 'ext_grad' and
        'ext_regularize_post_grad' are run around the gradient step.
        """
        self.run_extension('ext_regularize_pre_grad')
        self.grads = OrderedDict(
            izip(self.model.params.values(),
                 T.grad(self.cost, self.model.params.values())))
        self.run_extension('ext_grad')
        grads = self.optimizer.get_updates(self.grads)
        for key, val in grads.items():
            self.updates[key] = val
        self.run_extension('ext_regularize_post_grad')
        print(type(self.inputs), len(self.inputs))
        #self.inputs.append(self.nBernoulli)
        return self.build_theano_graph(self.inputs, self.outputs,
                                       self.updates)

    def run(self):
        """Run epochs until `run_epoch` reports that training has ended."""
        logger.info("Entering main loop")
        while self.run_epoch():
            pass
        logger.info("Terminating main loop")

    def run_epoch(self):
        """Train for one pass over `self.data`.

        Returns False when training should stop, True otherwise.
        """
        self.trainlog.lastBatchlastEpoch = self.trainlog.batch_seen
        for batch in self.data:
            self.run_extension('ext_monitor')
            self.run_extension('ext_save')
            batch_t0 = time.time()
            # One Bernoulli(schedRate) draw per step, passed as the last
            # argument of the compiled function.
            nBernoulli = [
                np.random.binomial(1, self.schedRate)
                for i in range(self.n_steps)
            ]
            nBernoulli = np.asarray(nBernoulli)
            nBernoulli = np.reshape(nBernoulli, (self.n_steps, ))
            batchAux = (batch + (nBernoulli, ))
            this_cost = self.cost_fn(*batchAux)
            self.trainlog.monitor['time'].append(time.time() - batch_t0)
            self.trainlog.monitor['update'].append(this_cost)
            self.trainlog.batch_seen += 1
            self.run_extension('ext_schedule')
        self.trainlog.epoch_seen += 1

        first = self.trainlog.epoch_seen / float(self.k)
        second = self.k + exp(first)
        self.schedRate = self.k / second

        # `lr_iterations` defaults to None, meaning no learning-rate steps.
        if self.lr_iterations is not None:
            for limit, lr_it in self.lr_iterations.items():
                if (limit < self.trainlog.epoch_seen):
                    self.optimizer.lr.set_value(lr_it)
        print("Epoch: {} - seched rate: {}".format(self.trainlog.epoch_seen,
                                                   self.schedRate))

        self.run_extension('ext_term')  ## changes the value of endloop
        if self.end_training():
            self.run_extension('ext_monitor')
            self.run_extension('ext_save')
            return False
        return True

    def find_extension(self, name):
        """Return ``(found, extensions)`` for the extensions named `name`.

        `found` is 1 when at least one extension matches and 0 otherwise.
        When there are no extensions, or they cannot be inspected, the
        result is ``(0, None)``.
        """
        if self.extension is None:
            return (0, None)
        try:
            exts = [
                extension for extension in self.extension
                if extension.name == name
            ]
        except (TypeError, AttributeError):
            # Not iterable, or an entry without a `name` attribute.
            return (0, None)
        return (1 if len(exts) > 0 else 0), exts

    def run_extension(self, name):
        """Execute every extension called `name` on this object."""
        tok, exts = self.find_extension(name)
        if tok:
            for ext in exts:
                ext.exe(self)

    def end_training(self):
        """Return the termination flag set by the 'ext_term' extensions."""
        return self.endloop