import numpy as np

# NOTE: this module assumes neon-style helpers (Layer, BatchNorm, req_param,
# opt_param, default_weight_init, default_lrule_init, and the learning rule
# classes used in init_learning_rule) are importable from the surrounding
# package; their exact import paths are omitted here.


class WeightLayer(Layer):

    """
    Typical hidden layer with weight parameters to be learned.
    """

    def __init__(self, **kwargs):
        super(WeightLayer, self).__init__(**kwargs)
        self.distributable = True
        self.has_params = True
        self.params_initialized = False

    def initialize(self, kwargs):
        super(WeightLayer, self).initialize(kwargs)
        req_param(self, ['nin', 'nout'])
        opt_param(self, ['weight_init'], default_weight_init())
        opt_param(self, ['lrule_init'], default_lrule_init())
        opt_param(self, ['accumulate'], False)
        opt_param(self, ['batch_norm'], False)

        self.weight_init.initialize(self.backend)
        self.params = []
        self.updates = []

        if self.batch_norm:
            self.bn = BatchNorm()
            kwargs['layer'] = self
            self.bn.initialize(kwargs)

    def get_params(self):
        np_params = dict()
        for p in ['weights', 'biases']:
            if hasattr(self, p):
                p_tensor = getattr(self, p)
                # copy to a host-side numpy array
                np_params[p] = np.array(p_tensor.asnumpyarray(),
                                        dtype=p_tensor.dtype).reshape(
                                            p_tensor.shape)
        if self.batch_norm:
            np_params.update(self.bn.get_params())
        np_params.update(self.learning_rule.get_params())
        return np_params

    def set_params(self, params_dict):
        for p in ['weights', 'biases']:
            if p in params_dict:
                getattr(self, p)[:] = params_dict[p]
        if self.batch_norm:
            self.bn.set_params(params_dict)
        self.learning_rule.set_params(params_dict)

    def allocate_param_bufs(self):
        if self.params_initialized:
            return
        make_ebuf = self.backend.empty

        self.weights = self.weight_init.generate(self.weight_shape,
                                                 self.weight_dtype)
        self.weights.name = self.name  # naming weights for timing diagnostics
        self.weight_updates = make_ebuf(self.weight_shape, self.updates_dtype)

        self.use_biases = 'bias_init' in self.weight_init.__dict__
        opt_param(self, ['brule_init'], None)
        if self.use_biases is True:
            self.biases = make_ebuf(self.bias_shape, self.weight_dtype)
            self.biases.fill(self.weight_init.bias_init)
            self.bias_updates = make_ebuf(self.bias_shape, self.updates_dtype)
            self.params.extend([self.weights, self.biases])
            self.updates.extend([self.weight_updates, self.bias_updates])
        else:
            self.params.extend([self.weights])
            self.updates.extend([self.weight_updates])

        if self.accumulate:
            self.utemp = [make_ebuf(x.shape, self.updates_dtype)
                          for x in self.updates]
        for upm in self.updates:
            upm.fill(0.0)
        self.learning_rule = self.init_learning_rule(self.lrule_init)
        self.bias_rule = None
        if self.brule_init is not None and self.use_biases:
            self.bias_rule = self.init_learning_rule(self.brule_init)
            self.bias_rule.allocate_state([self.updates[-1]])
            self.learning_rule.allocate_state(self.updates[:-1])
        else:
            self.learning_rule.allocate_state(self.updates)
        self.params_initialized = True

    def update(self, epoch):
        if self.bias_rule is None:
            self.learning_rule.apply_rule(self.params, self.updates, epoch)
        else:
            self.learning_rule.apply_rule(self.params[:-1],
                                          self.updates[:-1], epoch)
            self.bias_rule.apply_rule([self.params[-1]],
                                      [self.updates[-1]], epoch)

        if self.accumulate:
            for upm in self.updates:
                upm.fill(0.0)

    def normalize_weights(self, wts):
        norms = self.backend.norm(wts, order=2, axis=1)
        self.backend.divide(wts, norms.reshape((norms.shape[0], 1)), out=wts)

    def set_train_mode(self, mode):
        if self.batch_norm and mode is False:
            self.bn.set_inference_mode()

    def init_learning_rule(self, lrule_init):
        dtype = self.weight_dtype  # TODO: Cool to reuse this here?
        lrname = self.name + '_lr'
        if lrule_init['type'] == 'gradient_descent':
            lr = GradientDescent(name=lrname,
                                 lr_params=lrule_init['lr_params'])
        elif lrule_init['type'] == 'gradient_descent_pretrain':
            lr = GradientDescentPretrain(
                name=lrname, lr_params=lrule_init['lr_params'])
        elif lrule_init['type'] == 'gradient_descent_momentum':
            lr = GradientDescentMomentum(
                name=lrname, lr_params=lrule_init['lr_params'],
                param_dtype=dtype, gradient_dtype=dtype)
        elif lrule_init['type'] == 'gradient_descent_momentum_weight_decay':
            lr = GradientDescentMomentumWeightDecay(
                name=lrname, lr_params=lrule_init['lr_params'],
                param_dtype=dtype, gradient_dtype=dtype)
        elif lrule_init['type'] == 'adadelta':
            lr = AdaDelta(name=lrname, lr_params=lrule_init['lr_params'])
        elif lrule_init['type'] == 'rmsprop':
            lr = RMSProp(name=lrname, lr_params=lrule_init['lr_params'],
                         param_dtype=dtype, gradient_dtype=dtype)
        else:
            raise AttributeError("invalid learning rule params specified")
        lr.initialize(self.backend)
        return lr
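# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the class above): the shape of the
# ``lrule_init`` dict that ``init_learning_rule`` dispatches on. The 'type'
# strings are the ones handled above; the ``lr_params`` keys shown here
# (learning_rate, momentum_params) are assumptions that depend on the chosen
# rule's constructor.
#
# lrule_init = {
#     'type': 'gradient_descent_momentum',
#     'lr_params': {'learning_rate': 0.01,
#                   'momentum_params': {'type': 'constant', 'coef': 0.9}},
# }
# layer.lrule_init = lrule_init  # consumed later by allocate_param_bufs()
# ---------------------------------------------------------------------------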
# Variant of WeightLayer extended for multi-device (data parallel) backends:
# parameter buffers are tagged as replicas or fragments, and a shared mempool
# is used to exchange updates between devices.
class WeightLayer(Layer):

    """
    Typical hidden layer with weight parameters to be learned.
    """

    def __init__(self, **kwargs):
        super(WeightLayer, self).__init__(**kwargs)
        self.distributable = True
        self.has_params = True
        self.params_initialized = False

    def initialize(self, kwargs):
        super(WeightLayer, self).initialize(kwargs)
        req_param(self, ['nin', 'nout'])
        opt_param(self, ['weight_init'], default_weight_init())
        opt_param(self, ['lrule_init'], default_lrule_init())
        opt_param(self, ['accumulate'], False)
        opt_param(self, ['batch_norm'], False)
        opt_param(self, ['mempool'])  # Used for parallel mode

        self.weight_init.initialize(self.backend)
        self.params = []
        self.updates = []

        if self.batch_norm:
            self.bn = BatchNorm()
            kwargs['layer'] = self
            self.bn.initialize(kwargs)

    def get_params(self):
        np_params = dict()
        for p in ['weights', 'biases']:
            if hasattr(self, p):
                p_tensor = getattr(self, p)
                np_params[p] = p_tensor.asnumpyarray()
        if self.batch_norm:
            np_params.update(self.bn.get_params())
        np_params.update(self.learning_rule.get_params())
        if self.bias_rule is not None:
            np_params.update(self.bias_rule.get_params())
        return np_params

    def set_params(self, params_dict):
        for p in ['weights', 'biases']:
            if p in params_dict:
                self.backend.set(getattr(self, p), params_dict[p])
        if self.batch_norm:
            self.bn.set_params(params_dict)
        self.learning_rule.set_params(params_dict)
        if self.bias_rule is not None:
            self.bias_rule.set_params(params_dict)

    def make_views(self):
        pass

    def allocate_param_bufs(self):
        if self.params_initialized:
            return

        def make_ebuf(shape, dtype, persist_values):
            # tag distributed buffers as replicas (local layers) or
            # vertical fragments (partitioned layers)
            b = self.backend.empty(shape, dtype, persist_values)
            if self.backend.is_dist:
                b.ptype = 'replica' if self.is_local else 'vfragment'
            return b

        self.weight_init.is_local = self.is_local
        self.weights = self.weight_init.generate(self.weight_shape,
                                                 self.weight_dtype)
        self.weights.name = self.name  # naming weights for timing diagnostics
        self.weight_updates = make_ebuf(self.weight_shape,
                                        dtype=self.updates_dtype,
                                        persist_values=True)
        self.make_views()
        self.use_biases = 'bias_init' in self.weight_init.__dict__
        opt_param(self, ['brule_init'], None)
        if self.use_biases is True:
            self.biases = make_ebuf(self.bias_shape,
                                    dtype=self.weight_dtype,
                                    persist_values=False)
            self.biases.fill(self.weight_init.bias_init)
            self.bias_updates = make_ebuf(self.bias_shape,
                                          dtype=self.updates_dtype,
                                          persist_values=False)
            self.params.extend([self.weights, self.biases])
            self.updates.extend([self.weight_updates, self.bias_updates])
        else:
            self.params.extend([self.weights])
            self.updates.extend([self.weight_updates])

        if self.accumulate:
            self.utemp = [make_ebuf(x.shape, dtype=self.updates_dtype,
                                    persist_values=False)
                          for x in self.updates]
        for upm in self.updates:
            upm.fill(0.0)
        self.learning_rule = self.init_learning_rule(self.lrule_init)
        self.bias_rule = None
        if self.brule_init is not None and self.use_biases:
            lrn = self.learning_rule.name + 'bias'
            self.bias_rule = self.init_learning_rule(self.brule_init,
                                                     name=lrn)
            self.bias_rule.allocate_state([self.updates[-1]])
            self.learning_rule.allocate_state(self.updates[:-1])
        else:
            self.learning_rule.allocate_state(self.updates)

        if self.backend.is_dist:
            # Create a mempool used for sharing in parallel mode
            self.make_mempool()

        self.params_initialized = True

    def update(self, epoch):
        if self.is_local and self.backend.is_dist:
            self.backend.redsynchronize()
        self.backend.synchronize()
        # for evt, strm in zip(self.update_events, self.backend.strms):
        #     strm.wait_for_event(evt)
        if self.bias_rule is None:
            self.learning_rule.apply_rule(self.params, self.updates, epoch)
        else:
            self.learning_rule.apply_rule(self.params[:-1],
                                          self.updates[:-1], epoch)
            self.bias_rule.apply_rule([self.params[-1]],
                                      [self.updates[-1]], epoch)

        if self.accumulate:
            for upm in self.updates:
                upm.fill(0.0)

    def normalize_weights(self, wts):
        norms = self.backend.norm(wts, order=2, axis=1)
        self.backend.divide(wts, norms.reshape((norms.shape[0], 1)), out=wts)

    def set_train_mode(self, mode):
        if self.batch_norm and mode is False:
            self.bn.set_inference_mode()

    def init_learning_rule(self, lrule_init, name=None):
        dtype = self.weight_dtype  # TODO: Cool to reuse this here?
        lrname = self.name + '_lr' if name is None else name
        if lrule_init['type'] == 'gradient_descent':
            lr = GradientDescent(name=lrname,
                                 lr_params=lrule_init['lr_params'])
        elif lrule_init['type'] == 'gradient_descent_pretrain':
            lr = GradientDescentPretrain(
                name=lrname, lr_params=lrule_init['lr_params'])
        elif lrule_init['type'] == 'gradient_descent_momentum':
            lr = GradientDescentMomentum(
                name=lrname, lr_params=lrule_init['lr_params'],
                param_dtype=dtype, gradient_dtype=dtype)
        elif lrule_init['type'] == 'gradient_descent_momentum_weight_decay':
            lr = GradientDescentMomentumWeightDecay(
                name=lrname, lr_params=lrule_init['lr_params'],
                param_dtype=dtype, gradient_dtype=dtype)
        elif lrule_init['type'] == 'adadelta':
            lr = AdaDelta(name=lrname, lr_params=lrule_init['lr_params'])
        elif lrule_init['type'] == 'rmsprop':
            lr = RMSProp(name=lrname, lr_params=lrule_init['lr_params'],
                         param_dtype=dtype, gradient_dtype=dtype)
        else:
            raise AttributeError("invalid learning rule params specified")
        lr.initialize(self.backend)
        return lr
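# ---------------------------------------------------------------------------
# Minimal numpy sketch of what ``normalize_weights`` computes, assuming
# ``backend.norm``/``backend.divide`` follow numpy semantics: every row of
# the weight matrix is rescaled in place to unit L2 norm.
#
# import numpy as np
# wts = np.random.randn(4, 8)
# norms = np.linalg.norm(wts, ord=2, axis=1)
# wts /= norms.reshape((norms.shape[0], 1))
# assert np.allclose(np.linalg.norm(wts, axis=1), 1.0)
# ---------------------------------------------------------------------------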
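# ---------------------------------------------------------------------------
# Illustrative round trip (assumes ``layer`` is a constructed WeightLayer
# whose buffers have been allocated): ``get_params`` returns host-side numpy
# arrays, so it can back simple checkpoint/restore logic.
#
# saved = layer.get_params()   # numpy arrays for weights, biases, lr state
# ...                          # train further, parameters drift
# layer.set_params(saved)      # restore the earlier parameter state
# ---------------------------------------------------------------------------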