def eval(self, **kwargs):
    want_clear        = kwargs.pop('clear', True)
    want_bprop_inputs = kwargs.pop('want_bprop_inputs', False)
    bprop_inputs_loss = kwargs.pop('bprop_inputs_loss', None)
    self.clear()

    globals.flags.push("want_bprop_inputs", want_bprop_inputs)
    globals.flags.push("bprop_mode", want_bprop_inputs)

    # For each keyword, set the corresponding plug's value.
    for key, val in kwargs.iteritems():
        getattr(self, key).fpval = val

    # Pull the final loss value for this minibatch.
    result = {p.name: p.fpval for p in self.oplugs}

    # If needed, also pull the backprop'd input deltas and include them in the result.
    # (bprop_inputs_loss must be provided whenever this branch runs.)
    if want_bprop_inputs or bprop_inputs_loss:
        # First set up special backprop values: "Z" (the prediction) backpropagates
        # the gradient computed by bprop_inputs_loss; all other output plugs
        # (e.g. costs) backpropagate zero.
        for p in self.oplugs:
            if p.name == "Z":
                bprop_inputs_loss.batchmean = False  # Disable scaling gradient by minibatch size
                bprop_inputs_loss.Z.fpval = result["Z"]
                bprop_inputs_loss.Y.fpval = sm.zeros_like(result["Z"])
                #bprop_inputs_loss.Y.fpval = -1.*sm.ones_like(result["Z"])
                p._bpval = bprop_inputs_loss.Z.bpval
                result['Zmask'] = bprop_inputs_loss.Zmask._fpval
                # Only backprop gradient of target #0, not the other targets.
                if p._bpval.shape[1] > 1:
                    p._bpval[:, 1:] = sm.zeros_like(p._bpval[:, 1:])
                #p._bpval = -result["Z"]
                #p._bpval = -sm.ones_like(result["Z"])
            else:
                p._bpval = sm.zeros((0, 0))

        # Now backpropagate to each input, and store the result.
        if want_bprop_inputs:
            result.update({"d" + p.name: p.bpval for p in self.iplugs if p.name in kwargs})

    globals.flags.pop("want_bprop_inputs")
    globals.flags.pop("bprop_mode")

    # Clear all stored values in the dependency graph, effectively resetting it.
    if want_clear:
        self.clear()
        for key in kwargs.iterkeys():
            getattr(self, key).fpval = plug_null

    return result
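# Usage sketch (illustrative only; not part of the original source). Assuming
# `net` is an instance of this class with an input plug named "X" and an output
# plug named "Z", and `loss_node` is a loss object exposing Z/Y/Zmask plugs as
# used above (all of these names are assumptions), a forward pass plus input
# gradients might be pulled like this:
#
#   out = net.eval(X=sm.asarray(minibatch),      # forward value for plug "X"
#                  want_bprop_inputs=True,       # request "dX" in the result
#                  bprop_inputs_loss=loss_node)  # drives the backprop pass
#   Z, dX = out["Z"], out["dX"]
#
# Passing clear=False would keep the graph's cached plug values alive between
# calls instead of resetting them to plug_null.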
def _train_setup(self, trainable_plugs, cost):
    # For each trainable plug, figure out how many weights it needs.
    sizes   = [np.prod(p.shape) * cost.ninst for p in trainable_plugs]
    offsets = np.asarray(np.cumsum([0] + sizes), np.uint32)

    # Allocate giant contiguous arrays for P, dP, and mP.
    P  = sm.zeros((offsets[-1], 1))
    dP = sm.zeros_like(P)
    mP = sm.zeros_like(P)

    # Per-instance learn rates / momentum rates go here.
    # Giant contiguous arrays map to the same indices as in P, dP, mP.
    drate = sm.zeros_like(P)
    mrate = sm.zeros_like(P)

    trnodes = []

    # For each plug, create a trainable node that is bound to a chunk of our
    # P (parameter) and dP (gradient) vectors, where the node can read and
    # update its own parameters in place.
    for i, tplug in enumerate(trainable_plugs):
        # Grow the actual shape of the trainable parameters, using the
        # axis specified by the trainable plug.
        shape = list(tplug.shape)
        shape[tplug.inst_axis] *= tplug.node.ninst

        # Allocate a new trainable node, and connect it to the plug.
        trnode = trainable(P[offsets[i]:offsets[i+1]].reshape(tuple(shape)),
                           dP[offsets[i]:offsets[i+1]].reshape(tuple(shape)))
        trnode >> tplug
        trnodes.append(trnode)

        # Assign instance-specific learning rates and momentum rates
        # to each corresponding element in the giant drate/mrate vectors.
        if tplug.inst_axis == 0:
            k = np.prod(tplug.shape)
        else:
            k = tplug.shape[1]
        dratevec = drate[offsets[i]:offsets[i+1]]
        mratevec = mrate[offsets[i]:offsets[i+1]]
        _ext.madd_bcast(sm.ones_like(dratevec), self.rate,     k, dratevec)
        _ext.madd_bcast(sm.ones_like(mratevec), self.momentum, k, mratevec)

        # Also initialize elements of P based on the trainable plug's initialization
        # scale, which can be different for each individual instance.
        Pvec = P[offsets[i]:offsets[i+1]]
        initval = tplug.origin().node.init
        if isinstance(initval, np.ndarray) and initval.ndim == 3:
            # Specific initialization of individual filters.
            Pvec[:] = sm.asarray(np.require(np.rollaxis(initval, 1), requirements="C").reshape((-1, 1)))
        else:
            # Random initialization.
            _ext.madd_bcast(sm.randn(Pvec.shape[0], Pvec.shape[1]), initval, k, Pvec)

        if hasattr(tplug.origin().node, 'init_mu'):
            initmu_val = tplug.origin().node.init_mu
            if isinstance(initmu_val, list):
                # Specific initialization of individual bias elements.
                initmu_val = np.tile(initmu_val, tplug.origin().node.ninst)
                Pvec[:] = sm.asarray(initmu_val).reshape(Pvec.shape)
            else:
                _ext.madd_bcast(sm.ones_like(Pvec), tplug.origin().node.init_mu, k, Pvec)  # Add shift

    return (P, dP, mP, drate, mrate, trnodes)
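# Usage sketch (illustrative only): a minimal momentum-SGD step over the
# contiguous buffers returned by _train_setup, assuming smat (`sm`) arrays
# support numpy-style in-place elementwise arithmetic and that a backprop pass
# has already accumulated gradients into dP. The loop below is an assumption
# about how a trainer might consume these buffers, not the repo's actual
# update rule.
#
#   P, dP, mP, drate, mrate, trnodes = self._train_setup(trainable_plugs, cost)
#   for step in range(nsteps):                # nsteps: hypothetical step count
#       # ...forward/backward pass fills dP for the current minibatch...
#       mP[:] = mrate * mP - drate * dP       # per-element momentum update
#       P[:] += mP                            # in-place; trnodes view this memory
#       dP[:] = sm.zeros_like(dP)             # reset gradients for the next step
#
# Keeping P/dP/mP contiguous lets every parameter be updated with a few vector
# ops instead of one update per layer, which is the point of the offsets scheme.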