def AccumulateDeriv(self, layer, edge, deriv):
  """Accumulate the derivative w.r.t the outputs of this layer.

  A layer needs to compute derivatives w.r.t its outputs. These outputs may
  have been connected to lots of other nodes through outgoing edges. This
  method adds up the derivatives contributed by each outgoing edge. It gets
  derivatives w.r.t the inputs at the other end of its outgoing edge.
  Args:
    layer: The layer whose derivative is being accumulated.
    edge: The edge which is sending the derivative.
    deriv: The derivative w.r.t the inputs at the other end of this edge.
  """
  if layer.is_input or edge.proto.block_gradient:
    return
  if layer.dirty:  # If some derivatives have already been received.
    layer.deriv.add_dot(edge.params['weight'], deriv)
  else:  # Receiving derivative for the first time.
    cm.dot(edge.params['weight'], deriv, target=layer.deriv)
    layer.dirty = True
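# Hedged sketch (not from the source): the same overwrite-then-accumulate
# pattern in plain NumPy, with `w` standing in for edge.params['weight'] and
# the (deriv_buffer, dirty) pair mirroring (layer.deriv, layer.dirty).
import numpy as np

def accumulate_deriv_numpy(deriv_buffer, dirty, w, incoming_deriv):
  if dirty:
    # Later edges add their contribution, like layer.deriv.add_dot(w, deriv).
    deriv_buffer += w.dot(incoming_deriv)
  else:
    # The first edge overwrites stale contents, like cm.dot(w, deriv, target=...).
    deriv_buffer[...] = w.dot(incoming_deriv)
    dirty = True
  return deriv_buffer, dirty

# Derivatives arriving over two outgoing edges sum up in the buffer.
w1, w2 = np.random.randn(5, 3), np.random.randn(5, 3)
d1, d2 = np.random.randn(3, 8), np.random.randn(3, 8)
buf = np.empty((5, 8))
buf, dirty = accumulate_deriv_numpy(buf, False, w1, d1)
buf, dirty = accumulate_deriv_numpy(buf, dirty, w2, d2)
assert np.allclose(buf, w1.dot(d1) + w2.dot(d2))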
def multiply(self, mat):
  """Matrix-multiply this matrix by `mat`, dispatching on the backend.

  Assumes `numpy as np` and `cudamat as cm` are imported at module level.
  """
  if self.typ == 'numpy':
    return PMAT(np.dot(self.mat, mat.mat))
  elif self.typ == 'cuda':
    return PMAT(cm.dot(self.mat, mat.mat))
  else:
    raise ValueError('Unknown matrix backend: %r' % self.typ)
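# Hedged usage sketch (not in the source): a minimal CPU-only PMAT stub with
# just enough structure to exercise the 'numpy' branch above. The real class
# and its constructor signature are assumptions here, and the 'cuda' branch
# additionally needs cudamat initialised on a GPU.
import numpy as np

class PMAT(object):
  def __init__(self, mat, typ='numpy'):
    self.mat = mat   # the wrapped matrix
    self.typ = typ   # backend tag: 'numpy' or 'cuda'

  def multiply(self, mat):
    # Same dispatch as the excerpt above, restricted to the numpy backend.
    return PMAT(np.dot(self.mat, mat.mat))

a = PMAT(np.random.randn(4, 3))
b = PMAT(np.random.randn(3, 2))
c = a.multiply(b)
assert c.mat.shape == (4, 2)   # shapes compose exactly as with np.dot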
def ComputeUp(self, layer, train=False, step=0, maxsteps=0):
  """Computes the state of `layer', given the state of its incoming neighbours.

  Args:
    layer: Layer whose state is to be computed.
    train: True if this computation is happening during training, False during
      evaluation.
    step: Training step.
    maxsteps: Maximum number of steps that will be taken (needed because some
      hyperparameters may depend on this).
  """
  layer.dirty = False
  perf = None
  if layer.is_input or layer.is_initialized:
    layer.GetData()
  else:
    for i, edge in enumerate(layer.incoming_edge):
      if edge in layer.outgoing_edge:
        continue
      inputs = layer.incoming_neighbour[i].state
      if edge.conv or edge.local:
        if i == 0:
          ConvolveUp(inputs, edge, layer.state)
        else:
          AddConvoleUp(inputs, edge, layer.state)
      else:
        w = edge.params['weight']
        factor = edge.proto.up_factor
        if i == 0:
          # The first edge overwrites the state; later edges accumulate into it.
          cm.dot(w.T, inputs, target=layer.state)
          if factor != 1:
            layer.state.mult(factor)
        else:
          layer.state.add_dot(w.T, inputs, mult=factor)
    b = layer.params['bias']
    if layer.replicated_neighbour is None:
      layer.state.add_col_vec(b)
    else:
      layer.state.add_dot(b, layer.replicated_neighbour.NN)
    layer.ApplyActivation()
    if layer.hyperparams.sparsity:
      layer.state.sum(axis=1, target=layer.dimsize)
      perf = deepnet_pb2.Metrics()
      perf.MergeFrom(layer.proto.performance_stats)
      perf.count = layer.batchsize
      perf.sparsity = layer.dimsize.sum() / layer.dimsize.shape[0]
    if layer.hyperparams.dropout:
      if train and maxsteps - step >= layer.hyperparams.stop_dropout_for_last:
        # Randomly set states to zero.
        if layer.hyperparams.mult_dropout:
          # Multiplicative Gaussian noise with mean 1.
          layer.mask.fill_with_randn()
          layer.mask.add(1)
          layer.state.mult(layer.mask)
        else:
          # Bernoulli mask: keep a unit with probability 1 - dropout_prob.
          layer.mask.fill_with_rand()
          layer.mask.greater_than(layer.hyperparams.dropout_prob)
          if layer.hyperparams.blocksize > 1:
            layer.mask.blockify(layer.hyperparams.blocksize)
          layer.state.mult(layer.mask)
      else:
        # Produce expected output.
        if layer.hyperparams.mult_dropout:
          pass  # Mean-1 noise already has the right expectation.
        else:
          layer.state.mult(1.0 - layer.hyperparams.dropout_prob)
  return perf
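# Hedged sketch (not from the source): the single-edge dense path of
# ComputeUp in plain NumPy, ignoring convolutions, replicated neighbours and
# sparsity stats. A logistic ApplyActivation is assumed for illustration;
# columns are examples, matching the cm.dot(w.T, inputs) layout above.
import numpy as np

def compute_up_numpy(w, b, inputs, train, dropout_prob=0.5, rng=np.random):
  state = w.T.dot(inputs)                 # cm.dot(w.T, inputs, target=state)
  state += b                              # layer.state.add_col_vec(b)
  state = 1.0 / (1.0 + np.exp(-state))    # layer.ApplyActivation()
  if train:
    # Randomly zero states, keeping a unit with probability 1 - dropout_prob.
    state *= rng.rand(*state.shape) > dropout_prob
  else:
    # Produce the expected output at evaluation time.
    state *= 1.0 - dropout_prob
  return state

w = np.random.randn(10, 4)                  # in_dim x out_dim weight
b = np.zeros((4, 1))                        # one bias per output unit
x = np.random.randn(10, 32)                 # batch of 32 column vectors
y = compute_up_numpy(w, b, x, train=False)  # shape (4, 32)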