def alloc_params(self):
    # Refer to Ch. 2, pg. 10 of Sutskever's thesis
    hps = self.hps

    # Initial hidden state
    self.params['h0'] = zeros((hps.hidden_size, hps.hidden_layers))

    # Input to hidden; note that if the first layer is recurrent, bih is redundant
    self.params['Wih'] = vp_init((hps.hidden_size, hps.output_size))
    self.params['bih'] = zeros((hps.hidden_size, 1))

    # Recurrent weights
    # NOTE Initialization is important for grad check; don't use vp_init?
    self.params['Whh'] = vp_init((hps.hidden_size, hps.hidden_size))
    self.params['bhh'] = zeros((hps.hidden_size, 1))

    # Weights between hidden layers
    for k in xrange(1, hps.hidden_layers):
        self.params['Wh%d' % k] = vp_init((hps.hidden_size, hps.hidden_size))
        self.params['bh%d' % k] = zeros((hps.hidden_size, 1))

    # Hidden to output
    self.params['Who'] = vp_init((hps.output_size, hps.hidden_size))
    self.params['bho'] = zeros((hps.output_size, 1))

    # Keep the last hidden state around in case we want to resume the RNN from there
    self.last_h = None

    self.count_params()
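# The NOTE above mentions gradient checking. The helper below is a hedged,
# hypothetical sketch (not part of the original code) of the standard
# finite-difference check, assuming a model object exposing a `params` dict of
# 2-D arrays and a `cost_and_grad(data, labels, back=...)` method like the RNN
# further down. Per the NOTE in cost_and_grad, the extra division of the cost
# by T should be removed before comparing against these numerical estimates.
def check_grad_sketch(model, data, labels, eps=1e-4, samples_per_param=5):
    import numpy as np
    _, grads = model.cost_and_grad(data, labels)  # analytic gradients
    report = list()
    for name in model.params:
        W = model.params[name]
        for _ in xrange(samples_per_param):
            i = np.random.randint(W.shape[0])
            j = np.random.randint(W.shape[1])
            orig = W[i, j]
            W[i, j] = orig + eps
            c_plus, _ = model.cost_and_grad(data, labels, back=False)
            W[i, j] = orig - eps
            c_minus, _ = model.cost_and_grad(data, labels, back=False)
            W[i, j] = orig  # restore the perturbed entry
            report.append((name, i, j, grads[name][i, j],
                           (c_plus - c_minus) / (2 * eps)))
    return report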
def alloc_params(self):
    hps = self.hps

    # Input to hidden
    self.params['Wih'] = vp_init((hps.hidden_size, hps.input_size))
    self.params['bih'] = zeros((hps.hidden_size, 1))

    # Hidden to hidden
    for k in xrange(hps.hidden_layers - 1):
        self.params['W%d' % (k + 1)] = vp_init((hps.hidden_size, hps.hidden_size))
        self.params['b%d' % (k + 1)] = zeros((hps.hidden_size, 1))

    # Hidden to output
    self.params['Who'] = vp_init((hps.output_size, hps.hidden_size))
    self.params['bho'] = zeros((hps.output_size, 1))

    self.count_params()
def __init__(self, model, alpha=1e-3, mom=0.95, mom_low=0.5,
             low_mom_iters=100, max_grad=None, rmsprop=False,
             rmsprop_decay=0.99):
    super(MomentumOptimizer, self).__init__(model, alpha)

    # Momentum coefficient
    self.mom = mom
    self.mom_low = mom_low
    self.low_mom_iters = low_mom_iters
    self.max_grad = max_grad
    self.grad_norm = 0.0

    # Velocities
    self.vel = dict()
    if self.mom > 0:
        for p in self.params:
            self.vel[p] = zeros(self.params[p].shape)
        self.updates = self.vel
    else:
        self.vel = self.updates = dict()

    # Keep track of cost and smoothed cost
    self.costs = list()
    self.expcosts = list()

    self.rmsprop = rmsprop
    self.rmsprop_decay = rmsprop_decay
    if rmsprop:
        # Scale gradients by exponentially weighted average of magnitudes
        self.msgrads = dict()
        for p in self.params:
            self.msgrads[p] = None
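# The constructor above only allocates the velocity (vel) and mean-square
# gradient (msgrads) buffers; the actual update step lives elsewhere in the
# class. The function below is a hedged sketch of the classical update such an
# optimizer typically applies, not the original implementation: optional
# RMSProp scaling of the gradient followed by a momentum step. It assumes the
# base class stores the learning rate as `alpha` and the model parameters as
# `params`.
def momentum_update_sketch(opt, grads, eps=1e-8):
    import numpy as np
    for p in grads:
        g = grads[p]
        if opt.rmsprop:
            # Exponentially weighted average of squared gradient magnitudes
            if opt.msgrads[p] is None:
                opt.msgrads[p] = g * g
            else:
                opt.msgrads[p] = (opt.rmsprop_decay * opt.msgrads[p]
                                  + (1 - opt.rmsprop_decay) * g * g)
            g = g / np.sqrt(opt.msgrads[p] + eps)
        # Velocity update, then take a step
        step = -opt.alpha * g
        if opt.mom > 0:
            opt.vel[p] = opt.mom * opt.vel[p] + step
            step = opt.vel[p]
        opt.params[p] += step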
def bprop(self): logger.debug("%s backprop" % str(self)) logger.debug("labels shape: %s" % str(self.labels.shape)) # NOTE Assumes ObjectiveNode has no successors batch_size = self.labels.size self.full_grad = zeros(self.pred.out.shape) for k in range(self.labels.size): self.full_grad[self.labels[k], k] = -1.0 / batch_size * (1 / self.pred.out[self.labels[k], k])
def alloc_params(self):
    rand_init = lambda shape: rand(shape, rand_range)
    # PARAM Following Vaswani et al. EMNLP 2013
    bias_init = lambda shape: zeros(shape) - np.log(self.vocab_size)

    # NOTE IndexedParamNode allocates batch of values indexed from C
    self.C = IndexedParamNode('x = C[:, ks]', self.dset,
                              (embed_size, self.vocab_size), init_fn=rand_init)
    self.H = ParamNode('H', (hidden_size, context_size * embed_size),
                       init_fn=rand_init)
    self.d = ParamNode('d', (hidden_size, 1), init_fn=bias_init)
    self.U = ParamNode('U', (self.vocab_size, hidden_size), init_fn=rand_init)
    self.b = ParamNode('b', (self.vocab_size, 1), init_fn=bias_init)
    self.W = ParamNode('W', (self.vocab_size, context_size * embed_size),
                       init_fn=rand_init)

    self.param_nodes = [self.C, self.H, self.d, self.U, self.b, self.W]
    logger.info('Allocated parameters')
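# The parameter shapes above match the feed-forward neural language model of
# Bengio et al. (2003), which Vaswani et al. (2013) build on: concatenated
# context embeddings x feed a tanh hidden layer plus a direct connection to the
# softmax, y = b + W x + U tanh(d + H x). The function below is a hedged sketch
# of the forward pass these shapes imply, using plain numpy arrays rather than
# the model's actual *Node compute graph; `context_ids` is a hypothetical list
# of context word indices.
def nplm_forward_sketch(C, H, d, U, b, W, context_ids):
    import numpy as np
    # x: stacked embeddings of the context words, shape (context_size * embed_size, 1)
    x = C[:, context_ids].reshape(-1, 1, order='F')
    h = np.tanh(np.dot(H, x) + d)
    y = np.dot(U, h) + np.dot(W, x) + b  # unnormalized scores over the vocab
    y = y - y.max()                      # stabilized softmax
    return np.exp(y) / np.exp(y).sum()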
def alloc_params(self):
    hps = self.hps

    # Input to hidden and source to hidden
    self.params['Wih'] = vp_init((hps.hidden_size, hps.input_size))
    self.params['Wsh'] = vp_init((hps.hidden_size, hps.source_size))
    self.params['bih'] = zeros((hps.hidden_size, 1))

    # Hidden to hidden
    for k in xrange(hps.hidden_layers - 1):
        self.params['W%d' % (k + 1)] = vp_init((hps.hidden_size, hps.hidden_size))
        self.params['b%d' % (k + 1)] = zeros((hps.hidden_size, 1))

    # Hidden to output
    self.params['Who'] = vp_init((hps.output_size, hps.hidden_size))
    self.params['bho'] = zeros((hps.output_size, 1))

    self.count_params()

    # Allocate grads as well
    self.grads = {}
    for k in self.params:
        self.grads[k] = empty(self.params[k].shape)
    logger.info('Allocated gradients')
def _batch_data(batch):
    images = float_tensor(batch[0].float())
    bsize = len(images)
    return m(
        images=images,
        x1=float_tensor(batch[3].float()),
        x2=float_tensor(batch[4].float()),
        id_labels=init(batch[1]),
        pose_labels=init(batch[2]),
        fake_pose_labels=long_tensor(np.random.randint(args.Np, size=bsize)),
        ones=ones(bsize),
        zeros=zeros(bsize),
        noise=float_tensor(np.random.uniform(-1., 1., (bsize, args.Nz))))
def cost_and_grad(self, data, labels, back=True, prev_h0=None):
    hps = self.hps
    T = data.shape[1]
    bsize = data.shape[2]

    # FIXME gnumpy reallocates if we try to reuse the same parameters?
    #us = self.us[:, 0:T, 0:bsize]
    #dus = self.dus[:, 0:T, 0:bsize]
    #hs = self.hs[:, 0:T, 0:bsize]
    #dhs = self.dhs[:, 0:T, 0:bsize]
    #probs = self.probs[:, 0:T, 0:bsize]
    #dprobs = self.dprobs[:, 0:T, 0:bsize]
    #costs = self.costs[0:T, 0:bsize]

    us = list()
    dus = list()
    hs = list()
    dhs = list()
    h0 = list()
    for k in xrange(hps.hidden_layers):
        us.append(list())
        dus.append(list())
        hs.append(list())
        dhs.append(list())
        h0.append(empty((hps.hidden_size, bsize)))
        for t in xrange(T):
            us[k].append(zeros((hps.hidden_size, bsize)))
            dus[k].append(zeros((hps.hidden_size, bsize)))
            hs[k].append(zeros((hps.hidden_size, bsize)))
            dhs[k].append(zeros((hps.hidden_size, bsize)))
    probs = list()
    for t in xrange(T):
        probs.append(zeros((hps.output_size, bsize)))
    costs = np.zeros((T, bsize))

    if prev_h0 is not None:
        h0 = prev_h0
    else:
        for k in xrange(hps.hidden_layers):
            h0[k] = tile(self.params['h0'][:, k].reshape(-1, 1), bsize)

    bih = self.params['bih']
    Wih = self.params['Wih']
    Whh = self.params['Whh']
    bhh = self.params['bhh']
    Who = self.params['Who']
    bho = self.params['bho']

    # Forward prop
    for t in xrange(T):
        for k in xrange(hps.hidden_layers):
            if t == 0:
                hprev = h0[k]
            else:
                hprev = hs[k][t - 1]

            if k == 0:
                us[k][t] = mult(Wih, data[:, t, :]) + bih
            else:
                us[k][t] = mult(self.params['Wh%d' % k], hs[k - 1][t])

            if k == hps.recurrent_layer - 1:
                us[k][t] += mult(Whh, hprev) + bhh
                # Clip maximum activation
                mask = us[k][t] < hps.max_act
                us[k][t] = us[k][t] * mask + hps.max_act * (1 - mask)
            elif k != 0:
                us[k][t] += self.params['bh%d' % k]

            hs[k][t] = self.nl(us[k][t])

        probs[t] = softmax(mult(Who, hs[-1][t]) + bho)

    self.last_h = list()
    for k in xrange(hps.hidden_layers):
        self.last_h.append(hs[k][-1])

    if labels is None:
        return None, probs

    probs_neg_log = list()
    dprobs = list()
    for t in xrange(T):
        probs_neg_log.append(as_np(-1 * log(probs[t])))
        dprobs.append(as_np(probs[t].copy()))
    for k in xrange(bsize):
        for t in xrange(len(labels[k])):
            costs[t, k] = probs_neg_log[t][labels[k][t], k]
            dprobs[t][labels[k][t], k] -= 1
    for t in xrange(T):
        dprobs[t] = array(dprobs[t])

    # NOTE Summing costs over time
    # NOTE FIXME Dividing by T to get a better sense of whether the objective
    # is decreasing; remove for grad checking
    cost = costs.sum() / bsize / float(T)

    if not back:
        return cost, probs

    # Backprop
    for k in self.grads:
        self.grads[k][:] = 0

    for t in reversed(xrange(T)):
        self.grads['bho'] += dprobs[t][:, :].sum(axis=-1).reshape((-1, 1)) / bsize
        self.grads['Who'] += mult(dprobs[t], hs[-1][t].T) / bsize
        for k in reversed(xrange(hps.hidden_layers)):
            if k == hps.hidden_layers - 1:
                dhs[k][t] += mult(Who.T, dprobs[t])
            else:
                dhs[k][t] += mult(self.params['Wh%d' % (k + 1)].T, dhs[k + 1][t])
            dus[k][t] += get_nl_grad(self.hps.nl, us[k][t]) * dhs[k][t]

            if k > 0:
                self.grads['Wh%d' % k] += mult(dus[k][t], hs[k - 1][t].T) / bsize
                self.grads['bh%d' % k] += dus[k][t].sum(axis=-1).reshape((-1, 1)) / bsize

            if k == hps.recurrent_layer - 1:
                if t == 0:
                    hprev = h0[k]
                    self.grads['h0'][:, k] = mult(Whh.T, dus[k][t]).sum(axis=-1) / bsize
                else:
                    hprev = hs[k][t - 1]
                    dhs[k][t - 1] = mult(Whh.T, dus[k][t])
                self.grads['Whh'] += mult(dus[k][t], hprev.T) / bsize
                self.grads['bhh'] += dus[k][t].sum(axis=-1).reshape((-1, 1)) / bsize

        self.grads['Wih'] += mult(dus[0][t], data[:, t, :].T) / bsize
        self.grads['bih'] += dus[0][t].sum(axis=-1).reshape((-1, 1)) / bsize

    return cost, self.grads
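# cost_and_grad stores the final hidden states in self.last_h and accepts them
# back through prev_h0, so state can be carried across consecutive batches of a
# long sequence. Below is a hedged sketch of a training loop using that hook;
# the optimizer's `apply_update` method and the `batches` iterator are
# hypothetical stand-ins, not part of this code.
def train_epoch_sketch(rnn, optimizer, batches, carry_state=True):
    cost = None
    prev_h0 = None
    for data, labels in batches:
        cost, grads = rnn.cost_and_grad(data, labels, prev_h0=prev_h0)
        optimizer.apply_update(grads)  # hypothetical update method
        prev_h0 = rnn.last_h if carry_state else None
    return cost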