def batch_train(self, anc_idx, pos_idx, neg_idx, learn_rate=1e-3):
    """Perform a batch update of all parameters based on the given sets
    of anchor, positive example, and negative example indices.
    """
    # Force incoming LUT indices to the right type (i.e., np.uint32)
    anc_idx = anc_idx.astype(np.uint32)
    pos_idx = pos_idx[:, np.newaxis]
    pn_idx = np.hstack((pos_idx, neg_idx)).astype(np.uint32)
    # Column 0 of pn_sign marks the positive example (+1); all remaining
    # columns mark negative examples (-1).
    pn_sign = -1.0 * np.ones(pn_idx.shape)
    pn_sign[:, 0] = 1.0
    L = np.zeros((1,))
    # Do feedforward and backprop through the predictor/predictee tables
    w2v_ff_bp(anc_idx, pn_idx, pn_sign, self.params['Wa'],
              self.params['Wc'], self.params['b'], self.grads['Wa'],
              self.grads['Wc'], self.grads['b'], L, 1)
    L = L[0]
    # Apply gradients only to the look-up-table rows touched by this batch
    a_mod_idx = np.unique(anc_idx)
    c_mod_idx = np.unique(pn_idx)
    ag_update_2d(a_mod_idx, self.params['Wa'], self.grads['Wa'],
                 self.moms['Wa'], learn_rate)
    ag_update_2d(c_mod_idx, self.params['Wc'], self.grads['Wc'],
                 self.moms['Wc'], learn_rate)
    ag_update_1d(c_mod_idx, self.params['b'], self.grads['b'],
                 self.moms['b'], learn_rate)
    return L
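# --- Reference sketch (assumption): a plain-NumPy version of what the
# w2v_ff_bp kernel above is assumed to compute, i.e. a negative-sampling
# logistic loss over (anchor, positive/negative) pairs, with gradients
# accumulated into the LUT gradient tables. The real kernel is an external
# routine with a slightly different signature (it writes the loss into the
# array L and takes a trailing flag); the name w2v_ff_bp_ref and the loss
# form below are assumptions, not the actual implementation.
import numpy as np

def w2v_ff_bp_ref(anc_idx, pn_idx, pn_sign, Wa, Wc, b, dWa, dWc, db):
    """Sketch: per-pair dot products, logistic loss, and LUT gradients."""
    L = 0.0
    for r in range(anc_idx.shape[0]):
        a = anc_idx[r]
        for c in range(pn_idx.shape[1]):
            k = pn_idx[r, c]
            s = pn_sign[r, c]
            y = np.dot(Wa[a], Wc[k]) + b[k]
            L += np.log(1.0 + np.exp(-s * y))   # logistic loss for this pair
            g = -s / (1.0 + np.exp(s * y))      # dL/dy
            dWa[a] += g * Wc[k]
            dWc[k] += g * Wa[a]
            db[k] += g
    return L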
def apply_grad(self, learn_rate=1e-2):
    """Apply the current accumulated gradients, with adagrad."""
    nz_idx = np.asarray([i for i in self.grad_idx]).astype(np.uint32)
    ag_update_2d(nz_idx, self.params['W'], self.grads['W'],
                 self.moms['W'], learn_rate)
    self.grad_idx = set()
    return
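# --- Reference sketch (assumption): ag_update_2d is an external kernel; the
# loop below is what it is assumed to do, namely a sparse AdaGrad step on
# just the touched rows of a 2D parameter table, using the moms table as the
# running sum of squared gradients. The name ag_update_2d_ref, the epsilon
# value, and the clearing of consumed gradients are assumptions.
import numpy as np

def ag_update_2d_ref(row_idx, W, dW, mW, learn_rate, eps=1e-3):
    """Sketch: AdaGrad update applied row-wise to the rows in row_idx."""
    for i in row_idx:
        mW[i] += dW[i] * dW[i]          # accumulate squared gradients
        W[i] -= learn_rate * (dW[i] / (np.sqrt(mW[i]) + eps))
        dW[i] = 0.0                     # clear the consumed gradients
    return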
def apply_grad(self, learn_rate=1e-2):
    """Apply the current accumulated gradients, with adagrad."""
    # Only update rows whose index refers to a valid key, i.e. indices
    # below self.key_count.
    nz_idx = self.grad_idx[self.grad_idx < self.key_count]
    ag_update_2d(nz_idx, self.params['W'], self.grads['W'],
                 self.moms['W'], learn_rate)
    ag_update_1d(nz_idx, self.params['b'], self.grads['b'],
                 self.moms['b'], learn_rate)
    self.grad_idx = []
    return
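# --- Reference sketch (assumption): the 1D analogue of the row-wise AdaGrad
# update above, matching how ag_update_1d is assumed to treat the bias
# vector b. As with ag_update_2d_ref, the helper name, epsilon, and the
# clearing of consumed gradients are assumptions about the external kernel.
import numpy as np

def ag_update_1d_ref(idx, b, db, mb, learn_rate, eps=1e-3):
    """Sketch: AdaGrad update applied element-wise to the entries in idx."""
    for i in idx:
        mb[i] += db[i] * db[i]
        b[i] -= learn_rate * (db[i] / (np.sqrt(mb[i]) + eps))
        db[i] = 0.0
    return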
def apply_grad(self, learn_rate=1e-2):
    """Apply the current accumulated gradients, with adagrad."""
    nz_idx = np.asarray([i for i in self.grad_idx]).astype(np.uint32)
    # Information from the word LUT should not pass through this layer
    # when source_dim < 5. In this case, we assume that we will do
    # prediction using only the context-adaptive biases.
    if self.do_rescale:
        m_rate = learn_rate if (self.source_dim >= 5) else 0.0
        ag_update_2d(nz_idx, self.params['Wm'], self.grads['Wm'],
                     self.moms['Wm'], m_rate)
    # No context-adaptive bias term should be applied if self.bias_dim
    # is < 5. I.e. only information coming up from the word LUT, and
    # possibly rescaled by this layer, should be used in prediction.
    b_rate = learn_rate if (self.bias_dim >= 5) else 0.0
    ag_update_2d(nz_idx, self.params['Wb'], self.grads['Wb'],
                 self.moms['Wb'], b_rate)
    self.grad_idx = set()
    return
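# --- Illustrative sketch of the dimension gating above: with source_dim < 5
# the rescaling table Wm is effectively frozen (rate 0.0), and with
# bias_dim < 5 the context-adaptive bias table Wb is frozen. The helper name
# gated_rates and the example dimensions are hypothetical.
def gated_rates(learn_rate, source_dim, bias_dim):
    m_rate = learn_rate if (source_dim >= 5) else 0.0
    b_rate = learn_rate if (bias_dim >= 5) else 0.0
    return m_rate, b_rate

# e.g. gated_rates(1e-2, source_dim=3, bias_dim=50) -> (0.0, 0.01): only the
# context-adaptive biases would be trained in that configuration.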