# NOTE: imports reconstructed from usage.  The project-internal helpers
# (build_instance, destroy_instance, build_score_cache, forward, backward,
# argmax, expectation, LOG, INFO, WARN) are assumed to be provided
# elsewhere in this package.
import random

from numpy import zeros, exp, add, array, sqrt
from numpy.linalg import norm
try:
    from scipy.special import logsumexp
except ImportError:          # very old SciPy
    from scipy.misc import logsumexp


def _gradient_test(w, instance, model, choosen_dims=None):
    '''
    The gradient test, used to check that the gradient is computed
    correctly.  The method is described in "Stochastic Gradient Descent
    Tricks" by L. Bottou:

    1. Pick an example z
    2. Compute the loss Q(z, w)
    3. Compute the gradient g = D_w Q(z, w)
    4. Apply a slight perturbation w' = w + delta
    5. Compute the new loss Q(z, w') and verify Q(z, w') ~= Q(z, w) + g * delta
    '''
    lossQ = _likelihood(w, instance, model)
    DlossQ = _dlikelihood(w, instance, model)

    build_instance(model.attrs, model.tags, instance, True)
    L = len(instance)
    T = len(model.tags)

    if not choosen_dims:
        # Collect the feature indexes fired by this instance and sample
        # five dimensions to perturb.
        U = instance.unigram_features_table
        B = instance.bigram_features_table
        features = []
        for i in range(L):
            for j in range(T):
                if i == 0:
                    features.extend(U[i, j])
                else:
                    for k in range(T):
                        features.extend(U[i, j].tolist())
                        features.extend(B[k, j].tolist())
        choosen_dims = random.sample(features, 5)

    epsilon = 1e-4
    # First-order prediction of the change in loss along the perturbation.
    grad_diff = epsilon * DlossQ[choosen_dims].sum()
    w[choosen_dims] += epsilon
    lossQ2 = _likelihood(w, instance, model)

    if abs(lossQ2 - (lossQ + grad_diff)) > 1e-7:
        LOG(WARN, "Failed gradient test.")
        LOG(WARN, "Perturbation on dims %s." % str(choosen_dims))
        LOG(WARN, "Loss before perturbation: %f" % lossQ)
        LOG(WARN, "Loss after perturbation: %f" % lossQ2)
        LOG(WARN, "Predicted loss difference: %f" % grad_diff)
    else:
        LOG(INFO, "Gradient test passed.")

    # Undo the perturbation.
    w[choosen_dims] -= epsilon
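# Below is a minimal, self-contained sketch of the same finite-difference
# check on a toy quadratic loss.  It is illustrative only and independent
# of the CRF code in this module.
def _toy_gradient_test():
    import numpy as np
    w = np.array([0.5, -1.2, 2.0])
    loss = lambda v: 0.5 * (v ** 2).sum()   # Q(z, w)
    grad = w.copy()                         # D_w Q(z, w) = w
    epsilon = 1e-4
    delta = epsilon * np.ones_like(w)       # the perturbation
    predicted = loss(w) + grad.dot(delta)   # first-order prediction
    actual = loss(w + delta)
    # For a correct gradient the two agree up to O(epsilon ** 2).
    assert abs(actual - predicted) < 1e-6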
def _dlikelihood(w, instance, model):
    '''
    Calculate the gradient of the log-likelihood of an instance

    - param[in] w        The weight vector
    - param[in] instance The instance
    - param[in] model    The model
    '''
    grad = zeros(w.shape[0], dtype=float)
    L = len(instance)
    T = model.nr_tags
    A = model.nr_attrs

    build_instance(model.attrs, model.tags, instance, True)
    g0, g = build_score_cache(w, L, T, A, instance)

    # Add the observed feature counts of the correct path.
    F = instance.correct_features
    for k, v in F.iteritems():
        grad[k] += v

    a = forward(g0, g, L, T)   # forward scores
    b = backward(g, L, T)      # backward scores
    logZ = logsumexp(a[L - 1, :])

    U = instance.unigram_features_table
    B = instance.bigram_features_table

    # Subtract the expected feature counts under the model.  c holds the
    # posterior marginals, clipped into [0, 1] to guard against rounding.
    c = exp(g0 + b[0, :] - logZ).clip(0., 1.)
    for j in xrange(T):
        grad[U[0, j]] -= c[j]

    for i in xrange(1, L):
        c = exp(add.outer(a[i - 1, :], b[i, :]) + g[i, :, :] - logZ).clip(0., 1.)
        # The following is equivalent to:
        #
        #   for j in range(T):
        #       for k in range(T):
        #           grad[U[i, k]] -= c[j, k]
        #           grad[B[j, k]] -= c[j, k]
        for k in range(T):
            grad[U[i, k]] -= c[:, k].sum()
        grad[range(A * T, (A + T) * T)] -= c.flatten()

    return grad
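# A self-contained sketch (illustrative, not used by the model code) that
# re-implements the forward-backward pass on random scores and verifies
# that the pairwise posteriors c used above sum to one.
def _toy_marginal_check():
    import numpy as np
    from scipy.special import logsumexp as lse   # scipy.misc on old SciPy
    L, T = 4, 3
    rng = np.random.RandomState(0)
    g0 = rng.randn(T)          # log scores of the first tag
    g = rng.randn(L, T, T)     # g[i, j, k]: log score of tag j -> k at i
    a = np.zeros((L, T))       # forward scores
    b = np.zeros((L, T))       # backward scores
    a[0] = g0
    for i in range(1, L):
        a[i] = lse(a[i - 1][:, None] + g[i], axis=0)
    for i in range(L - 2, -1, -1):
        b[i] = lse(g[i + 1] + b[i + 1][None, :], axis=1)
    logZ = lse(a[L - 1])
    # The same expression as c in _dlikelihood, at position i = 1.
    c = np.exp(np.add.outer(a[0], b[1]) + g[1] - logZ)
    assert abs(c.sum() - 1.0) < 1e-9   # posteriors over (y_0, y_1) sum to 1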
def l2sgd(model, instances, nr_epoth, init_learning_rate,
          adjust_learning_rate=False):
    '''
    Train the model with stochastic gradient descent.
    '''
    # _sigma, _t, _eta, samples and adjust_learning_rate are reserved
    # but currently unused.
    _sigma = 1.
    _gamma = init_learning_rate
    _t = 1.
    _eta = 0.

    samples = random.sample(instances, min(int(len(instances) * 0.1), 1000))

    for epoth in xrange(nr_epoth):
        LOG(INFO, "Training epoch [%d]" % epoth)
        # Randomly shuffle the training instances.
        random.shuffle(instances)
        # Loop over the training instances.
        for index, instance in enumerate(instances):
            # First clear the cache.
            build_instance(model.attrs, model.tags, instance, True)
            # Gradient step: subtract the expected feature counts under the
            # current model and add the observed counts.
            for k, v in expectation(model, instance).iteritems():
                model.w[k] -= v * _gamma
            for k, v in instance.correct_features.iteritems():
                model.w[k] += v * _gamma
            # Re-calculate the scale; it only depends on the epoch.
            _gamma = init_learning_rate / (1 + sqrt(float(epoth)))
            _t += 1.
            if (index + 1) % 1000 == 0:
                LOG(INFO, "%d instances trained" % (index + 1))
            destroy_instance(instance)
        LOG(INFO, "%d instances trained" % (index + 1))
        LOG(INFO, "Parameters norm %f" % norm(model.w))
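# A minimal, self-contained sketch of the learning-rate schedule used
# above, applied to a one-dimensional least-squares fit.  All names are
# illustrative and nothing here is used by the model code.
def _toy_l2sgd(nr_epoth=20, init_learning_rate=0.1):
    '''Fit y = 2 * x by SGD; the returned weight should be close to 2.'''
    import random
    from math import sqrt
    data = [(1., 2.), (2., 4.), (3., 6.)]
    w = 0.
    gamma = init_learning_rate
    for epoth in range(nr_epoth):
        random.shuffle(data)
        for x, y in data:
            w -= gamma * (w * x - y) * x   # gradient of 0.5 * (w*x - y)**2
        # The same schedule as l2sgd: decay with the square root of the epoch.
        gamma = init_learning_rate / (1 + sqrt(float(epoth)))
    return w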
def viterbi(model, instance):
    '''
    Decode the best tag sequence of an instance with the Viterbi
    algorithm.
    '''
    L = len(instance)
    T = model.nr_tags
    A = model.nr_attrs

    build_instance(model.attrs, model.tags, instance, False)
    g0, g = build_score_cache(model.w, L, T, A, instance)
    destroy_instance(instance)

    s, p = argmax(g0, g, L, T)

    # Follow the back pointers from the best final tag.
    v, i = s[L - 1].argmax(), L - 1
    ret = []
    while i >= 0:
        ret.append(v)
        v = p[i][v]
        i -= 1
    ret.reverse()
    return ret
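# A self-contained sketch (illustrative only) that mirrors the decode loop
# above on random scores and verifies the result against brute-force
# enumeration of all tag paths.
def _toy_viterbi_check():
    import itertools
    import numpy as np
    L, T = 4, 3
    rng = np.random.RandomState(2)
    g0 = rng.randn(T)        # log scores of the first tag
    g = rng.randn(L, T, T)   # g[i, j, k]: log score of tag j -> k at i
    s = np.zeros((L, T))             # best score ending in each tag
    p = np.zeros((L, T), dtype=int)  # back pointers
    s[0] = g0
    for i in range(1, L):
        cand = s[i - 1][:, None] + g[i]   # cand[j, k]: tag j -> tag k
        s[i] = cand.max(axis=0)
        p[i] = cand.argmax(axis=0)
    # Backtrace, as in the while-loop of viterbi above.
    v, i = int(s[L - 1].argmax()), L - 1
    ret = []
    while i >= 0:
        ret.append(v)
        v = int(p[i][v])
        i -= 1
    ret.reverse()
    # Brute force over all T ** L paths.
    score = lambda y: g0[y[0]] + sum(g[i, y[i - 1], y[i]] for i in range(1, L))
    best = max(itertools.product(range(T), repeat=L), key=score)
    assert ret == list(best)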
def _likelihood(w, instance, model):
    '''
    Calculate the log-likelihood of one instance

    - param[in] w        The weight vector
    - param[in] instance The instance
    - param[in] model    The model
    '''
    L = len(instance)
    T = model.nr_tags
    A = model.nr_attrs

    # Fill the correct_features and features_table.
    build_instance(model.attrs, model.tags, instance, True)
    g0, g = build_score_cache(w, L, T, A, instance)

    # Calculate the score of the correct path.
    F = instance.correct_features
    ret = array([w[k] * v for k, v in F.iteritems()]).sum()

    # Calculate the log partition function with the forward algorithm.
    a = forward(g0, g, L, T)
    return ret - logsumexp(a[L - 1, :])
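# A self-contained sketch (illustrative only) checking that the forward
# computation of log Z agrees with brute-force enumeration on a tiny chain.
def _toy_partition_check():
    import itertools
    import numpy as np
    from scipy.special import logsumexp as lse   # scipy.misc on old SciPy
    L, T = 3, 2
    rng = np.random.RandomState(1)
    g0 = rng.randn(T)        # log scores of the first tag
    g = rng.randn(L, T, T)   # g[i, j, k]: log score of tag j -> k at i
    # Forward recursion, as in forward(g0, g, L, T).
    a = np.zeros((L, T))
    a[0] = g0
    for i in range(1, L):
        a[i] = lse(a[i - 1][:, None] + g[i], axis=0)
    logZ = lse(a[L - 1])
    # Brute force: sum the exponentiated score of every tag path.
    Z = sum(np.exp(g0[y[0]] + sum(g[i, y[i - 1], y[i]] for i in range(1, L)))
            for y in itertools.product(range(T), repeat=L))
    assert abs(logZ - np.log(Z)) < 1e-9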