def get_gradients(self, X, Y, weights=1.0):
    cost = -(weights * self.log_prob(X, Y)).sum()

    params = Selector(self).get_parameters()

    gradients = OrderedDict()
    if isinstance(weights, float):
        # Default (unweighted) case: only X and Y are treated as constants.
        for pname, param in params.iteritems():
            gradients[param] = tensor.grad(cost, param, consider_constant=[X, Y])
    else:
        # Weighted case: the weights are symbolic, so keep them out of the gradient.
        for pname, param in params.iteritems():
            gradients[param] = tensor.grad(cost, param, consider_constant=[X, Y, weights])

    return gradients
def get_gradients(self, X, Y, weights=1.0):
    W_mean, W_ls, b_mean, b_ls = self.parameters

    mean, log_sigma = self.sample_expected(Y)
    sigma = tensor.exp(log_sigma)

    # Per-dimension Gaussian log-density terms (up to an additive constant).
    cost = -log_sigma - 0.5 * (X - mean) ** 2 / tensor.exp(2 * log_sigma)
    if weights != 1.0:
        cost = -weights.dimshuffle(0, "x") * cost

    # Rescale the cost by the per-dimension variance (for the output layer
    # parameters) and by the summed variance (for the MLP parameters) before
    # differentiating, which scales the resulting gradients accordingly.
    cost_scaled = sigma ** 2 * cost
    cost_gscale = (sigma ** 2).sum(axis=1).dimshuffle([0, "x"])
    cost_gscale = cost_gscale * cost

    gradients = OrderedDict()

    params = Selector(self.mlp).get_parameters()
    for pname, param in params.iteritems():
        gradients[param] = tensor.grad(cost_gscale.sum(), param,
                                       consider_constant=[X, Y])

    gradients[W_mean] = tensor.grad(cost_scaled.sum(), W_mean,
                                    consider_constant=[X, Y])
    gradients[b_mean] = tensor.grad(cost_scaled.sum(), b_mean,
                                    consider_constant=[X, Y])

    gradients[W_ls] = tensor.grad(cost_scaled.sum(), W_ls,
                                  consider_constant=[X, Y])
    gradients[b_ls] = tensor.grad(cost_scaled.sum(), b_ls,
                                  consider_constant=[X, Y])

    return gradients
def unify_parameters(self, source_id, dest_id):
    source = self.children[source_id]
    source_name = self.children[source_id].name
    source_prefix = '/' + source_name + '/'
    dest_name = self.children[dest_id].name
    dest_prefix = '/' + self.name + '/' + dest_name + '/'
    source_params = Selector(source).get_parameters()

    replaced = []
    self.unified_parameters = []
    for param, var in source_params.iteritems():
        if not param.startswith(source_prefix):
            continue
        source_param = '/' + self.name + param
        # Strip the source prefix so the name can be matched against the
        # unification patterns and re-rooted under the destination brick.
        param = param[len(source_prefix):]
        for unification in self.parameter_unifications_include:
            if unification.match(param):
                exclude = False
                for ex_unification in self.parameter_unifications_exclude:
                    if ex_unification.match(param):
                        exclude = True
                        break
                if exclude:
                    continue
                self.replace_parameter(dest_prefix + param, var)
                replaced += [dest_prefix + param]
                self.unified_parameters += [source_param]
    self.unified_parameters = self.convert_names_to_bricks(
        set(self.unified_parameters) | set(replaced))
    return replaced
def get_gradients(self, X, Y, weights=1.):
    cost = -(weights * self.log_prob(X, Y)).sum()

    params = Selector(self).get_parameters()

    gradients = OrderedDict()
    if isinstance(weights, float):
        for pname, param in params.iteritems():
            gradients[param] = tensor.grad(cost, param, consider_constant=[X, Y])
    else:
        for pname, param in params.iteritems():
            gradients[param] = tensor.grad(
                cost, param, consider_constant=[X, Y, weights])

    return gradients
def get_gradients(self, features, n_samples):
    """Perform inference and calculate gradients.

    Returns
    -------
    log_px : T.fvector
    log_psx : T.fvector
    gradients : OrderedDict
    """
    p_layers = self.p_layers
    q_layers = self.q_layers
    n_layers = len(p_layers)

    batch_size = features.shape[0]

    x = replicate_batch(features, n_samples)

    # Get Q-samples
    samples, log_p, log_q = self.sample_q(x)

    # Reshape and sum
    samples = unflatten_values(samples, batch_size, n_samples)
    log_p = unflatten_values(log_p, batch_size, n_samples)
    log_q = unflatten_values(log_q, batch_size, n_samples)

    log_p_all = sum(log_p)
    log_q_all = sum(log_q)

    # Approximate log p(x)
    log_px = logsumexp(log_p_all - log_q_all, axis=-1) - tensor.log(n_samples)
    log_psx = (logsumexp((log_p_all - log_q_all) / 2, axis=-1)
               - tensor.log(n_samples)) * 2.

    # Calculate importance-sampling weights
    w = self.importance_weights(log_p, log_q)

    wp = w.reshape((batch_size * n_samples, ))
    wq = w.reshape((batch_size * n_samples, ))
    wq = wq - (1. / n_samples)

    samples = flatten_values(samples, batch_size * n_samples)

    gradients = OrderedDict()
    for l in xrange(n_layers - 1):
        gradients = merge_gradients(
            gradients,
            p_layers[l].get_gradients(samples[l], samples[l + 1], weights=wp))
        gradients = merge_gradients(
            gradients,
            q_layers[l].get_gradients(samples[l + 1], samples[l], weights=wq))
    gradients = merge_gradients(
        gradients,
        p_layers[-1].get_gradients(samples[-1], weights=wp))

    if (self.l1reg > 0.) or (self.l2reg > 0.):
        reg_gradients = OrderedDict()
        params = Selector(self).get_parameters()
        for pname, param in params.iteritems():
            if has_roles(param, (WEIGHT,)):
                reg_cost = (self.l1reg * tensor.sum(abs(param))
                            + self.l2reg * tensor.sum(param ** 2))
                reg_gradients[param] = tensor.grad(reg_cost, param)
        gradients = merge_gradients(gradients, reg_gradients)

    return log_px, log_psx, gradients
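# The helpers `merge_gradients` and `logsumexp` are used above but not defined
# in this snippet. Below is a minimal sketch of plausible implementations,
# assuming `merge_gradients` simply accumulates gradient expressions per
# parameter and `logsumexp` is the numerically stable log-sum-exp reduction;
# the actual implementations may differ.
def merge_gradients(old_gradients, new_gradients):
    # Sum gradient expressions for parameters that appear in both dicts.
    merged = OrderedDict(old_gradients)
    for param, grad in new_gradients.iteritems():
        if param in merged:
            merged[param] = merged[param] + grad
        else:
            merged[param] = grad
    return merged

def logsumexp(A, axis=None):
    # Numerically stable log(sum(exp(A))) along the given axis.
    A_max = tensor.max(A, axis=axis, keepdims=True)
    B = tensor.log(tensor.sum(tensor.exp(A - A_max), axis=axis, keepdims=True)) + A_max
    return B.sum(axis=axis)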
def get_gradients(self, features, n_samples):
    log_p_bound = self.log_likelihood_bound(features, n_samples)

    gradients = OrderedDict()
    params = Selector(self).get_parameters()
    for pname, param in params.iteritems():
        cost = -log_p_bound.mean() + self.l2reg * tensor.sum(param ** 2)
        gradients[param] = tensor.grad(cost, param)

    return log_p_bound, gradients
def get_gradients(self, X, Y, weights=1.):
    W_mean, W_ls, b_mean, b_ls = self.parameters

    mean, log_sigma = self.sample_expected(Y)
    sigma = tensor.exp(log_sigma)

    cost = -log_sigma - 0.5 * (X - mean)**2 / tensor.exp(2 * log_sigma)
    if weights != 1.:
        cost = -weights.dimshuffle(0, 'x') * cost

    cost_scaled = sigma**2 * cost
    cost_gscale = (sigma**2).sum(axis=1).dimshuffle([0, 'x'])
    cost_gscale = cost_gscale * cost

    gradients = OrderedDict()

    params = Selector(self.mlp).get_parameters()
    for pname, param in params.iteritems():
        gradients[param] = tensor.grad(cost_gscale.sum(), param,
                                       consider_constant=[X, Y])

    gradients[W_mean] = tensor.grad(cost_scaled.sum(), W_mean,
                                    consider_constant=[X, Y])
    gradients[b_mean] = tensor.grad(cost_scaled.sum(), b_mean,
                                    consider_constant=[X, Y])

    gradients[W_ls] = tensor.grad(cost_scaled.sum(), W_ls,
                                  consider_constant=[X, Y])
    gradients[b_ls] = tensor.grad(cost_scaled.sum(), b_ls,
                                  consider_constant=[X, Y])

    return gradients
def get_gradients(self, features, n_samples):
    """Perform inference and calculate gradients.

    Returns
    -------
    log_px : T.fvector
    log_psx : T.fvector
    gradients : OrderedDict
    """
    p_layers = self.p_layers
    q_layers = self.q_layers
    n_layers = len(p_layers)

    batch_size = features.shape[0]

    x = replicate_batch(features, n_samples)

    # Get Q-samples
    samples, log_p, log_q = self.sample_q(x)

    # Reshape and sum
    samples = unflatten_values(samples, batch_size, n_samples)
    log_p = unflatten_values(log_p, batch_size, n_samples)
    log_q = unflatten_values(log_q, batch_size, n_samples)

    log_p_all = sum(log_p)
    log_q_all = sum(log_q)

    # Approximate log p(x)
    log_px_bound = log_p_all[:, 0] - log_q_all[:, 0]
    log_px = logsumexp(log_p_all - log_q_all, axis=-1) - tensor.log(n_samples)
    log_psx = (logsumexp((log_p_all - log_q_all) / 2, axis=-1)
               - tensor.log(n_samples)) * 2.

    # Calculate IS weights
    w = self.importance_weights(log_p, log_q)

    wp = w.reshape((batch_size * n_samples, ))
    wq = w.reshape((batch_size * n_samples, ))
    wq = wq - (1. / n_samples)

    samples = flatten_values(samples, batch_size * n_samples)

    gradients = OrderedDict()
    for l in xrange(n_layers - 1):
        gradients = merge_gradients(
            gradients,
            p_layers[l].get_gradients(samples[l], samples[l + 1], weights=wp))
        gradients = merge_gradients(
            gradients,
            q_layers[l].get_gradients(samples[l + 1], samples[l], weights=wq))
    gradients = merge_gradients(
        gradients,
        p_layers[-1].get_gradients(samples[-1], weights=wp))

    if (self.l1reg > 0.) or (self.l2reg > 0.):
        reg_gradients = OrderedDict()
        params = Selector(self).get_parameters()
        for pname, param in params.iteritems():
            if has_roles(param, (WEIGHT,)):
                reg_cost = (self.l1reg * tensor.sum(abs(param))
                            + self.l2reg * tensor.sum(param ** 2))
                reg_gradients[param] = tensor.grad(reg_cost, param)
        gradients = merge_gradients(gradients, reg_gradients)

    self.log_p_bound = log_px_bound
    self.log_p = log_px
    self.log_ph = log_psx

    return log_px, log_psx, gradients
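# `replicate_batch`, `unflatten_values` and `flatten_values` are shape helpers
# that are not defined in this snippet. A minimal sketch follows, under the
# assumption that the samples for each datapoint are stored consecutively along
# the first axis; the actual helpers may handle additional cases.
def replicate_batch(A, repeat):
    # (batch, dim) -> (batch * repeat, dim), repeating each row `repeat` times.
    return tensor.repeat(A, repeat, axis=0)

def unflatten_values(values, batch_size, n_samples):
    # Reshape each (batch*n_samples, ...) tensor to (batch, n_samples, ...).
    return [v.reshape([batch_size, n_samples], ndim=2) if v.ndim == 1
            else v.reshape([batch_size, n_samples, v.shape[1]], ndim=3)
            for v in values]

def flatten_values(values, size):
    # Inverse of unflatten_values: merge the first two axes into one of length `size`.
    return [v.reshape([size], ndim=1) if v.ndim == 2
            else v.reshape([size, v.shape[2]], ndim=2)
            for v in values]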
print 'Parsing dataset file...'
vocab = Vocab(dataset_path=args.dataset_path)

source_sentence = tensor.lmatrix('source')

encoder = BidirectionalEncoder(vocab.sequenceLength(), args.embed, args.nhidden)
encoder.weights_init = IsotropicGaussian(args.weight_scale)
encoder.biases_init = Constant(0)
encoder.push_initialization_config()
encoder.bidir.prototype.weights_init = Orthogonal()
encoder.initialize()

print 'Parameter names: '
enc_param_dict = Selector(encoder).get_params()
for name, value in enc_param_dict.iteritems():
    print ' {:15}: {}'.format(value.get_value().shape, name)

representation = encoder.apply(source_sentence)

print 'Compiling theano function'
f = theano.function([source_sentence], representation)

reps = np.empty(len(vocab.dataset), dtype=object)
bar = Bar('Encoding', max=len(vocab.dataset))
for idx, sentence in enumerate(vocab.dataset):
    reps[idx] = f(sentence).transpose((1, 2, 0))
    bar.next()
bar.finish()