def _setup_functions(self, X_sym, y_sym, X_mask, y_mask, layer_sizes):
    (input_variable, params, sz,
     input_size, hidden_sizes, output_size) = self._stack_layers(
         X_sym, X_mask, layer_sizes)
    output, output_params = build_linear_layer(sz, output_size,
                                               input_variable,
                                               self.random_state)
    params = params + output_params
    # Collapse the first two dimensions so softmax and the cost are applied
    # row-wise
    shp = output.shape
    output = output.reshape([shp[0] * shp[1], shp[2]])
    y_hat_sym = T.nnet.softmax(output)
    y_sym_reshaped = y_sym.reshape([shp[0] * shp[1], shp[2]])
    # Mean categorical cross-entropy over all rows
    cost = -T.mean((y_sym_reshaped * T.log(y_hat_sym)).sum(axis=1))

    grads = T.grad(cost, params)
    self.opt_ = self.optimizer(params)
    updates = self.opt_.updates(params, grads, self.learning_rate,
                                self.momentum)
    self.fit_function = theano.function(
        inputs=[X_sym, y_sym, X_mask, y_mask], outputs=cost, updates=updates,
        on_unused_input="ignore")
    self.loss_function = theano.function(
        inputs=[X_sym, y_sym, X_mask, y_mask], outputs=cost,
        on_unused_input="ignore")
    self.predict_function = theano.function(
        inputs=[X_sym, X_mask], outputs=y_hat_sym,
        on_unused_input="ignore")
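# Illustrative only: a rough sketch of how the compiled functions above might
# be called, assuming (sequence, minibatch, feature) ordered inputs with 2D
# masks. The estimator name `clf` and the array sizes are hypothetical, not
# part of the original module.
#
#     X = np.random.randn(20, 5, 10).astype(theano.config.floatX)
#     y = np.zeros((20, 5, 3)).astype(theano.config.floatX)  # one-hot targets
#     X_mask = np.ones((20, 5)).astype(theano.config.floatX)
#     y_mask = np.ones((20, 5)).astype(theano.config.floatX)
#     train_cost = clf.fit_function(X, y, X_mask, y_mask)
#     probs = clf.predict_function(X, X_mask)  # one softmax row per flattened step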
def _setup_functions(self, X_sym, y_sym, X_mask, y_mask, layer_sizes):
    recurrent_sizes = layer_sizes[:-1]
    input_variable, params = stack_forward_layers(
        X_sym, X_mask, recurrent_sizes, build_recurrent_lstm_layer,
        self.random_state)
    sz = recurrent_sizes[-1]
    mu, mu_params = build_linear_layer(
        sz, self.n_mixture_components * self.n_features, input_variable,
        self.random_state)
    params = params + mu_params
    var, var_params = build_linear_layer(
        sz, self.n_mixture_components * self.n_features, input_variable,
        self.random_state)
    params = params + var_params
    coeff, coeff_params = build_linear_layer(
        sz, self.n_mixture_components, input_variable, self.random_state)
    params = params + coeff_params

    mu_shp = mu.shape
    var_shp = var.shape
    coeff_shp = coeff.shape
    y_shp = y_sym.shape
    # TODO: Masking!
    # Reshape everything to 2D
    coeff = coeff.reshape([coeff_shp[0] * coeff_shp[1], coeff_shp[2]])
    coeff = T.nnet.softmax(coeff)
    y_r = y_sym.reshape([y_shp[0] * y_shp[1], y_shp[2]])
    mu = mu.reshape([mu_shp[0] * mu_shp[1], mu_shp[2]])
    var = var.reshape([var_shp[0] * var_shp[1], var_shp[2]])

    # Reshape using 2D shapes...
    y_r = y_r.dimshuffle(0, 1, 'x')
    mu = mu.reshape([mu.shape[0],
                     T.cast(mu.shape[1] / coeff.shape[-1], 'int32'),
                     coeff.shape[-1]])
    var = var.reshape([var.shape[0],
                       T.cast(var.shape[1] / coeff.shape[-1], 'int32'),
                       coeff.shape[-1]])

    # Calculate GMM cost with minimum tolerance
    log_var = T.log(T.nnet.softplus(var) + 1E-15)
    cost = -0.5 * T.sum(T.sqr(y_r - mu) * T.exp(-log_var) + log_var
                        + T.log(2 * np.pi), axis=1)
    cost = -logsumexp(T.log(coeff) + cost, axis=1).sum()

    grads = T.grad(cost, params)
    self.opt_ = self.optimizer(params)
    updates = self.opt_.updates(params, grads, self.learning_rate,
                                self.momentum)
    self.fit_function = theano.function(
        inputs=[X_sym, y_sym, X_mask, y_mask], outputs=cost, updates=updates,
        on_unused_input="ignore")
    self.loss_function = theano.function(
        inputs=[X_sym, y_sym, X_mask, y_mask], outputs=cost,
        on_unused_input="ignore")
    self.generate_function = theano.function(
        inputs=[X_sym, X_mask], outputs=[mu, log_var, coeff],
        on_unused_input="ignore")
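# logsumexp is used in the mixture costs above and below but is not defined in
# this excerpt. A minimal numerically stable sketch, assuming it reduces over
# the given axis via the usual max-subtraction trick; the real helper in the
# module may differ.
#
#     def logsumexp(x, axis=None):
#         x_max = T.max(x, axis=axis, keepdims=True)
#         z = T.log(T.sum(T.exp(x - x_max), axis=axis, keepdims=True)) + x_max
#         return z.sum(axis=axis)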
def _setup_functions(self, X_sym, y_sym, X_mask, y_mask, layer_sizes):
    recurrent_sizes = layer_sizes[:-1]
    input_variable, params = stack_forward_layers(
        X_sym, X_mask, recurrent_sizes, build_recurrent_lstm_layer,
        self.random_state)
    sz = recurrent_sizes[-1]
    # Hardcoded, works for 3 dims / handwriting *only*!
    # Up/down channel
    binary, binary_params = build_linear_layer(
        sz, 1, input_variable, self.random_state)
    params = params + binary_params
    # Means
    mu, mu_params = build_linear_layer(
        sz, self.n_mixture_components * 2, input_variable, self.random_state)
    params = params + mu_params
    # Diagonal
    var, var_params = build_linear_layer(
        sz, self.n_mixture_components * 2, input_variable, self.random_state)
    params = params + var_params
    # Off-diagonal
    corr, corr_params = build_linear_layer(
        sz, self.n_mixture_components * 1, input_variable, self.random_state)
    params = params + corr_params
    coeff, coeff_params = build_linear_layer(
        sz, self.n_mixture_components, input_variable, self.random_state)
    params = params + coeff_params

    mu_shp = mu.shape
    var_shp = var.shape
    corr_shp = corr.shape
    coeff_shp = coeff.shape
    y_shp = y_sym.shape
    # TODO: Masking!
    # Reshape everything to 2D
    coeff = coeff.reshape([coeff_shp[0] * coeff_shp[1], coeff_shp[2]])
    coeff = T.nnet.softmax(coeff)
    y_r = y_sym.reshape([y_shp[0] * y_shp[1], y_shp[2]])
    y_b = y_r[:, 0]
    y_r = y_r[:, 1:]
    mu = mu.reshape([mu_shp[0] * mu_shp[1], mu_shp[2]])
    var = var.reshape([var_shp[0] * var_shp[1], var_shp[2]])
    corr = corr.reshape([corr_shp[0] * corr_shp[1], corr_shp[2]])
    log_var = T.log(T.nnet.softplus(var) + 1E-9)
    # Negative due to sigmoid? AG paper has positive exponential
    binary = T.nnet.sigmoid(-binary)
    corr = T.tanh(corr)
    binary = binary.ravel()

    # Reshape using 2D shapes...
    y_r = y_r.dimshuffle(0, 1, 'x')
    mu = mu.reshape([mu.shape[0],
                     T.cast(mu.shape[1] / coeff.shape[-1], 'int32'),
                     coeff.shape[-1]])
    log_var = log_var.reshape([log_var.shape[0],
                               T.cast(log_var.shape[1] / coeff.shape[-1],
                                      'int32'),
                               coeff.shape[-1]])
    corr = corr.reshape([corr.shape[0],
                         T.cast(corr.shape[1] / coeff.shape[-1], 'int32'),
                         coeff.shape[-1]])

    # Exact AG cost - see the paper "Generating Sequences with Recurrent
    # Neural Networks", Alex Graves
    # http://arxiv.org/pdf/1308.0850v5.pdf
    x1 = X_sym[:, :, 1]
    x1 = T.addbroadcast(x1, 1)
    x2 = X_sym[:, :, 2]
    x2 = T.addbroadcast(x2, 1)
    mu1 = mu[:, 0, :]
    mu2 = mu[:, 1, :]
    log_var1 = log_var[:, 0, :]
    log_var2 = log_var[:, 1, :]
    # Binary cost
    c_b = -y_b * T.log(binary + 1E-9) - (1 - y_b) * T.log(1 - binary + 1E-9)
    # First part of log Gaussian
    c_g1 = -T.log(2 * np.pi) - log_var1 - log_var2 - .5 * T.log(
        1 - T.sum(corr, axis=1) ** 2 + 1E-9)
    # Multiplier on z
    c_g2 = -.5 * 1. / (1 - T.sum(corr, axis=1) ** 2)
    z = (x1 - mu1) ** 2 / T.exp(log_var1) ** 2
    z += (x2 - mu2) ** 2 / T.exp(log_var2) ** 2
    z -= 2 * T.sum(corr, axis=1) * (x1 - mu1) * (x2 - mu2) / (
        T.exp(log_var1) * T.exp(log_var2))
    cost = c_g1 + c_g2 * z
    cost = T.sum(-logsumexp(T.log(coeff) + cost, axis=1) + c_b)

    grads = T.grad(cost, params)
    self.opt_ = self.optimizer(params)
    updates = self.opt_.updates(params, grads, self.learning_rate,
                                self.momentum)
    self.fit_function = theano.function(
        inputs=[X_sym, y_sym, X_mask, y_mask], outputs=cost, updates=updates,
        on_unused_input="ignore")
    self.loss_function = theano.function(
        inputs=[X_sym, y_sym, X_mask, y_mask], outputs=cost,
        on_unused_input="ignore")
    self.generate_function = theano.function(
        inputs=[X_sym, X_mask],
        outputs=[binary, mu, log_var, corr, coeff],
        on_unused_input="ignore")
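# Illustrative only: a rough NumPy sketch of drawing one (pen, dx, dy) step
# from the outputs of generate_function above. Shapes follow the reshapes in
# this method: binary is (rows,), coeff is (rows, n_components), mu and
# log_var are (rows, 2, n_components), corr is (rows, 1, n_components), and
# exp(log_var) plays the role of the standard deviation in the cost. The
# estimator name `clf` and the sampling procedure itself are assumptions, not
# original code.
#
#     rng = np.random.RandomState(1999)
#     binary, mu, log_var, corr, coeff = clf.generate_function(X, X_mask)
#     t = -1                                         # last generated row
#     k = rng.multinomial(1, coeff[t]).argmax()      # pick a mixture component
#     m = mu[t, :, k]
#     s = np.exp(log_var[t, :, k])                   # per-dimension scale
#     r = corr[t, 0, k]
#     cov = np.array([[s[0] ** 2, r * s[0] * s[1]],
#                     [r * s[0] * s[1], s[1] ** 2]])
#     dx, dy = rng.multivariate_normal(m, cov)
#     pen = rng.binomial(1, binary[t])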