def generation_simple(z_list, n_latent, n_out, y):
    logger.info('generate output without MLP')
    hid_to_out = Linear(name='hidDecoder_to_output', input_dim=n_latent,
                        output_dim=n_out)
    initialize([hid_to_out])
    mysigmoid = Logistic(name='y_hat_vae')
    agg_logpy_xz = 0.
    agg_y_hat = 0.
    for z in z_list:
        lin_out = hid_to_out.apply(z)
        y_hat = mysigmoid.apply(lin_out)  # reconstructed x
        logpy_xz = -cross_entropy_loss(y_hat, y)
        agg_logpy_xz += logpy_xz
        agg_y_hat += y_hat
    agg_logpy_xz /= len(z_list)
    agg_y_hat /= len(z_list)
    return agg_y_hat, agg_logpy_xz
def softmax_layer_old(h, y, hidden_size, num_targets, cost_fn='softmax'):
    hidden_to_output = Linear(name='hidden_to_output', input_dim=hidden_size,
                              output_dim=num_targets)
    initialize([hidden_to_output])
    linear_output = hidden_to_output.apply(h)
    linear_output.name = 'linear_output'
    y_pred = T.argmax(linear_output, axis=1)
    label_of_predicted = debug_print(y[T.arange(y.shape[0]), y_pred],
                                     'label_of_predicted', False)
    pat1 = T.mean(label_of_predicted)
    updates = {}
    if 'softmax' in cost_fn:
        y_hat = Logistic().apply(linear_output)
        y_hat.name = 'y_hat'
        cost = cross_entropy_loss(y_hat, y)
    else:
        cost, updates = ranking_loss(linear_output, y)
    cost.name = 'cost'
    pat1.name = 'precision@1'
    return cost, pat1, updates
def generation(z_list, n_latent, hu_decoder, n_out, y):
    logger.info('in generation: n_latent: %d, hu_decoder: %d', n_latent, hu_decoder)
    if hu_decoder == 0:
        return generation_simple(z_list, n_latent, n_out, y)
    mlp1 = MLP(activations=[Rectifier()], dims=[n_latent, hu_decoder],
               name='latent_to_hidDecoder')
    initialize([mlp1])
    hid_to_out = Linear(name='hidDecoder_to_output', input_dim=hu_decoder,
                        output_dim=n_out)
    initialize([hid_to_out])
    mysigmoid = Logistic(name='y_hat_vae')
    agg_logpy_xz = 0.
    agg_y_hat = 0.
    for i, z in enumerate(z_list):
        y_hat = mysigmoid.apply(hid_to_out.apply(mlp1.apply(z)))  # reconstructed x
        agg_logpy_xz += cross_entropy_loss(y_hat, y)
        agg_y_hat += y_hat
    agg_logpy_xz /= len(z_list)
    agg_y_hat /= len(z_list)
    return agg_y_hat, agg_logpy_xz
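# Usage sketch for generation() above (hypothetical shapes; `sampler`,
# `mu_prior` and `log_sigma_prior` are assumed to come from the surrounding
# VAE code):
#
#   z_list = [sampler(mu_prior, log_sigma_prior) for _ in range(5)]
#   y_hat, logpy_xz = generation(z_list, n_latent=n_latent_z, hu_decoder=400,
#                                n_out=num_targets, y=y)
#
# y_hat is the reconstruction averaged over the z samples and logpy_xz the
# averaged reconstruction loss.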
def softmax_layer(h, y, hidden_size, num_targets, cost_fn='cross'):
    hidden_to_output = Linear(name='hidden_to_output', input_dim=hidden_size,
                              output_dim=num_targets)
    initialize([hidden_to_output])
    linear_output = hidden_to_output.apply(h)
    linear_output.name = 'linear_output'
    y_pred = T.argmax(linear_output, axis=1)
    label_of_predicted = debug_print(y[T.arange(y.shape[0]), y_pred],
                                     'label_of_predicted', False)
    pat1 = T.mean(label_of_predicted)
    updates = None
    if 'ranking' in cost_fn:
        cost, updates = ranking_loss(linear_output, y)
        print 'using ranking loss function!'
    else:
        y_hat = Logistic().apply(linear_output)
        y_hat.name = 'y_hat'
        cost = cross_entropy_loss(y_hat, y)
    cost.name = 'cost'
    pat1.name = 'precision@1'
    misclassify_rate = MultiMisclassificationRate().apply(y, T.ge(linear_output, 0.5))
    misclassify_rate.name = 'error_rate'
    return cost, pat1, updates, misclassify_rate
def __init__(self, dim, activation=None, gate_activation=None, **kwargs):
    self.dim = dim
    if not activation:
        activation = Tanh()
    if not gate_activation:
        gate_activation = Logistic()
    self.activation = activation
    self.gate_activation = gate_activation
    children = [self.activation, self.gate_activation]
    kwargs.setdefault('children', []).extend(children)
    super(LSTMConv, self).__init__(**kwargs)
def __init__(self, dim, activation=None, gate_activation=None, **kwargs):
    self.dim = dim
    if not activation:
        activation = Tanh()
    if not gate_activation:
        gate_activation = Logistic()
    self.activation = activation
    self.gate_activation = gate_activation
    children = ([self.activation, self.gate_activation] +
                kwargs.get('children', []))
    super(LSTM, self).__init__(children=children, **kwargs)
def __init__(self, dim, activation=None, gate_activation=None, **kwargs):
    super(GatedRecurrent, self).__init__(**kwargs)
    self.dim = dim
    if not activation:
        activation = Tanh()
    if not gate_activation:
        gate_activation = Logistic()
    self.activation = activation
    self.gate_activation = gate_activation
    self.children = [activation, gate_activation]
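# Sketch: constructing the brick above without explicit activations falls back
# to Tanh() for the candidate state and Logistic() for the gates, e.g.
# (assuming the class is importable as GatedRecurrent):
#
#   rnn = GatedRecurrent(dim=256)   # activation=Tanh(), gate_activation=Logistic()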
def test_activations():
    x = tensor.vector()
    x_val = numpy.random.rand(8).astype(theano.config.floatX)
    exp_x_val = numpy.exp(x_val)

    assert_allclose(x_val, Identity().apply(x).eval({x: x_val}))
    assert_allclose(numpy.tanh(x_val), Tanh().apply(x).eval({x: x_val}),
                    rtol=1e-06)
    assert_allclose(numpy.log(1 + exp_x_val),
                    Softplus(x).apply(x).eval({x: x_val}), rtol=1e-6)
    assert_allclose(exp_x_val / numpy.sum(exp_x_val),
                    Softmax(x).apply(x).eval({x: x_val}).flatten(), rtol=1e-6)
    assert_allclose(1.0 / (1.0 + numpy.exp(-x_val)),
                    Logistic(x).apply(x).eval({x: x_val}), rtol=1e-6)
def __init__(self, x_dim, hidden_layers, hidden_act, z_dim, **kwargs):
    super(DVAE, self).__init__([], [], **kwargs)

    inits = {
        #'weights_init': IsotropicGaussian(std=0.1),
        'weights_init': RWSInitialization(factor=1.),
        'biases_init': Constant(0.0),
    }

    hidden_act = [hidden_act] * len(hidden_layers)

    q_mlp = BatchNormalizedMLP(hidden_act + [Logistic()],
                               [x_dim] + hidden_layers + [z_dim], **inits)
    #q_mlp = MLP(hidden_act+[Logistic()], [x_dim]+hidden_layers+[z_dim], **inits)
    p_mlp = BatchNormalizedMLP(hidden_act + [Logistic()],
                               [z_dim] + hidden_layers + [x_dim], **inits)
    #p_mlp = MLP(hidden_act+[Logistic()], [z_dim]+hidden_layers+[x_dim], **inits)

    self.q = BernoulliLayer(q_mlp, name="q")
    self.p = BernoulliLayer(p_mlp, name="p")
    self.p_top = BernoulliTopLayer(z_dim, biases_init=Constant(0.0))

    self.children = [self.p_top, self.p, self.q]
def __init__(self, image_dimension, **kwargs):
    layers = []

    #############################################
    # a first block with 2 convolutions of 32 (3, 3) filters
    layers.append(Convolutional((3, 3), 32, border_mode='half'))
    layers.append(Rectifier())
    layers.append(Convolutional((3, 3), 32, border_mode='half'))
    layers.append(Rectifier())

    # maxpool with size=(2, 2)
    layers.append(MaxPooling((2, 2)))

    #############################################
    # a 2nd block with 3 convolutions of 64 (3, 3) filters
    layers.append(Convolutional((3, 3), 64, border_mode='half'))
    layers.append(Rectifier())
    layers.append(Convolutional((3, 3), 64, border_mode='half'))
    layers.append(Rectifier())
    layers.append(Convolutional((3, 3), 64, border_mode='half'))
    layers.append(Rectifier())

    # maxpool with size=(2, 2)
    layers.append(MaxPooling((2, 2)))

    #############################################
    # a 3rd block with 4 convolutions of 128 (3, 3) filters
    layers.append(Convolutional((3, 3), 128, border_mode='half'))
    layers.append(Rectifier())
    layers.append(Convolutional((3, 3), 128, border_mode='half'))
    layers.append(Rectifier())
    layers.append(Convolutional((3, 3), 128, border_mode='half'))
    layers.append(Rectifier())
    layers.append(Convolutional((3, 3), 128, border_mode='half'))
    layers.append(Rectifier())

    # maxpool with size=(2, 2)
    layers.append(MaxPooling((2, 2)))

    self.conv_sequence = ConvolutionalSequence(layers, 3, image_size=image_dimension)

    flattener = Flattener()

    self.top_mlp = BatchNormalizedMLP(activations=[Rectifier(), Logistic()],
                                      dims=[500, 1])

    application_methods = [self.conv_sequence.apply, flattener.apply,
                           self.top_mlp.apply]

    super(VGGNet, self).__init__(application_methods,
                                 biases_init=Constant(0),
                                 weights_init=Uniform(width=.1), **kwargs)
def create_base_model(self, x, y, input_dim):
    # Create the output of the MLP
    mlp = MLP([Tanh(), Tanh(), Logistic()], [input_dim, 100, 100, 1],
              weights_init=IsotropicGaussian(0.001),
              biases_init=Constant(0))
    mlp.initialize()

    probs = mlp.apply(x)
    #sq_err = SquaredError()
    cost = T.mean(T.sqr(y.flatten() - probs.flatten()))
    cost.name = 'cost'
    pred_out = probs > 0.5
    mis_cost = T.mean(T.neq(y.flatten(), pred_out.flatten()))
    mis_cost.name = 'MisclassificationRate'
    return mlp, cost, mis_cost
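# Minimal usage sketch for create_base_model (hypothetical tensors and
# dimensions; assumes the usual Theano imports from the surrounding module):
#
#   x = tensor.matrix('features')
#   y = tensor.matrix('targets')
#   mlp, cost, mis_cost = self.create_base_model(x, y, input_dim=100)
#   f = theano.function([x, y], [cost, mis_cost])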
def __init__(self, dim, activation=None, gate_activation=None, **kwargs):
    self.dim = dim
    self.recurrent_weights_init = None
    self.initial_states_init = None
    if not activation:
        activation = Tanh()
    if not gate_activation:
        gate_activation = Logistic()
    self.activation = activation
    self.gate_activation = gate_activation
    children = [activation, gate_activation] + kwargs.get('children', [])
    super(GatedRecurrent, self).__init__(children=children, **kwargs)
def create_model(self, x, y, input_dim, p):
    # Create the output of the MLP
    mlp = MLP([Tanh(), Tanh(), Logistic()], [input_dim, 200, 100, 1],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()

    probs = mlp.apply(x).sum()

    # Create the if-else cost function
    reward = (probs * y * 1.0) / p + (1 - probs) * (1 - y) * 1.0 / (1 - p)
    cost = -reward  # Negative of reward
    cost.name = "cost"

    return mlp, cost
def __init__(self, **kwargs):
    children = []
    self.list_simple_joints = ([19, 20, 8, 7] + range(16, 19)[::-1] +
                               range(4, 7)[::-1] + [1] + [15, 3, 2] + [0])
    self.simple_joints = {}
    for simple_joint_idx in self.list_simple_joints:
        self.simple_joints[simple_joint_idx] = [
            Convolutional(filter_size=(1, 1), num_filters=512,
                          border_mode=(0, 0), use_bias=True, tied_biases=True,
                          name='fconv_' + str(simple_joint_idx),
                          biases_init=Constant(0.),
                          weights_init=IsotropicGaussian(0.01),
                          num_channels=512),
            Rectifier(name='fconv_relu' + str(simple_joint_idx)),
            Convolutional(filter_size=(1, 1), num_filters=1,
                          border_mode=(0, 0), use_bias=True, tied_biases=True,
                          name='fconv1_' + str(simple_joint_idx),
                          biases_init=Constant(0.),
                          weights_init=IsotropicGaussian(0.01),
                          num_channels=512),
            Logistic(name='flogistic_' + str(simple_joint_idx))]
        children += self.simple_joints[simple_joint_idx]
    kwargs.setdefault('children', []).extend(children)
    super(top_direction_block, self).__init__(**kwargs)
def __init__(self, dim, activation=None, gate_activation=None,
             model_type=6, ogates_zoneout=False, **kwargs):
    self.dim = dim
    self.model_type = model_type
    self.ogates_zoneout = ogates_zoneout
    if not activation:
        activation = Tanh()
    if not gate_activation:
        gate_activation = Logistic()
    self.activation = activation
    self.gate_activation = gate_activation
    children = [self.activation, self.gate_activation]
    kwargs.setdefault('children', []).extend(children)
    super(ZoneoutLSTM, self).__init__(**kwargs)
def __init__(self, mlp, const=1e-5, **kwargs):
    super(PitchGaussianEmitter, self).__init__(**kwargs)
    self.mlp = mlp
    input_dim = self.mlp.output_dim
    self.const = const
    self.mu = MLP(activations=[Identity()], dims=[input_dim, 1],
                  name=self.name + "_mu")
    self.sigma = MLP(activations=[SoftPlus()], dims=[input_dim, 1],
                     name=self.name + "_sigma")
    self.binary = MLP(activations=[Logistic()], dims=[input_dim, 1],
                      name=self.name + "_binary")
    self.children = [self.mlp, self.mu, self.sigma, self.binary]
def build_network(self, num_labels, features, max_len=None, hidden_units=None,
                  l2=None, use_cnn=None, cnn_filter_size=None,
                  cnn_pool_size=None, cnn_num_filters=None,
                  cnn_filter_sizes=None, embedding_size=None, DEBUG=False):
    """Build the neural network used for training.

    :param num_labels: Number of labels to classify
    :param features: the input features we use
    :param max_len: Configured window-size
    :param hidden_units: Number of units in the MLP's hidden layer
    :returns: The cost function, the misclassification rate function,
        the computation graph of the cost function and the prediction
        function
    """
    logger.info('building the network, with one CNN for left and one for right')
    hidden_units = hidden_units or self._config['hidden_units']
    logger.info('#hidden units: %d', hidden_units)

    # building the feature vector from input.
    mlp_in_e1, mlp_in_e2, mlp_in_dim = self.build_feature_vector_noMention(features)
    logger.info('feature vector size: %d', mlp_in_dim)

    mlp = MLP(activations=[Rectifier()], dims=[mlp_in_dim, hidden_units],
              seed=self.curSeed)
    initialize([mlp])
    before_out_e1 = mlp.apply(mlp_in_e1)
    before_out_e2 = mlp.apply(mlp_in_e2)
    hidden_to_output = Linear(name='hidden_to_output', input_dim=hidden_units,
                              output_dim=num_labels)
    initialize([hidden_to_output])
    linear_output_e1 = hidden_to_output.apply(before_out_e1)
    linear_output_e2 = hidden_to_output.apply(before_out_e2)
    linear_output_e1.name = 'linear_output_e1'
    linear_output_e2.name = 'linear_output_e2'

    y_hat_e1 = Logistic(name='logistic1').apply(linear_output_e1)
    y_hat_e2 = Logistic(name='logistic2').apply(linear_output_e2)
    y_hat_e1.name = 'y_hat_e1'
    y_hat_e2.name = 'y_hat_e2'
    y_hat_e1 = debug_print(y_hat_e1, 'y_1', DEBUG)
    return y_hat_e1, y_hat_e2, before_out_e1, before_out_e2
def create_model(self, x, y, input_dim, p):
    # Create the output of the MLP
    mlp = MLP([Rectifier(), Rectifier(), Logistic()], [input_dim, 150, 100, 1],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()

    probs = 1 - mlp.apply(x)
    y = y.dimshuffle(0, 'x')

    # Create the if-else cost function
    pos_ex = (y * probs) / p
    neg_ex = (1 - y) * (1 - probs) / np.float32(1 - p)
    reward = pos_ex + neg_ex
    cost = reward  # probs is already inverted above, so no negation here
    cost.name = "cost"

    return mlp, cost
def create_model(self):
    x = self.x
    y = self.y
    input_dim = self.input_dim
    p = self.p
    mlp = MLP([Rectifier(), Rectifier(), Logistic()], [input_dim, 100, 80, 1],
              weights_init=IsotropicGaussian(0.001),
              biases_init=Constant(0))
    mlp.initialize()
    self.mlp = mlp

    probs = mlp.apply(x)
    probs.name = "score"
    y = y.dimshuffle(0, 'x')

    # Create the if-else cost function
    pos_ex = (y * probs) / p
    neg_ex = (1 - y) * (1 - probs) / np.float32(1 - p)
    reward = pos_ex + neg_ex
    cost = reward
    cost.name = "cost"

    return cost, probs
def __init__(self, dim, model_type, update_prob, activation=None,
             gate_activation=None, **kwargs):
    self.dim = dim
    self.model_type = model_type
    self.update_prob = update_prob
    if not activation:
        activation = Tanh()
    if not gate_activation:
        gate_activation = Logistic()
    self.activation = activation
    self.gate_activation = gate_activation
    children = [self.activation, self.gate_activation]
    kwargs.setdefault('children', []).extend(children)
    super(DropLSTM, self).__init__(**kwargs)
def __init__(self, x_dim, hidden_layers, hidden_act, z_dim, batch_norm=False, **kwargs):
    super(VAE, self).__init__([], [], **kwargs)

    inits = {
        'weights_init': IsotropicGaussian(std=0.1),
        #'weights_init': RWSInitialization(factor=1.),
        'biases_init': Constant(0.0),
    }

    hidden_act = [hidden_act] * len(hidden_layers)

    assert batch_norm == False

    q_mlp = MLP(hidden_act, [x_dim] + hidden_layers, **inits)
    p_mlp = MLP(hidden_act + [Logistic()], [z_dim] + hidden_layers + [x_dim], **inits)

    self.q = GaussianLayer(z_dim, q_mlp, **inits)
    self.p = BernoulliLayer(p_mlp, **inits)

    self.prior_log_sigma = numpy.zeros(z_dim)  #
    self.prior_mu = numpy.zeros(z_dim)  #

    self.children = [self.p, self.q]
def test_activations():
    x = tensor.vector()
    x_val = numpy.random.rand(8).astype(theano.config.floatX)
    exp_x_val = numpy.exp(x_val)

    assert_allclose(x_val, Identity().apply(x).eval({x: x_val}))
    assert_allclose(numpy.tanh(x_val), Tanh().apply(x).eval({x: x_val}),
                    rtol=1e-06)
    assert_allclose(numpy.log(1 + exp_x_val),
                    Softplus(x).apply(x).eval({x: x_val}), rtol=1e-6)
    assert_allclose(exp_x_val / numpy.sum(exp_x_val),
                    Softmax(x).apply(x).eval({x: x_val}).flatten(), rtol=1e-6)
    assert_allclose(1.0 / (1.0 + numpy.exp(-x_val)),
                    Logistic(x).apply(x).eval({x: x_val}), rtol=1e-6)

    leaky_out_1 = x_val - 0.5
    leaky_out_1[leaky_out_1 < 0] *= 0.01
    assert_allclose(leaky_out_1,
                    LeakyRectifier().apply(x).eval({x: x_val - 0.5}))
    leaky_out_2 = x_val - 0.5
    leaky_out_2[leaky_out_2 < 0] *= 0.05
    assert_allclose(leaky_out_2,
                    LeakyRectifier(leak=0.05).apply(x).eval({x: x_val - 0.5}))
def __init__(self, dim, attended_dim, activation=None, gate_activation=None,
             **kwargs):
    super(GRU, self).__init__(**kwargs)
    self.dim = dim
    self.attended_dim = attended_dim

    if not activation:
        activation = Tanh()
    if not gate_activation:
        gate_activation = Logistic()
    self.activation = activation
    self.gate_activation = gate_activation

    self.initial_transformer = MLP(activations=[Tanh()],
                                   dims=[attended_dim, self.dim],
                                   name='state_initializer')

    self.children = [activation, gate_activation, self.initial_transformer]
def __init__(self, embedding_dim, state_dim, **kwargs):
    super(BidirectionalEncoderSigmoid, self).__init__(**kwargs)
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim

    curSeed = 1791095845
    self.rng = numpy.random.RandomState(curSeed)

    self.bidir = BidirectionalWMT15(
        GatedRecurrentWithZerosAtMask(activation=Logistic(), dim=state_dim))
    self.fwd_fork = Fork(
        [name for name in self.bidir.prototype.apply.sequences
         if name != 'mask'], prototype=Linear(), name='fwd_fork')
    self.back_fork = Fork(
        [name for name in self.bidir.prototype.apply.sequences
         if name != 'mask'], prototype=Linear(), name='back_fork')

    #self.children = [self.lookup, self.bidir,
    self.children = [self.bidir, self.fwd_fork, self.back_fork]

    self._push_allocation_config()  # maybe not necessary? (maybe only necessary for decoder)

    print "RNN seed: " + str(self.rng.get_state()[1][0])

    # initialization of parameters
    self.weights_init = IsotropicGaussian()
    self.biases_init = Constant(0)
    self.push_initialization_config()
    self.bidir.prototype.weights_init = Orthogonal()
    self.initialize()
def test_bernoulli_layer():
    # Setup layer
    dim_y = 50
    dim_x = 100

    mlp = MLP([Logistic()], [dim_y, dim_x], **inits)
    l = BernoulliLayer(mlp, name="layer", **inits)
    l.initialize()

    y = tensor.fmatrix('y')
    x_expected = l.sample_expected(y)
    x, x_log_prob = l.sample(y)

    do = theano.function([y], [x_expected, x, x_log_prob],
                         allow_input_downcast=True)

    y = numpy.eye(50, dtype=numpy.float32)
    x_expected, x, x_log_prob = do(y)

    assert x_expected.shape == (50, dim_x)
    assert x.shape == (50, dim_x)
    assert x_log_prob.shape == (50, )
def build_model_new(fea2obj, num_targets, config, kl_weight, entropy_weight, deterministic=False, test=False): hidden_size = config['hidden_units'].split() use_highway = str_to_bool( config['use_highway']) if 'use_highway' in config else False use_gaus = str_to_bool( config['use_gaus']) if 'use_gaus' in config else False use_rec = str_to_bool(config['use_rec']) if 'use_rec' in config else True n_latent_z = int(config['n_latent']) if 'use_gaus' in config else 0 use_noise = str_to_bool( config['use_noise']) if 'use_noise' in config else False use_vae = str_to_bool(config['use_vae']) if 'use_vae' in config else False hu_decoder = int( config['hu_decoder']) if 'hu_decoder' in config else hidden_size logger.info( 'use_gaus: %s, use_rec: %s, use_noise: %s, use_vae: %s, hidden_size: %s, n_latent_z: %d, hu_decoder: %s, hu_encoder: %s', use_gaus, use_rec, use_noise, use_vae, hidden_size, n_latent_z, hu_decoder, hidden_size) init_with_type = str_to_bool( config['init_with_type']) if 'init_with_type' in config else False y = T.matrix('targets', dtype='int32') drop_prob = float(config['dropout']) if 'dropout' in config else 0 #build the feature vector with one model, e.g., with cnn or mean or lstm feature_vec, feature_vec_len = build_feature_vec(fea2obj, config) #drop out if drop_prob > 0: mask = T.cast( srng.binomial(n=1, p=1 - drop_prob, size=feature_vec.shape), 'float32') if test: feature_vec *= (1 - drop_prob) else: feature_vec *= mask #Highway network if use_highway: g_mlp = MLP(activations=[Rectifier()], dims=[feature_vec_len, feature_vec_len], name='g_mlp') t_mlp = MLP(activations=[Logistic()], dims=[feature_vec_len, feature_vec_len], name='t_mlp') initialize([g_mlp, t_mlp]) t = t_mlp.apply(feature_vec) z = t * g_mlp.apply(feature_vec) + (1. - t) * feature_vec feature_vec = z #MLP(s) logger.info('feature vec length = %s and hidden layer units = %s', feature_vec_len, ' '.join(hidden_size)) if len(hidden_size) > 1: #2 MLP on feature fector mlp = MLP( activations=[Rectifier(), Rectifier()], dims=[feature_vec_len, int(hidden_size[0]), int(hidden_size[1])], name='joint_mlp') initialize([mlp]) before_out = mlp.apply(feature_vec) last_hidden_size = int(hidden_size[1]) else: hidden_size = int(hidden_size[0]) mlp = MLP(activations=[Rectifier()], dims=[feature_vec_len, hidden_size], name='joint_mlp') initialize([mlp]) before_out = mlp.apply(feature_vec) last_hidden_size = hidden_size #compute y_hat initial guess hidden_to_output = Linear(name='hidden_to_output', input_dim=last_hidden_size, output_dim=num_targets) typemfile = None if init_with_type: typemfile = config['dsdir'] + '/_typematrix.npy' #typemfile = config['dsdir'] + '/_typeCooccurrMatrix.npy' initialize_lasthid(hidden_to_output, typemfile) # initialize([hidden_to_output]) y_hat_init = Logistic().apply(hidden_to_output.apply(before_out)) y_hat_init.name = 'y_hat_init' y_hat_init = debug_print(y_hat_init, 'yhat_init', False) logpy_xz_init = cross_entropy_loss(y_hat_init, y) logpy_xz = logpy_xz_init y_hat_recog = y_hat_init y_hat = y_hat_init KLD = 0 if use_gaus: if use_vae: logger.info('using VAE') vae_conditional = str_to_bool(config['vae_cond']) if vae_conditional: y_hat, logpy_xz, KLD, y_hat_recog = build_vae_conditoinal( kl_weight, entropy_weight, y_hat_init, feature_vec, feature_vec_len, config, y, test=test, deterministic=deterministic, num_targets=num_targets, n_latent_z=n_latent_z, hidden_size=hidden_size, hu_decoder=hu_decoder) else: y_hat, logpy_xz, KLD = build_vae_basic( kl_weight, feature_vec, feature_vec_len, config, y, test=test, 
deterministic=deterministic, num_targets=num_targets, n_latent_z=n_latent_z, hidden_size=hidden_size, hu_decoder=hu_decoder) y_hat_recog = y_hat else: if use_rec: logger.info('Not using VAE... but using recursion') prior_in = T.concatenate([feature_vec, y_hat_init], axis=1) mu_prior, log_sigma_prior = prior_network( x=prior_in, n_input=feature_vec_len + num_targets, hu_encoder=hidden_size, n_latent=n_latent_z) z_prior = sampler(mu_prior, log_sigma_prior, deterministic=deterministic, use_noise=use_noise) zl = [T.concatenate([z_prior, feature_vec], axis=1)] y_hat, logpy_xz = generation(zl, n_latent=n_latent_z + feature_vec_len, hu_decoder=hu_decoder, n_out=num_targets, y=y) y_hat = (y_hat + y_hat_init) / 2. logpy_xz = (logpy_xz + logpy_xz_init) / 2. else: prior_in = T.concatenate([feature_vec], axis=1) mu_prior, log_sigma_prior = prior_network( x=prior_in, n_input=feature_vec_len, hu_encoder=hidden_size, n_latent=n_latent_z) z_prior = sampler(mu_prior, log_sigma_prior, deterministic=deterministic, use_noise=use_noise) zl = [T.concatenate([z_prior, feature_vec], axis=1)] y_hat, logpy_xz = generation(zl, n_latent=n_latent_z + feature_vec_len, hu_decoder=hu_decoder, n_out=num_targets, y=y) y_hat_recog = y_hat y_hat = debug_print(y_hat, 'y_hat', False) pat1 = T.mean(y[T.arange(y.shape[0]), T.argmax(y_hat, axis=1)]) max_type = debug_print(T.argmax(y_hat_recog, axis=1), 'max_type', False) pat1_recog = T.mean(y[T.arange(y.shape[0]), max_type]) mean_cross = T.mean(logpy_xz) mean_kld = T.mean(KLD) cost = mean_kld + mean_cross cost.name = 'cost' mean_kld.name = 'kld' mean_cross.name = 'cross_entropy_loss' pat1.name = 'p@1' pat1_recog.name = 'p@1_recog' misclassify_rate = MultiMisclassificationRate().apply(y, T.ge(y_hat, 0.5)) misclassify_rate.name = 'error_rate' return cost, pat1, y_hat, mean_kld, mean_cross, pat1_recog, misclassify_rate
def build_model_hard(vocab_size, args, dtype=floatX): logger.info('Building model ...') # Parameters for the model context = args.context state_dim = args.state_dim layers = args.layers skip_connections = args.skip_connections # Symbolic variables # In both cases: Time X Batch x = tensor.lmatrix('features') y = tensor.lmatrix('targets') # Build the model output_names = [] output_dims = [] for d in range(layers): if d > 0: suffix = '_' + str(d) else: suffix = '' if d == 0 or skip_connections: output_names.append("inputs" + suffix) output_dims.append(state_dim) lookup = LookupTable(length=vocab_size, dim=state_dim) lookup.weights_init = initialization.IsotropicGaussian(0.1) lookup.biases_init = initialization.Constant(0) fork = Fork(output_names=output_names, input_dim=args.mini_batch_size, output_dims=output_dims, prototype=FeedforwardSequence([lookup.apply])) transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())] for i in range(layers - 1): mlp = MLP(activations=[Logistic()], dims=[2 * state_dim, 1], weights_init=initialization.IsotropicGaussian(0.1), biases_init=initialization.Constant(0), name="mlp_" + str(i)) transitions.append( HardGatedRecurrent(dim=state_dim, mlp=mlp, activation=Tanh())) rnn = RecurrentStack(transitions, skip_connections=skip_connections) # dim = layers * state_dim output_layer = Linear(input_dim=layers * state_dim, output_dim=vocab_size, name="output_layer") # Return list of 3D Tensor, one for each layer # (Time X Batch X embedding_dim) pre_rnn = fork.apply(x) # Give a name to the input of each layer if skip_connections: for t in range(len(pre_rnn)): pre_rnn[t].name = "pre_rnn_" + str(t) else: pre_rnn.name = "pre_rnn" # Prepare inputs for the RNN kwargs = OrderedDict() init_states = {} for d in range(layers): if d > 0: suffix = '_' + str(d) else: suffix = '' if skip_connections: kwargs['inputs' + suffix] = pre_rnn[d] elif d == 0: kwargs['inputs' + suffix] = pre_rnn init_states[d] = theano.shared(numpy.zeros( (args.mini_batch_size, state_dim)).astype(floatX), name='state0_%d' % d) kwargs['states' + suffix] = init_states[d] # Apply the RNN to the inputs h = rnn.apply(low_memory=True, **kwargs) # Now we have correctly: # h = [state_1, state_2, state_3 ...] # Save all the last states last_states = {} for d in range(layers): last_states[d] = h[d][-1, :, :] # Concatenate all the states if layers > 1: h = tensor.concatenate(h, axis=2) h.name = "hidden_state" # The updates of the hidden states updates = [] for d in range(layers): updates.append((init_states[d], last_states[d])) presoft = output_layer.apply(h[context:, :, :]) # Define the cost # Compute the probability distribution time, batch, feat = presoft.shape presoft.name = 'presoft' cross_entropy = Softmax().categorical_cross_entropy( y[context:, :].flatten(), presoft.reshape((batch * time, feat))) cross_entropy = cross_entropy / tensor.log(2) cross_entropy.name = "cross_entropy" # TODO: add regularisation for the cost # the log(1) is here in order to differentiate the two variables # for monitoring cost = cross_entropy + tensor.log(1) cost.name = "regularized_cost" # Initialize the model logger.info('Initializing...') fork.initialize() rnn.weights_init = initialization.Orthogonal() rnn.biases_init = initialization.Constant(0) rnn.initialize() output_layer.weights_init = initialization.IsotropicGaussian(0.1) output_layer.biases_init = initialization.Constant(0) output_layer.initialize() return cost, cross_entropy, updates
def main(args): """Run experiment. """ lr_tag = float_tag(args.learning_rate) x_dim, train_stream, valid_stream, test_stream = datasets.get_streams( args.data, args.batch_size) #------------------------------------------------------------ # Setup model deterministic_act = Tanh deterministic_size = 1. if args.method == 'vae': sizes_tag = args.layer_spec.replace(",", "-") layer_sizes = [int(i) for i in args.layer_spec.split(",")] layer_sizes, z_dim = layer_sizes[:-1], layer_sizes[-1] name = "%s-%s-%s-lr%s-spl%d-%s" % \ (args.data, args.method, args.name, lr_tag, args.n_samples, sizes_tag) if args.activation == "tanh": hidden_act = Tanh() elif args.activation == "logistic": hidden_act = Logistic() elif args.activation == "relu": hidden_act = Rectifier() else: raise "Unknown hidden nonlinearity %s" % args.hidden_act model = VAE(x_dim=x_dim, hidden_layers=layer_sizes, hidden_act=hidden_act, z_dim=z_dim, batch_norm=args.batch_normalization) model.initialize() elif args.method == 'dvae': sizes_tag = args.layer_spec.replace(",", "-") layer_sizes = [int(i) for i in args.layer_spec.split(",")] layer_sizes, z_dim = layer_sizes[:-1], layer_sizes[-1] name = "%s-%s-%s-lr%s-spl%d-%s" % \ (args.data, args.method, args.name, lr_tag, args.n_samples, sizes_tag) if args.activation == "tanh": hidden_act = Tanh() elif args.activation == "logistic": hidden_act = Logistic() elif args.activation == "relu": hidden_act = Rectifier() else: raise "Unknown hidden nonlinearity %s" % args.hidden_act model = DVAE(x_dim=x_dim, hidden_layers=layer_sizes, hidden_act=hidden_act, z_dim=z_dim, batch_norm=args.batch_normalization) model.initialize() elif args.method == 'rws': sizes_tag = args.layer_spec.replace(",", "-") qbase = "" if not args.no_qbaseline else "noqb-" name = "%s-%s-%s-%slr%s-dl%d-spl%d-%s" % \ (args.data, args.method, args.name, qbase, lr_tag, args.deterministic_layers, args.n_samples, sizes_tag) p_layers, q_layers = create_layers(args.layer_spec, x_dim, args.deterministic_layers, deterministic_act, deterministic_size) model = ReweightedWakeSleep( p_layers, q_layers, qbaseline=(not args.no_qbaseline), ) model.initialize() elif args.method == 'bihm-rws': sizes_tag = args.layer_spec.replace(",", "-") name = "%s-%s-%s-lr%s-dl%d-spl%d-%s" % \ (args.data, args.method, args.name, lr_tag, args.deterministic_layers, args.n_samples, sizes_tag) p_layers, q_layers = create_layers(args.layer_spec, x_dim, args.deterministic_layers, deterministic_act, deterministic_size) model = BiHM( p_layers, q_layers, l1reg=args.l1reg, l2reg=args.l2reg, ) model.initialize() elif args.method == 'continue': import cPickle as pickle from os.path import basename, splitext with open(args.model_file, 'rb') as f: m = pickle.load(f) if isinstance(m, MainLoop): m = m.model model = m.get_top_bricks()[0] while len(model.parents) > 0: model = model.parents[0] assert isinstance(model, (BiHM, ReweightedWakeSleep, VAE)) mname, _, _ = basename(args.model_file).rpartition("_model.pkl") name = "%s-cont-%s-lr%s-spl%s" % (mname, args.name, lr_tag, args.n_samples) else: raise ValueError("Unknown training method '%s'" % args.method) #------------------------------------------------------------ x = tensor.matrix('features') #------------------------------------------------------------ # Testset monitoring train_monitors = [] valid_monitors = [] test_monitors = [] for s in [1, 10, 100, 1000]: log_p, log_ph = model.log_likelihood(x, s) log_p = -log_p.mean() log_ph = -log_ph.mean() log_p.name = "log_p_%d" % s log_ph.name = "log_ph_%d" % s #train_monitors += 
[log_p, log_ph] #valid_monitors += [log_p, log_ph] test_monitors += [log_p, log_ph] #------------------------------------------------------------ # Z estimation #for s in [100000]: # z2 = tensor.exp(model.estimate_log_z2(s)) / s # z2.name = "z2_%d" % s # # valid_monitors += [z2] # test_monitors += [z2] #------------------------------------------------------------ # Gradient and training monitoring if args.method in ['vae', 'dvae']: log_p_bound, gradients = model.get_gradients(x, args.n_samples) log_p_bound = -log_p_bound.mean() log_p_bound.name = "log_p_bound" cost = log_p_bound train_monitors += [ log_p_bound, named(model.kl_term.mean(), 'kl_term'), named(model.recons_term.mean(), 'recons_term') ] valid_monitors += [ log_p_bound, named(model.kl_term.mean(), 'kl_term'), named(model.recons_term.mean(), 'recons_term') ] test_monitors += [ log_p_bound, named(model.kl_term.mean(), 'kl_term'), named(model.recons_term.mean(), 'recons_term') ] else: log_p, log_ph, gradients = model.get_gradients(x, args.n_samples) log_p = -log_p.mean() log_ph = -log_ph.mean() log_p.name = "log_p" log_ph.name = "log_ph" cost = log_ph train_monitors += [log_p, log_ph] valid_monitors += [log_p, log_ph] #------------------------------------------------------------ # Detailed monitoring """ n_layers = len(p_layers) log_px, w, log_p, log_q, samples = model.log_likelihood(x, n_samples) exp_samples = [] for l in xrange(n_layers): e = (w.dimshuffle(0, 1, 'x')*samples[l]).sum(axis=1) e.name = "inference_h%d" % l e.tag.aggregation_scheme = aggregation.TakeLast(e) exp_samples.append(e) s1 = samples[1] sh1 = s1.shape s1_ = s1.reshape([sh1[0]*sh1[1], sh1[2]]) s0, _ = model.p_layers[0].sample_expected(s1_) s0 = s0.reshape([sh1[0], sh1[1], s0.shape[1]]) s0 = (w.dimshuffle(0, 1, 'x')*s0).sum(axis=1) s0.name = "inference_h0^" s0.tag.aggregation_scheme = aggregation.TakeLast(s0) exp_samples.append(s0) # Draw P-samples p_samples, _, _ = model.sample_p(100) #weights = model.importance_weights(samples) #weights = weights / weights.sum() for i, s in enumerate(p_samples): s.name = "psamples_h%d" % i s.tag.aggregation_scheme = aggregation.TakeLast(s) # samples = model.sample(100, oversample=100) for i, s in enumerate(samples): s.name = "samples_h%d" % i s.tag.aggregation_scheme = aggregation.TakeLast(s) """ cg = ComputationGraph([cost]) #------------------------------------------------------------ if args.step_rule == "momentum": step_rule = Momentum(args.learning_rate, 0.95) elif args.step_rule == "rmsprop": step_rule = RMSProp(args.learning_rate) elif args.step_rule == "adam": step_rule = Adam(args.learning_rate) else: raise "Unknown step_rule %s" % args.step_rule #parameters = cg.parameters[:4] + cg.parameters[5:] parameters = cg.parameters algorithm = GradientDescent( cost=cost, parameters=parameters, gradients=gradients, step_rule=CompositeRule([ #StepClipping(25), step_rule, #RemoveNotFinite(1.0), ])) #------------------------------------------------------------ train_monitors += [ aggregation.mean(algorithm.total_gradient_norm), aggregation.mean(algorithm.total_step_norm) ] #------------------------------------------------------------ # Live plotting? plotting_extensions = [] if args.live_plotting: plotting_extensions = [ PlotManager( name, [ Plotter(channels=[[ "valid_%s" % cost.name, "valid_log_p" ], ["train_total_gradient_norm", "train_total_step_norm"]], titles=[ "validation cost", "norm of training gradient and step" ]), DisplayImage( [ WeightDisplay(model.p_layers[0].mlp. 
linear_transformations[0].W, n_weights=100, image_shape=(28, 28)) ] #ImageDataStreamDisplay(test_stream, image_shape=(28,28))] ) ]) ] main_loop = MainLoop( model=Model(cost), data_stream=train_stream, algorithm=algorithm, extensions=[ Timing(), ProgressBar(), TrainingDataMonitoring( train_monitors, prefix="train", after_epoch=True), DataStreamMonitoring( valid_monitors, data_stream=valid_stream, prefix="valid"), DataStreamMonitoring(test_monitors, data_stream=test_stream, prefix="test", after_epoch=False, after_training=True, every_n_epochs=10), #SharedVariableModifier( # algorithm.step_rule.components[0].learning_rate, # half_lr_func, # before_training=False, # after_epoch=False, # after_batch=False, # every_n_epochs=half_lr), TrackTheBest('valid_%s' % cost.name), Checkpoint(name + ".pkl", save_separately=['log', 'model']), FinishIfNoImprovementAfter('valid_%s_best_so_far' % cost.name, epochs=args.patience), FinishAfter(after_n_epochs=args.max_epochs), Printing() ] + plotting_extensions) main_loop.run()
    bn_brick(), LeakyRectifier(leak=LEAK),
    conv_transpose_brick(4, 1, 64), bn_brick(), LeakyRectifier(leak=LEAK),
    conv_transpose_brick(4, 2, 32), bn_brick(), LeakyRectifier(leak=LEAK),
    conv_transpose_brick(5, 1, 32), bn_brick(), LeakyRectifier(leak=LEAK),
    conv_transpose_brick(1, 1, 32), bn_brick(), LeakyRectifier(leak=LEAK),
    conv_brick(1, 1, NUM_CHANNELS), Logistic()]

decoder = Decoder(layers=layers, num_channels=(NLAT + NEMB), image_size=(1, 1),
                  weights_init=WEIGHTS_INIT, biases_init=BIASES_INIT)
decoder.initialize()
decoder_fun = function([z, y], decoder.apply(z, embeddings))
out = decoder_fun(z_hat, test_labels)

# Discriminator
layers = [
    conv_brick(5, 1, 32),
def __init__(self, vocab_size, embedding_dim, state_dim, att_dim, maxout_dim, representation_dim, attention_strategy='content', attention_sources='s', readout_sources='sfa', memory='none', memory_size=500, seq_len=50, init_strategy='last', theano_seed=None, **kwargs): """Creates a new decoder brick without embedding. Args: vocab_size (int): Target language vocabulary size embedding_dim (int): Size of feedback embedding layer state_dim (int): Number of hidden units att_dim (int): Size of attention match vector maxout_dim (int): Size of maxout layer representation_dim (int): Dimension of source annotations attention_strategy (string): Which attention should be used cf. ``_initialize_attention`` attention_sources (string): Defines the sources used by the attention model 's' for decoder states, 'f' for feedback readout_sources (string): Defines the sources used in the readout network. 's' for decoder states, 'f' for feedback, 'a' for attention (context vector) memory (string): Which external memory should be used (cf. ``_initialize_attention``) memory_size (int): Size of the external memory structure seq_len (int): Maximum sentence length init_strategy (string): How to initialize the RNN state (cf. ``GRUInitialState``) theano_seed: Random seed """ super(NoLookupDecoder, self).__init__(**kwargs) self.vocab_size = vocab_size self.embedding_dim = embedding_dim self.state_dim = state_dim self.representation_dim = representation_dim self.theano_seed = theano_seed # Initialize gru with special initial state self.transition = GRUInitialState(attended_dim=state_dim, init_strategy=init_strategy, dim=state_dim, activation=Tanh(), name='decoder') # Initialize the attention mechanism att_dim = att_dim if att_dim > 0 else state_dim self.attention, src_names = _initialize_attention( attention_strategy, seq_len, self.transition, representation_dim, att_dim, attention_sources, readout_sources, memory, memory_size) # Initialize the readout, note that SoftmaxEmitter emits -1 for # initial outputs which is used by LookupFeedBackWMT15 maxout_dim = maxout_dim if maxout_dim > 0 else state_dim readout = Readout( source_names=src_names, readout_dim=embedding_dim, emitter=NoLookupEmitter(initial_output=-1, readout_dim=embedding_dim, cost_brick=SquaredError()), # cost_brick=CategoricalCrossEntropy()), feedback_brick=TrivialFeedback(output_dim=embedding_dim), post_merge=InitializableFeedforwardSequence([ Bias(dim=maxout_dim, name='maxout_bias').apply, Maxout(num_pieces=2, name='maxout').apply, Linear(input_dim=maxout_dim / 2, output_dim=embedding_dim, use_bias=False, name='softmax0').apply, Logistic(name='softmax1').apply ]), merged_dim=maxout_dim) # Build sequence generator accordingly self.sequence_generator = SequenceGenerator( readout=readout, transition=self.transition, attention=self.attention, fork=Fork([ name for name in self.transition.apply.sequences if name != 'mask' ], prototype=Linear())) self.children = [self.sequence_generator]
def test_variable_filter():
    # Creating computation graph
    brick1 = Linear(input_dim=2, output_dim=2, name="linear1")
    brick2 = Bias(2, name="bias1")
    activation = Logistic(name="sigm")

    x = tensor.vector()
    h1 = brick1.apply(x)
    h2 = activation.apply(h1)
    h2.name = "h2act"
    y = brick2.apply(h2)
    cg = ComputationGraph(y)

    parameters = [brick1.W, brick1.b, brick2.parameters[0]]
    bias = [brick1.b, brick2.parameters[0]]
    brick1_bias = [brick1.b]

    # Testing filtering by role
    role_filter = VariableFilter(roles=[PARAMETER])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[FILTER])
    assert [] == role_filter(cg.variables)

    # Testing filtering by role using each_role flag
    role_filter = VariableFilter(roles=[PARAMETER, BIAS])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[PARAMETER, BIAS], each_role=True)
    assert not parameters == role_filter(cg.variables)
    assert bias == role_filter(cg.variables)

    # Testing filtering by bricks classes
    brick_filter = VariableFilter(roles=[BIAS], bricks=[Linear])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by bricks instances
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by brick instance
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by name
    name_filter = VariableFilter(name="W_norm")
    assert [cg.variables[2]] == name_filter(cg.variables)

    # Testing filtering by name regex
    name_filter_regex = VariableFilter(name_regex="W_no.?m")
    assert [cg.variables[2]] == name_filter_regex(cg.variables)

    # Testing filtering by theano name
    theano_name_filter = VariableFilter(theano_name="h2act")
    assert [cg.variables[11]] == theano_name_filter(cg.variables)

    # Testing filtering by theano name regex
    theano_name_filter_regex = VariableFilter(theano_name_regex="h2a.?t")
    assert [cg.variables[11]] == theano_name_filter_regex(cg.variables)

    # Testing filtering by application
    appli_filter = VariableFilter(applications=[brick1.apply])
    variables = [cg.variables[1], cg.variables[8]]
    assert variables == appli_filter(cg.variables)

    # Testing filtering by application
    appli_filter_list = VariableFilter(applications=[brick1.apply])
    assert variables == appli_filter_list(cg.variables)

    input1 = tensor.matrix("input1")
    input2 = tensor.matrix("input2")
    merge = Merge(["input1", "input2"], [5, 6], 2)
    merged = merge.apply(input1, input2)
    merge_cg = ComputationGraph(merged)
    outputs = VariableFilter(roles=[OUTPUT], bricks=[merge])(merge_cg.variables)
    assert merged in outputs
    assert len(outputs) == 3
    outputs_application = VariableFilter(
        roles=[OUTPUT], applications=[merge.apply])(merge_cg.variables)
    assert outputs_application == [merged]
            weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
l4 = Linear(name='l4', input_dim=500, output_dim=500,
            weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
l5 = Linear(name='l5', input_dim=500, output_dim=10,
            weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))

l1.initialize()
l2.initialize()
l3.initialize()
l4.initialize()
l5.initialize()

a1 = l1.apply(x.flatten(2) / 255)
a1.name = 'a1'
n1, M1, S1 = normalize(a1, output_dim=500)
o1 = Logistic().apply(n1)

a2 = l2.apply(o1)
n2, M2, S2 = normalize(a2, output_dim=500)
o2 = Logistic().apply(n2)

a3 = l3.apply(o2)
n3, M3, S3 = normalize(a3, output_dim=500)
o3 = Logistic().apply(n3)

a4 = l4.apply(o3)
n4, M4, S4 = normalize(a4, output_dim=500)
o4 = Logistic().apply(n4)

a5 = l5.apply(o4)
n5, M5, S5 = normalize(a5, output_dim=10)
def create_model_bricks(z_dim, image_size, depth): g_image_size = image_size g_image_size2 = g_image_size / 2 g_image_size3 = g_image_size / 4 g_image_size4 = g_image_size / 8 g_image_size5 = g_image_size / 16 encoder_layers = [] if depth > 0: encoder_layers = encoder_layers + [ Convolutional(filter_size=(3, 3), border_mode=(1, 1), num_filters=32, name='conv1'), SpatialBatchNormalization(name='batch_norm1'), Rectifier(), Convolutional(filter_size=(3, 3), border_mode=(1, 1), num_filters=32, name='conv2'), SpatialBatchNormalization(name='batch_norm2'), Rectifier(), Convolutional( filter_size=(2, 2), step=(2, 2), num_filters=32, name='conv3'), SpatialBatchNormalization(name='batch_norm3'), Rectifier() ] if depth > 1: encoder_layers = encoder_layers + [ Convolutional(filter_size=(3, 3), border_mode=(1, 1), num_filters=64, name='conv4'), SpatialBatchNormalization(name='batch_norm4'), Rectifier(), Convolutional(filter_size=(3, 3), border_mode=(1, 1), num_filters=64, name='conv5'), SpatialBatchNormalization(name='batch_norm5'), Rectifier(), Convolutional( filter_size=(2, 2), step=(2, 2), num_filters=64, name='conv6'), SpatialBatchNormalization(name='batch_norm6'), Rectifier() ] if depth > 2: encoder_layers = encoder_layers + [ Convolutional(filter_size=(3, 3), border_mode=(1, 1), num_filters=128, name='conv7'), SpatialBatchNormalization(name='batch_norm7'), Rectifier(), Convolutional(filter_size=(3, 3), border_mode=(1, 1), num_filters=128, name='conv8'), SpatialBatchNormalization(name='batch_norm8'), Rectifier(), Convolutional( filter_size=(2, 2), step=(2, 2), num_filters=128, name='conv9'), SpatialBatchNormalization(name='batch_norm9'), Rectifier() ] if depth > 3: encoder_layers = encoder_layers + [ Convolutional(filter_size=(3, 3), border_mode=(1, 1), num_filters=256, name='conv10'), SpatialBatchNormalization(name='batch_norm10'), Rectifier(), Convolutional(filter_size=(3, 3), border_mode=(1, 1), num_filters=256, name='conv11'), SpatialBatchNormalization(name='batch_norm11'), Rectifier(), Convolutional(filter_size=(2, 2), step=(2, 2), num_filters=256, name='conv12'), SpatialBatchNormalization(name='batch_norm12'), Rectifier(), ] if depth > 4: encoder_layers = encoder_layers + [ Convolutional(filter_size=(3, 3), border_mode=(1, 1), num_filters=512, name='conv13'), SpatialBatchNormalization(name='batch_norm13'), Rectifier(), Convolutional(filter_size=(3, 3), border_mode=(1, 1), num_filters=512, name='conv14'), SpatialBatchNormalization(name='batch_norm14'), Rectifier(), Convolutional(filter_size=(2, 2), step=(2, 2), num_filters=512, name='conv15'), SpatialBatchNormalization(name='batch_norm15'), Rectifier() ] decoder_layers = [] if depth > 4: decoder_layers = decoder_layers + [ Convolutional(filter_size=(3, 3), border_mode=(1, 1), num_filters=512, name='conv_n3'), SpatialBatchNormalization(name='batch_norm_n3'), Rectifier(), Convolutional(filter_size=(3, 3), border_mode=(1, 1), num_filters=512, name='conv_n2'), SpatialBatchNormalization(name='batch_norm_n2'), Rectifier(), ConvolutionalTranspose( filter_size=(2, 2), step=(2, 2), original_image_size=(g_image_size5, g_image_size5), num_filters=512, name='conv_n1'), SpatialBatchNormalization(name='batch_norm_n1'), Rectifier() ] if depth > 3: decoder_layers = decoder_layers + [ Convolutional(filter_size=(3, 3), border_mode=(1, 1), num_filters=256, name='conv1'), SpatialBatchNormalization(name='batch_norm1'), Rectifier(), Convolutional(filter_size=(3, 3), border_mode=(1, 1), num_filters=256, name='conv2'), SpatialBatchNormalization(name='batch_norm2'), 
Rectifier(), ConvolutionalTranspose( filter_size=(2, 2), step=(2, 2), original_image_size=(g_image_size4, g_image_size4), num_filters=256, name='conv3'), SpatialBatchNormalization(name='batch_norm3'), Rectifier() ] if depth > 2: decoder_layers = decoder_layers + [ Convolutional(filter_size=(3, 3), border_mode=(1, 1), num_filters=128, name='conv4'), SpatialBatchNormalization(name='batch_norm4'), Rectifier(), Convolutional(filter_size=(3, 3), border_mode=(1, 1), num_filters=128, name='conv5'), SpatialBatchNormalization(name='batch_norm5'), Rectifier(), ConvolutionalTranspose( filter_size=(2, 2), step=(2, 2), original_image_size=(g_image_size3, g_image_size3), num_filters=128, name='conv6'), SpatialBatchNormalization(name='batch_norm6'), Rectifier() ] if depth > 1: decoder_layers = decoder_layers + [ Convolutional(filter_size=(3, 3), border_mode=(1, 1), num_filters=64, name='conv7'), SpatialBatchNormalization(name='batch_norm7'), Rectifier(), Convolutional(filter_size=(3, 3), border_mode=(1, 1), num_filters=64, name='conv8'), SpatialBatchNormalization(name='batch_norm8'), Rectifier(), ConvolutionalTranspose( filter_size=(2, 2), step=(2, 2), original_image_size=(g_image_size2, g_image_size2), num_filters=64, name='conv9'), SpatialBatchNormalization(name='batch_norm9'), Rectifier() ] if depth > 0: decoder_layers = decoder_layers + [ Convolutional(filter_size=(3, 3), border_mode=(1, 1), num_filters=32, name='conv10'), SpatialBatchNormalization(name='batch_norm10'), Rectifier(), Convolutional(filter_size=(3, 3), border_mode=(1, 1), num_filters=32, name='conv11'), SpatialBatchNormalization(name='batch_norm11'), Rectifier(), ConvolutionalTranspose( filter_size=(2, 2), step=(2, 2), original_image_size=(g_image_size, g_image_size), num_filters=32, name='conv12'), SpatialBatchNormalization(name='batch_norm12'), Rectifier() ] decoder_layers = decoder_layers + [ Convolutional(filter_size=(1, 1), num_filters=3, name='conv_out'), Logistic() ] print("creating model of depth {} with {} encoder and {} decoder layers". format(depth, len(encoder_layers), len(decoder_layers))) encoder_convnet = ConvolutionalSequence( layers=encoder_layers, num_channels=3, image_size=(g_image_size, g_image_size), use_bias=False, weights_init=IsotropicGaussian(0.033), biases_init=Constant(0), name='encoder_convnet') encoder_convnet.initialize() encoder_filters = numpy.prod(encoder_convnet.get_dim('output')) encoder_mlp = MLP( dims=[encoder_filters, 1000, z_dim], activations=[ Sequence([BatchNormalization(1000).apply, Rectifier().apply], name='activation1'), Identity().apply ], weights_init=IsotropicGaussian(0.033), biases_init=Constant(0), name='encoder_mlp') encoder_mlp.initialize() decoder_mlp = BatchNormalizedMLP( activations=[Rectifier(), Rectifier()], dims=[encoder_mlp.output_dim // 2, 1000, encoder_filters], weights_init=IsotropicGaussian(0.033), biases_init=Constant(0), name='decoder_mlp') decoder_mlp.initialize() decoder_convnet = ConvolutionalSequence( layers=decoder_layers, num_channels=encoder_convnet.get_dim('output')[0], image_size=encoder_convnet.get_dim('output')[1:], use_bias=False, weights_init=IsotropicGaussian(0.033), biases_init=Constant(0), name='decoder_convnet') decoder_convnet.initialize() return encoder_convnet, encoder_mlp, decoder_convnet, decoder_mlp
def test_variable_filter():
    # Creating computation graph
    brick1 = Linear(input_dim=2, output_dim=2, name='linear1')
    brick2 = Bias(2, name='bias1')
    activation = Logistic(name='sigm')

    x = tensor.vector()
    h1 = brick1.apply(x, call_id='brick1_call_id')
    h2 = activation.apply(h1, call_id='act')
    h2.name = "h2act"
    y = brick2.apply(h2)
    cg = ComputationGraph(y)

    parameters = [brick1.W, brick1.b, brick2.parameters[0]]
    bias = [brick1.b, brick2.parameters[0]]
    brick1_bias = [brick1.b]

    # Testing filtering by role
    role_filter = VariableFilter(roles=[PARAMETER])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[FILTER])
    assert [] == role_filter(cg.variables)

    # Testing filtering by role using each_role flag
    role_filter = VariableFilter(roles=[PARAMETER, BIAS])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[PARAMETER, BIAS], each_role=True)
    assert not parameters == role_filter(cg.variables)
    assert bias == role_filter(cg.variables)

    # Testing filtering by bricks classes
    brick_filter = VariableFilter(roles=[BIAS], bricks=[Linear])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by bricks instances
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by brick instance
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by name
    name_filter = VariableFilter(name='W_norm')
    assert [cg.variables[2]] == name_filter(cg.variables)

    # Testing filtering by name regex
    name_filter_regex = VariableFilter(name_regex='W_no.?m')
    assert [cg.variables[2]] == name_filter_regex(cg.variables)

    # Testing filtering by theano name
    theano_name_filter = VariableFilter(theano_name='h2act')
    assert [cg.variables[11]] == theano_name_filter(cg.variables)

    # Testing filtering by theano name regex
    theano_name_filter_regex = VariableFilter(theano_name_regex='h2a.?t')
    assert [cg.variables[11]] == theano_name_filter_regex(cg.variables)

    brick1_apply_variables = [cg.variables[1], cg.variables[8]]

    # Testing filtering by application
    appli_filter = VariableFilter(applications=[brick1.apply])
    assert brick1_apply_variables == appli_filter(cg.variables)

    # Testing filtering by unbound application
    unbound_appli_filter = VariableFilter(applications=[Linear.apply])
    assert brick1_apply_variables == unbound_appli_filter(cg.variables)

    # Testing filtering by call identifier
    call_id_filter = VariableFilter(call_id='brick1_call_id')
    assert brick1_apply_variables == call_id_filter(cg.variables)

    input1 = tensor.matrix('input1')
    input2 = tensor.matrix('input2')
    merge = Merge(['input1', 'input2'], [5, 6], 2)
    merged = merge.apply(input1, input2)
    merge_cg = ComputationGraph(merged)
    outputs = VariableFilter(
        roles=[OUTPUT], bricks=[merge])(merge_cg.variables)
    assert merged in outputs
    assert len(outputs) == 3
    outputs_application = VariableFilter(
        roles=[OUTPUT], applications=[merge.apply])(merge_cg.variables)
    assert outputs_application == [merged]
def create_model_brick(): layers = [ conv_brick(2, 1, 64), bn_brick(), LeakyRectifier(leak=LEAK), conv_brick(7, 2, 128), bn_brick(), LeakyRectifier(leak=LEAK), conv_brick(5, 2, 256), bn_brick(), LeakyRectifier(leak=LEAK), conv_brick(7, 2, 256), bn_brick(), LeakyRectifier(leak=LEAK), conv_brick(4, 1, 512), bn_brick(), LeakyRectifier(leak=LEAK), conv_brick(1, 1, 2 * NLAT) ] encoder_mapping = ConvolutionalSequence(layers=layers, num_channels=NUM_CHANNELS, image_size=IMAGE_SIZE, use_bias=False, name='encoder_mapping') encoder = GaussianConditional(encoder_mapping, name='encoder') layers = [ conv_transpose_brick(4, 1, 512), bn_brick(), LeakyRectifier(leak=LEAK), conv_transpose_brick(7, 2, 256), bn_brick(), LeakyRectifier(leak=LEAK), conv_transpose_brick(5, 2, 256), bn_brick(), LeakyRectifier(leak=LEAK), conv_transpose_brick(7, 2, 128), bn_brick(), LeakyRectifier(leak=LEAK), conv_transpose_brick(2, 1, 64), bn_brick(), LeakyRectifier(leak=LEAK), conv_brick(1, 1, NUM_CHANNELS), Logistic() ] decoder_mapping = ConvolutionalSequence(layers=layers, num_channels=NLAT, image_size=(1, 1), use_bias=False, name='decoder_mapping') decoder = DeterministicConditional(decoder_mapping, name='decoder') layers = [ conv_brick(2, 1, 64), LeakyRectifier(leak=LEAK), conv_brick(7, 2, 128), bn_brick(), LeakyRectifier(leak=LEAK), conv_brick(5, 2, 256), bn_brick(), LeakyRectifier(leak=LEAK), conv_brick(7, 2, 256), bn_brick(), LeakyRectifier(leak=LEAK), conv_brick(4, 1, 512), bn_brick(), LeakyRectifier(leak=LEAK) ] x_discriminator = ConvolutionalSequence(layers=layers, num_channels=NUM_CHANNELS, image_size=IMAGE_SIZE, use_bias=False, name='x_discriminator') x_discriminator.push_allocation_config() layers = [ conv_brick(1, 1, 1024), LeakyRectifier(leak=LEAK), conv_brick(1, 1, 1024), LeakyRectifier(leak=LEAK) ] z_discriminator = ConvolutionalSequence(layers=layers, num_channels=NLAT, image_size=(1, 1), use_bias=False, name='z_discriminator') z_discriminator.push_allocation_config() layers = [ conv_brick(1, 1, 2048), LeakyRectifier(leak=LEAK), conv_brick(1, 1, 2048), LeakyRectifier(leak=LEAK), conv_brick(1, 1, 1) ] joint_discriminator = ConvolutionalSequence( layers=layers, num_channels=(x_discriminator.get_dim('output')[0] + z_discriminator.get_dim('output')[0]), image_size=(1, 1), name='joint_discriminator') discriminator = XZJointDiscriminator(x_discriminator, z_discriminator, joint_discriminator, name='discriminator') ali = ALI(encoder, decoder, discriminator, weights_init=GAUSSIAN_INIT, biases_init=ZERO_INIT, name='ali') ali.push_allocation_config() encoder_mapping.layers[-1].use_bias = True encoder_mapping.layers[-1].tied_biases = False decoder_mapping.layers[-2].use_bias = True decoder_mapping.layers[-2].tied_biases = False x_discriminator.layers[0].use_bias = True x_discriminator.layers[0].tied_biases = True ali.initialize() raw_marginals, = next( create_celeba_data_streams(500, 500)[0].get_epoch_iterator()) b_value = get_log_odds(raw_marginals) decoder_mapping.layers[-2].b.set_value(b_value) return ali