def test_pylearn2_training():
    """Smoke-test training a Blocks MLP autoencoder through the Pylearn2
    wrapper classes (Pylearn2Cost/Pylearn2Model/Pylearn2Train)."""
    # Construct the model
    mlp = MLP(activations=[Sigmoid(), Sigmoid()], dims=[784, 100, 784],
              weights_init=IsotropicGaussian(), biases_init=Constant(0.01))
    mlp.initialize()
    cost = SquaredError()

    # Load the data: random 1024x784 design matrices with 10 classes
    rng = numpy.random.RandomState(14)
    train_dataset = random_dense_design_matrix(rng, 1024, 784, 10)
    valid_dataset = random_dense_design_matrix(rng, 1024, 784, 10)

    x = tensor.matrix('features')
    block_cost = Pylearn2Cost(cost.apply(x, mlp.apply(x)))
    block_model = Pylearn2Model(mlp)

    # Silence Pylearn2's logger
    logger = logging.getLogger(pylearn2.__name__)
    logger.setLevel(logging.ERROR)

    # Training algorithm
    sgd = SGD(learning_rate=0.01, cost=block_cost, batch_size=128,
              monitoring_dataset=valid_dataset)
    train = Pylearn2Train(train_dataset, block_model, algorithm=sgd)
    # time_budget caps wall-clock seconds so the test stays fast
    train.main_loop(time_budget=3)
def apply(self, input_, target):
    """Build an LSTM regression graph and stash key variables in
    self.outputs ('y_hat', 'cost', 'pre_rnn', 'h').

    input_/target: Theano sequence variables; assumes Blocks'
    time-major layout — TODO confirm against callers.
    """
    # Linear projection to 4*dim, matching the Blocks LSTM gate layout
    x_to_h = Linear(name='x_to_h',
                    input_dim=self.dims[0],
                    output_dim=self.dims[1] * 4)
    pre_rnn = x_to_h.apply(input_)
    pre_rnn.name = 'pre_rnn'
    rnn = LSTM(activation=Tanh(),
               dim=self.dims[1], name=self.name)
    h, _ = rnn.apply(pre_rnn)  # second output (cell states) discarded
    h.name = 'h'
    h_to_y = Linear(name='h_to_y',
                    input_dim=self.dims[1],
                    output_dim=self.dims[2])
    y_hat = h_to_y.apply(h)
    y_hat.name = 'y_hat'

    cost = SquaredError().apply(target, y_hat)
    cost.name = 'MSE'

    # Expose intermediates for monitoring/inspection by callers
    self.outputs = {}
    self.outputs['y_hat'] = y_hat
    self.outputs['cost'] = cost
    self.outputs['pre_rnn'] = pre_rnn
    self.outputs['h'] = h

    # Initialization
    for brick in (rnn, x_to_h, h_to_y):
        brick.weights_init = IsotropicGaussian(0.01)
        brick.biases_init = Constant(0)
        brick.initialize()
def main(save_to, num_batches, continue_=False):
    """Train a tiny MLP regressor (numbers -> roots); optionally resume
    from a previous dump at `save_to`. Returns the finished MainLoop."""
    mlp = MLP([Tanh(), Identity()], [1, 10, 1],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0), seed=1)
    mlp.initialize()
    x = tensor.vector('numbers')
    y = tensor.vector('roots')
    # [:, None] lifts the vectors to (n, 1) matrices as the MLP expects
    cost = SquaredError().apply(y[:, None], mlp.apply(x[:, None]))
    cost.name = "cost"
    main_loop = MainLoop(
        GradientDescent(cost=cost,
                        params=ComputationGraph(cost).parameters,
                        step_rule=Scale(learning_rate=0.001)),
        get_data_stream(range(100)),
        model=Model(cost),
        # Resume from the dump only when continue_ is set
        extensions=([LoadFromDump(save_to)] if continue_ else []) + [
            Timing(),
            FinishAfter(after_n_batches=num_batches),
            DataStreamMonitoring(
                [cost], get_data_stream(range(100, 200)),
                prefix="test"),
            TrainingDataMonitoring([cost], after_epoch=True),
            Dump(save_to),
            Printing()
        ])
    main_loop.run()
    return main_loop
def main(save_to, num_batches, continue_=False):
    """Train a tiny MLP regressor (numbers -> roots); optionally resume
    from a previous dump at `save_to`. Returns the finished MainLoop."""
    mlp = MLP([Tanh(), Identity()], [1, 10, 1],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0), seed=1)
    mlp.initialize()
    x = tensor.vector('numbers')
    y = tensor.vector('roots')
    # [:, None] lifts the vectors to (n, 1) matrices as the MLP expects
    cost = SquaredError().apply(y[:, None], mlp.apply(x[:, None]))
    cost.name = "cost"
    main_loop = MainLoop(
        GradientDescent(
            cost=cost, params=ComputationGraph(cost).parameters,
            step_rule=Scale(learning_rate=0.001)),
        get_data_stream(range(100)),
        model=Model(cost),
        # Resume from the dump only when continue_ is set
        extensions=([LoadFromDump(save_to)] if continue_ else []) +
        [Timing(),
         FinishAfter(after_n_batches=num_batches),
         DataStreamMonitoring(
             [cost], get_data_stream(range(100, 200)),
             prefix="test"),
         TrainingDataMonitoring([cost], after_epoch=True),
         Dump(save_to),
         Printing()])
    main_loop.run()
    return main_loop
def test_square():
    """Compile SquaredError over two 3D tensors and print its value for
    an all-ones vs all-fives input pair (smoke test, no assertion)."""
    from blocks.bricks.cost import SquaredError
    left = tensor.tensor3()
    right = tensor.tensor3()
    brick = SquaredError()
    error = brick.apply(left, right)
    compute = theano.function([left, right], error)
    ones = np.ones((3, 3, 3), dtype=theano.config.floatX)
    print(compute(ones, 5 * ones))
def get_costs(presoft, args):
    """Return (regularized_cost, unregularized_cost) for either indexed
    (language-model) targets or real-valued 3D targets.

    presoft: pre-softmax activations, (Time x Batch x Features).
    args: needs .dataset, .context, .used_inputs — TODO confirm shape
    conventions against the caller.
    """
    if has_indices(args.dataset):
        # Targets: (Time X Batch)
        y = tensor.lmatrix('targets')
        y_mask = tensor.ones_like(y, dtype=floatX)
        # Zero the first `context` timesteps so they don't count
        y_mask = tensor.set_subtensor(
            y_mask[:args.context, :],
            tensor.zeros_like(y_mask[:args.context, :],
                              dtype=floatX))

        time, batch, feat = presoft.shape
        cross_entropy = Softmax().categorical_cross_entropy(
            (y.flatten() * y_mask.reshape((batch * time, ))),
            (presoft.reshape(
                (batch * time, feat)) * y_mask.reshape((batch * time, 1))))

        # renormalization: scale back up by the fraction of masked steps
        renormalized_cross_entropy = cross_entropy * (
            tensor.sum(tensor.ones_like(y_mask)) / tensor.sum(y_mask))

        # BPC: Bits Per Character
        unregularized_cost = renormalized_cross_entropy / tensor.log(2)
        unregularized_cost.name = "cross_entropy"
    else:
        # Targets: (Time X Batch X Features)
        y = tensor.tensor3('targets', dtype=floatX)
        y_mask = tensor.ones_like(y[:, :, 0], dtype=floatX)
        y_mask = tensor.set_subtensor(
            y_mask[:args.context, :],
            tensor.zeros_like(y_mask[:args.context, :],
                              dtype=floatX))
        if args.used_inputs is not None:
            y_mask = tensor.set_subtensor(
                y_mask[:args.used_inputs, :],
                tensor.zeros_like(y_mask[:args.used_inputs, :],
                                  dtype=floatX))
        # SquaredError does not work on 3D tensor
        target = (y * y_mask.dimshuffle(0, 1, 'x'))
        # NOTE(review): presoft[:-1] drops the final timestep while the
        # targets keep all of theirs — presumably y is pre-shifted by
        # one step; confirm against the caller.
        values = (presoft[:-1, :, :] * y_mask.dimshuffle(0, 1, 'x'))
        target = target.reshape(
            (target.shape[0] * target.shape[1],
             target.shape[2]))
        values = values.reshape(
            (values.shape[0] * values.shape[1],
             values.shape[2]))
        unregularized_cost = SquaredError().apply(target, values)

        # renormalization
        unregularized_cost = unregularized_cost * (
            tensor.sum(tensor.ones_like(y_mask)) / tensor.sum(y_mask))
        unregularized_cost.name = "mean_squared_error"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = unregularized_cost + tensor.log(1)
    cost.name = "regularized_cost"

    return cost, unregularized_cost
def test_square():
    """Compile SquaredError over two 3D tensors and print its value for
    an all-ones vs all-fives input pair (smoke test, no assertion)."""
    from blocks.bricks.cost import SquaredError
    x = tensor.tensor3()
    y = tensor.tensor3()
    c = SquaredError()
    o = c.apply(x, y)
    f = theano.function([x, y], o)
    print(
        f(np.ones((3, 3, 3), dtype=theano.config.floatX), 5 * np.ones(
            (3, 3, 3), dtype=theano.config.floatX)))
def decoder(self, clean, corr, batch_size):
    """Build the ladder-network decoder path over unlabeled activations
    and accumulate per-layer denoising costs.

    clean/corr: encoder activation dicts; batch_size: number of labeled
    examples at the front of each batch (rows past it are unlabeled).
    Returns (est, costs): reconstructions and costs.denois[i] per layer.
    """
    # Decoder reconstructs only the unlabeled part of each batch.
    get_unlabeled = lambda x: x[batch_size:] if x is not None else x
    est = self.new_activation_dict()
    costs = AttributeDict()
    costs.denois = AttributeDict()
    # Walk the layers top-down; each entry carries its own index i.
    for i, ((_, spec), act_f) in self.layers[::-1]:
        z_corr = get_unlabeled(corr.z[i])
        z_clean = get_unlabeled(clean.z[i])
        z_clean_s = get_unlabeled(clean.s.get(i))
        z_clean_m = get_unlabeled(clean.m.get(i))

        # It's the last layer
        if i == len(self.layers) - 1:
            fspec = (None, None)
            ver = get_unlabeled(corr.h[i])
            ver_dim = self.layer_dims[i]
            top_g = True
        else:
            fspec = self.layers[i + 1][1][0]
            ver = est.z.get(i + 1)
            ver_dim = self.layer_dims.get(i + 1)
            top_g = False

        z_est = self.g(z_lat=z_corr,
                       z_ver=ver,
                       in_dims=ver_dim,
                       out_dims=self.layer_dims[i],
                       num=i,
                       fspec=fspec,
                       top_g=top_g)

        # For semi-supervised version: compare against the clean path in
        # normalized space when batch statistics are available.
        # FIX: removed a stray unconditional `z_est_norm = z_est` that
        # followed this if/else and made the normalization dead code.
        if z_clean_s:
            z_est_norm = (z_est - z_clean_m) / z_clean_s
        else:
            z_est_norm = z_est

        se = SquaredError('denois' + str(i))
        # Mean over the layer's units so costs are comparable across layers
        costs.denois[i] = se.apply(z_est_norm.flatten(2),
                                   z_clean.flatten(2)) \
            / np.prod(self.layer_dims[i], dtype=floatX)
        costs.denois[i].name = 'denois' + str(i)

        # Store references for later use
        est.z[i] = z_est
        est.h[i] = apply_act(z_est, act_f)
        est.s[i] = None
        est.m[i] = None
    return est, costs
def decoder(self, clean, corr):
    """Ladder-network decoder over the unlabeled activations stored in
    the clean/corr encoder dicts (accessed via .unlabeled).

    Returns (est, costs): reconstructions and per-layer denoising costs.
    """
    est = self.new_activation_dict()
    costs = AttributeDict()
    costs.denois = AttributeDict()
    # Walk the layers top-down; each entry carries its own index i.
    for i, ((_, spec), act_f) in self.layers[::-1]:
        z_corr = corr.unlabeled.z[i]
        z_clean = clean.unlabeled.z[i]
        z_clean_s = clean.unlabeled.s.get(i)
        z_clean_m = clean.unlabeled.m.get(i)

        # It's the last layer
        if i == len(self.layers) - 1:
            fspec = (None, None)
            ver = corr.unlabeled.h[i]
            ver_dim = self.layer_dims[i]
            top_g = True
        else:
            fspec = self.layers[i + 1][1][0]
            ver = est.z.get(i + 1)
            ver_dim = self.layer_dims.get(i + 1)
            top_g = False

        z_est = self.g(z_lat=z_corr,
                       z_ver=ver,
                       in_dims=ver_dim,
                       out_dims=self.layer_dims[i],
                       num=i,
                       fspec=fspec,
                       top_g=top_g)

        # The first layer
        # Normalize with the clean path's batch statistics when available
        if z_clean_s:
            z_est_norm = (z_est - z_clean_m) / z_clean_s
        else:
            z_est_norm = z_est

        se = SquaredError('denois' + str(i))
        # Mean over the layer's units so costs are comparable across layers
        costs.denois[i] = se.apply(z_est_norm.flatten(2),
                                   z_clean.flatten(2)) \
            / np.prod(self.layer_dims[i], dtype=floatX)
        costs.denois[i].name = 'denois' + str(i)

        # Store references for later use
        est.z[i] = z_est
        est.h[i] = apply_act(z_est, act_f)
        est.s[i] = None
        est.m[i] = None
    return est, costs
def decoder(self, clean, corr, batch_size):
    """Ladder-network decoder: reconstruct each layer's unlabeled
    activations and accumulate per-layer denoising costs.

    batch_size: count of labeled examples at the front of each batch;
    rows past that index are unlabeled.
    """
    get_unlabeled = lambda x: x[batch_size:] if x is not None else x
    est = self.new_activation_dict()
    costs = AttributeDict()
    costs.denois = AttributeDict()
    # Walk the layers top-down; each entry carries its own index i.
    for i, ((_, spec), act_f) in self.layers[::-1]:
        z_corr = get_unlabeled(corr.z[i])
        z_clean = get_unlabeled(clean.z[i])
        z_clean_s = get_unlabeled(clean.s.get(i))
        z_clean_m = get_unlabeled(clean.m.get(i))

        # It's the last layer
        if i == len(self.layers) - 1:
            fspec = (None, None)
            ver = get_unlabeled(corr.h[i])
            ver_dim = self.layer_dims[i]
            top_g = True
        else:
            fspec = self.layers[i + 1][1][0]
            ver = est.z.get(i + 1)
            ver_dim = self.layer_dims.get(i + 1)
            top_g = False

        z_est = self.g(
            z_lat=z_corr,
            z_ver=ver,
            in_dims=ver_dim,
            out_dims=self.layer_dims[i],
            num=i,
            fspec=fspec,
            top_g=top_g
        )

        # For semi-supervised version
        if z_clean_s:
            z_est_norm = (z_est - z_clean_m) / z_clean_s
        else:
            z_est_norm = z_est
        # NOTE(review): this unconditional reassignment makes the
        # normalization above dead code — looks like a leftover debug
        # line; confirm before relying on normalized denoising costs.
        z_est_norm = z_est

        se = SquaredError("denois" + str(i))
        # Mean over the layer's units so costs are comparable across layers
        costs.denois[i] = se.apply(z_est_norm.flatten(2), z_clean.flatten(2)) / np.prod(
            self.layer_dims[i], dtype=floatX
        )
        costs.denois[i].name = "denois" + str(i)

        # Store references for later use
        est.z[i] = z_est
        est.h[i] = apply_act(z_est, act_f)
        est.s[i] = None
        est.m[i] = None
    return est, costs
def test_collect():
    """collect_parameters must pack both MLP weight matrices into one
    flat shared variable and expose COLLECTED views with the original
    shapes and values."""
    inp = tensor.matrix()
    mlp = MLP(activations=[Logistic(), Logistic()], dims=[784, 100, 784],
              use_bias=False)
    cost = SquaredError().apply(inp, mlp.apply(inp))
    graph = ComputationGraph(cost)
    first_w, second_w = VariableFilter(roles=[PARAMETER])(graph.variables)
    # Fill the first matrix with ones and the second with twos
    for fill, weight in enumerate([first_w, second_w], start=1):
        weight.set_value(numpy.ones_like(weight.get_value()) * fill)
    packed_graph = collect_parameters(graph, graph.shared_variables)
    packed, = packed_graph.shared_variables
    boundary = 784 * 100
    # Flat vector: first 784*100 entries come from W1, the rest from W2
    assert numpy.all(packed.get_value()[:boundary] == 1.)
    assert numpy.all(packed.get_value()[boundary:] == 2.)
    assert packed.ndim == 1
    view_one, view_two = VariableFilter(roles=[COLLECTED])(
        packed_graph.variables)
    assert view_one.eval().shape == (784, 100)
    assert numpy.all(view_one.eval() == 1.)
    assert view_two.eval().shape == (100, 784)
    assert numpy.all(view_two.eval() == 2.)
def build_autoencoder(features, labels_num, labels_cat):
    """Deep autoencoder: a 5-layer Rectifier bottleneck MLP feeding a
    task-specific top MLP with numerical and categorical output heads.

    Returns (cost, parameters): cost is MSE on the numerical head plus
    binary cross-entropy on the categorical head.
    """
    mlp_bottom = MLP(activations=[
        Rectifier(),
        Rectifier(),
        Rectifier(),
        Rectifier(),
        Rectifier()
    ],
                     dims=[24033, 5000, 1000, 100, 1000, 5000],
                     weights_init=IsotropicGaussian(),
                     biases_init=Constant(1))
    mlp_bottom.initialize()

    mlp_top = build_top_mlp()
    mlp_top.push_initialization_config()
    mlp_top.initialize()

    # a = mlp_bottom.apply(features)
    # b = mlp_top.apply(a)

    # Construct feedforward sequence
    ss_seq = Sequence([mlp_bottom.apply, mlp_top.apply])
    ss_seq.push_initialization_config()
    ss_seq.initialize()

    # Top MLP yields two outputs: numerical head and categorical head
    [outputs_numerical, outputs_categorical] = ss_seq.apply(features)
    cost = SquaredError().apply(
        labels_num, outputs_numerical) + BinaryCrossEntropy().apply(
            labels_cat, outputs_categorical)

    cg = ComputationGraph(cost)
    #cg_dropout0 = apply_dropout(cg, [VariableFilter(roles=[INPUT])(cg.variables)[1]], .2)
    #cg_dropout1 = apply_dropout(cg, [VariableFilter(roles=[OUTPUT])(cg.variables)[1], VariableFilter(roles=[OUTPUT])(cg.variables)[3]], .2)
    #cost_dropout1 = cg_dropout1.outputs[0]

    return cost, cg.parameters
def train(self):
    """Build the RNNwMini regression graph over self.sharedBatch and set
    up a Blocks MainLoop (stored on self.mainloop), then run it.

    Monitors `cost` on the validation stream and tracks the best
    `valid_cost` via self.track_best.
    """
    x = self.sharedBatch['x']
    x.name = 'x_myinput'
    xmini = self.sharedBatch['xmini']
    xmini.name = 'xmini_myinput'
    y = self.sharedBatch['y']
    y.name = 'y_myinput'

    # we need to provide data for the LSTM layer of size 4 * ltsm_dim, see
    # LSTM layer documentation for the explanation
    x_to_h = Linear(self.input_dimx,
                    self.dim,
                    name='x_to_h',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))
    xmini_to_h = Linear(self.input_dimxmini,
                        self.mini_dim,
                        name='xmini_to_h',
                        weights_init=IsotropicGaussian(),
                        biases_init=Constant(0.0))

    rnnwmini = RNNwMini(dim=self.dim,
                        mini_dim=self.mini_dim,
                        summary_dim=self.summary_dim)

    h_to_o = Linear(self.summary_dim,
                    1,
                    name='h_to_o',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))

    x_transform = x_to_h.apply(x)
    xmini_transform = xmini_to_h.apply(xmini)

    h = rnnwmini.apply(x=x_transform, xmini=xmini_transform)

    # only values of hidden units of the last timeframe are used for
    # the classification
    y_hat = h_to_o.apply(h[-1])
    #y_hat = Logistic().apply(y_hat)

    cost = SquaredError().apply(y, y_hat)
    cost.name = 'cost'

    rnnwmini.initialize()
    x_to_h.initialize()
    xmini_to_h.initialize()
    h_to_o.initialize()

    # Inputs come from shared variables, so the function takes no args
    self.f = theano.function(inputs=[], outputs=y_hat)

    #print("self.f === ")
    #print(self.f())
    #print(self.f().shape)
    #print("====")

    self.cg = ComputationGraph(cost)
    m = Model(cost)

    algorithm = GradientDescent(cost=cost,
                                parameters=self.cg.parameters,
                                step_rule=RMSProp(learning_rate=0.01),
                                on_unused_sources='ignore')

    valid_monitor = DataStreamMonitoringShared(
        variables=[cost],
        data_stream=self.stream_valid_int,
        prefix="valid",
        sharedBatch=self.sharedBatch,
        sharedData=self.sharedData)
    # NOTE(review): train_monitor is built but never added to
    # `extensions` below — confirm whether that is intentional.
    train_monitor = TrainingDataMonitoring(variables=[cost],
                                           prefix="train",
                                           after_epoch=True)

    sharedVarMonitor = SwitchSharedReferences(self.sharedBatch,
                                              self.sharedData)
    tBest = self.track_best('valid_cost', self.cg)
    self.tracker = tBest[0]
    extensions = [sharedVarMonitor, valid_monitor] + tBest

    if self.debug:
        extensions.append(Printing())

    self.algorithm = algorithm
    self.extensions = extensions
    self.model = m
    self.mainloop = MainLoop(self.algorithm,
                             self.stream_train_int,
                             extensions=self.extensions,
                             model=self.model)
    self.main_loop(True)
def apply(self, input_labeled, target_labeled, input_unlabeled):
    """Build the full ladder-network graph: clean and corrupted encoder
    passes over [labeled; unlabeled] input, a denoising decoder, and the
    classification cost/error variables (stored on self.costs and
    self.error)."""
    self.target_labeled = target_labeled
    self.layer_counter = 0
    input_dim = self.p.encoder_layers[0]

    # Store the dimension tuples in the same order as layers.
    layers = self.layers
    self.layer_dims = {0: input_dim}

    self.lr = self.default_lr

    self.costs = costs = AttributeDict()
    self.costs.denois = AttributeDict()

    self.act = AttributeDict()
    self.error = AttributeDict()

    top = len(layers) - 1

    # First N rows of each batch are labeled, the rest unlabeled
    N = input_labeled.shape[0]
    self.join = lambda l, u: T.concatenate([l, u], axis=0)
    self.labeled = lambda x: x[:N] if x is not None else x
    self.unlabeled = lambda x: x[N:] if x is not None else x
    self.split_lu = lambda x: (self.labeled(x), self.unlabeled(x))

    input_concat = self.join(input_labeled, input_unlabeled)

    def encoder(input_, path_name, input_noise_std=0, noise_std=[]):
        # One encoder pass; path_name selects the clean/corrupted path.
        # NOTE(review): mutable default noise_std=[] is shared across
        # calls — harmless here since it is never mutated.
        h = input_

        logger.info(' 0: noise %g' % input_noise_std)
        if input_noise_std > 0.:
            h = h + self.noise_like(h) * input_noise_std

        d = AttributeDict()
        d.unlabeled = self.new_activation_dict()
        d.labeled = self.new_activation_dict()
        d.labeled.z[0] = self.labeled(h)
        d.unlabeled.z[0] = self.unlabeled(h)
        prev_dim = input_dim
        for i, (spec, _, act_f) in layers[1:]:
            d.labeled.h[i - 1], d.unlabeled.h[i - 1] = self.split_lu(h)
            noise = noise_std[i] if i < len(noise_std) else 0.
            curr_dim, z, m, s, h = self.f(h, prev_dim, spec, i, act_f,
                                          path_name=path_name,
                                          noise_std=noise)
            assert self.layer_dims.get(i) in (None, curr_dim)
            self.layer_dims[i] = curr_dim
            d.labeled.z[i], d.unlabeled.z[i] = self.split_lu(z)
            d.unlabeled.s[i] = s
            d.unlabeled.m[i] = m
            prev_dim = curr_dim
        # Top-layer h; uses the loop's final i
        d.labeled.h[i], d.unlabeled.h[i] = self.split_lu(h)
        return d

    # Clean, supervised
    logger.info('Encoder: clean, labeled')
    clean = self.act.clean = encoder(input_concat, 'clean')

    # Corrupted, supervised
    logger.info('Encoder: corr, labeled')
    corr = self.act.corr = encoder(input_concat, 'corr',
                                   input_noise_std=self.p.super_noise_std,
                                   noise_std=self.p.f_local_noise_std)
    est = self.act.est = self.new_activation_dict()

    # Decoder path in opposite order
    logger.info('Decoder: z_corr -> z_est')
    for i, ((_, spec), l_type, act_f) in layers[::-1]:
        z_corr = corr.unlabeled.z[i]
        z_clean = clean.unlabeled.z[i]
        z_clean_s = clean.unlabeled.s.get(i)
        z_clean_m = clean.unlabeled.m.get(i)
        fspec = layers[i+1][1][0] if len(layers) > i+1 else (None, None)

        if i == top:
            ver = corr.unlabeled.h[i]
            ver_dim = self.layer_dims[i]
            top_g = True
        else:
            ver = est.z.get(i + 1)
            ver_dim = self.layer_dims.get(i + 1)
            top_g = False

        z_est = self.g(z_lat=z_corr,
                       z_ver=ver,
                       in_dims=ver_dim,
                       out_dims=self.layer_dims[i],
                       l_type=l_type,
                       num=i,
                       fspec=fspec,
                       top_g=top_g)

        if z_est is not None:
            # Denoising cost
            if z_clean_s and self.p.zestbn == 'bugfix':
                # Normalize with clean-path batch stats (eps for stability)
                z_est_norm = (z_est - z_clean_m) / T.sqrt(
                    z_clean_s + np.float32(1e-10))
            elif z_clean_s is None or self.p.zestbn == 'no':
                z_est_norm = z_est
            else:
                assert False, 'Not supported path'

            se = SquaredError('denois' + str(i))
            costs.denois[i] = se.apply(z_est_norm.flatten(2),
                                       z_clean.flatten(2)) \
                / np.prod(self.layer_dims[i], dtype=floatX)
            costs.denois[i].name = 'denois' + str(i)
            denois_print = 'denois %.2f' % self.p.denoising_cost_x[i]
        else:
            denois_print = ''

        # Store references for later use
        est.h[i] = self.apply_act(z_est, act_f)
        est.z[i] = z_est
        est.s[i] = None
        est.m[i] = None

        logger.info(' g%d: %10s, %s, dim %s -> %s' % (
            i, l_type, denois_print,
            self.layer_dims.get(i+1),
            self.layer_dims.get(i)
        ))

    # Costs
    y = target_labeled.flatten()

    costs.class_clean = CategoricalCrossEntropy().apply(y,
                                                        clean.labeled.h[top])
    costs.class_clean.name = 'cost_class_clean'

    costs.class_corr = CategoricalCrossEntropy().apply(y,
                                                       corr.labeled.h[top])
    costs.class_corr.name = 'cost_class_corr'

    # This will be used for training
    costs.total = costs.class_corr * 1.0
    for i in range(top + 1):
        if costs.denois.get(i) and self.p.denoising_cost_x[i] > 0:
            costs.total += costs.denois[i] * self.p.denoising_cost_x[i]
    costs.total.name = 'cost_total'

    # Classification error
    mr = MisclassificationRate()
    self.error.clean = mr.apply(y, clean.labeled.h[top]) * np.float32(100.)
    self.error.clean.name = 'error_rate_clean'
input_dim = 512 hidden_dims = [int(dim) for dim in args.dim.split(",")] if args.batchnorm: network = BatchNormalizedMLP else: network = MLP autoencoder = network(activations=[Tanh() for _ in xrange(len(hidden_dims))] + [Identity()], dims=[input_dim] + hidden_dims + [input_dim], weights_init=Uniform(width=0.02), biases_init=Constant(0)) autoencoder.initialize() hopefully_states_again = autoencoder.apply(states) cost = SquaredError().apply(hopefully_states_again, states) cost.name = "squared_error" cost_model = Model(cost) algorithm = GradientDescent(cost=cost, parameters=cost_model.parameters, step_rule=Adam()) # handle data data = H5PYDataset(args.file, which_sets=("train",), load_in_memory=True) # trash data for testing """ dataraw = numpy.zeros((10000, 512), dtype="float32") for row in xrange(dataraw.shape[0]): dataraw[row] = numpy.random.rand(512) data = OrderedDict() data["act_seqs"] = dataraw
def apply(self, input_labeled, target_labeled, input_unlabeled):
    """Build the ladder-network graph for the IV (out-of-set aware)
    variant: clean + corrupted encoders, denoising decoder, and the
    classification/OOS cost and error variables on self.costs,
    self.error and self.oos."""
    self.layer_counter = 0
    input_dim = self.p.encoder_layers[0]

    # Store the dimension tuples in the same order as layers.
    layers = self.layers
    self.layer_dims = {0: input_dim}

    # Learning rate in a shared variable so schedules can update it
    self.lr = self.shared(self.default_lr, 'learning_rate', role=None)

    self.costs = costs = AttributeDict()
    self.costs.denois = AttributeDict()

    self.act = AttributeDict()
    self.error = AttributeDict()
    self.oos = AttributeDict()

    top = len(layers) - 1

    # First N rows of each batch are labeled, the rest unlabeled
    N = input_labeled.shape[0]
    self.join = lambda l, u: T.concatenate([l, u], axis=0)
    self.labeled = lambda x: x[:N] if x is not None else x
    self.unlabeled = lambda x: x[N:] if x is not None else x
    self.split_lu = lambda x: (self.labeled(x), self.unlabeled(x))

    input_concat = self.join(input_labeled, input_unlabeled)

    def encoder(input_, path_name, input_noise_std=0, noise_std=[]):
        # One encoder pass; path_name selects the clean/corrupted path.
        h = input_

        logger.info(' 0: noise %g' % input_noise_std)
        if input_noise_std > 0.:
            h = h + self.noise_like(h) * input_noise_std

        d = AttributeDict()
        d.unlabeled = self.new_activation_dict()
        d.labeled = self.new_activation_dict()
        d.labeled.z[0] = self.labeled(h)
        d.unlabeled.z[0] = self.unlabeled(h)
        prev_dim = input_dim
        for i, (spec, _, act_f) in layers[1:]:
            d.labeled.h[i - 1], d.unlabeled.h[i - 1] = self.split_lu(h)
            noise = noise_std[i] if i < len(noise_std) else 0.
            curr_dim, z, m, s, h = self.f(h, prev_dim, spec, i, act_f,
                                          path_name=path_name,
                                          noise_std=noise)
            assert self.layer_dims.get(i) in (None, curr_dim)
            self.layer_dims[i] = curr_dim
            d.labeled.z[i], d.unlabeled.z[i] = self.split_lu(z)
            d.unlabeled.s[i] = s
            d.unlabeled.m[i] = m
            prev_dim = curr_dim
        # Top-layer h; uses the loop's final i
        d.labeled.h[i], d.unlabeled.h[i] = self.split_lu(h)
        return d

    # Clean, supervised
    logger.info('Encoder: clean, labeled')
    clean = self.act.clean = encoder(input_concat, 'clean')

    # Corrupted, supervised
    logger.info('Encoder: corr, labeled')
    corr = self.act.corr = encoder(input_concat, 'corr',
                                   input_noise_std=self.p.super_noise_std,
                                   noise_std=self.p.f_local_noise_std)
    est = self.act.est = self.new_activation_dict()

    # Decoder path in opposite order
    logger.info('Decoder: z_corr -> z_est')
    for i, ((_, spec), l_type, act_f) in layers[::-1]:
        z_corr = corr.unlabeled.z[i]
        z_clean = clean.unlabeled.z[i]
        z_clean_s = clean.unlabeled.s.get(i)
        z_clean_m = clean.unlabeled.m.get(i)
        fspec = layers[i + 1][1][0] if len(layers) > i + 1 else (None, None)

        if i == top:
            ver = corr.unlabeled.h[i]
            ver_dim = self.layer_dims[i]
            top_g = True
        else:
            ver = est.z.get(i + 1)
            ver_dim = self.layer_dims.get(i + 1)
            top_g = False

        z_est = self.g(z_lat=z_corr,
                       z_ver=ver,
                       in_dims=ver_dim,
                       out_dims=self.layer_dims[i],
                       l_type=l_type,
                       num=i,
                       fspec=fspec,
                       top_g=top_g)

        if z_est is not None:
            # Denoising cost
            if z_clean_s and self.p.zestbn == 'bugfix':
                # Normalize with clean-path batch stats (eps for stability)
                z_est_norm = (z_est - z_clean_m
                              ) / T.sqrt(z_clean_s + np.float32(1e-10))
            elif z_clean_s is None or self.p.zestbn == 'no':
                z_est_norm = z_est
            else:
                assert False, 'Not supported path'

            se = SquaredError('denois' + str(i))
            costs.denois[i] = se.apply(z_est_norm.flatten(2),
                                       z_clean.flatten(2)) \
                / np.prod(self.layer_dims[i], dtype=floatX)
            costs.denois[i].name = 'denois' + str(i)
            denois_print = 'denois %.2f' % self.p.denoising_cost_x[i]
        else:
            denois_print = ''

        # Store references for later use
        est.h[i] = self.apply_act(z_est, act_f)
        est.z[i] = z_est
        est.s[i] = None
        est.m[i] = None

        logger.info(' g%d: %10s, %s, dim %s -> %s' %
                    (i, l_type, denois_print, self.layer_dims.get(i + 1),
                     self.layer_dims.get(i)))

    # Costs
    y = target_labeled.flatten()
    # Q: highest in-set class index; label 0 / > Q mean out-of-set
    Q = int(self.layer_dims[top][0]) - 1
    logger.info('Q=%d' % Q)
    costs.class_clean = CategoricalCrossEntropyIV(
        Q=Q,
        alpha=self.p.alpha,
        beta=self.p.beta,
        dbeta=self.p.dbeta,
        gamma=self.p.gamma,
        gamma1=self.p.gamma1).apply(y, clean.labeled.h[top])
    costs.class_clean.name = 'cost_class_clean'

    costs.class_corr = CategoricalCrossEntropyIV(
        Q=Q,
        alpha=self.p.alpha,
        beta=self.p.beta,
        dbeta=self.p.dbeta,
        gamma=self.p.gamma,
        gamma1=self.p.gamma1,
    ).apply(y, corr.labeled.h[top])
    costs.class_corr.name = 'cost_class_corr'

    # This will be used for training
    costs.total = costs.class_corr * 1.0
    for i in range(top + 1):
        if costs.denois.get(i) and self.p.denoising_cost_x[i] > 0:
            costs.total += costs.denois[i] * self.p.denoising_cost_x[i]

    # Optional entropy penalty on out-of-set / unlabeled examples
    if self.p.alpha_clean:
        y_true = y
        eps = np.float32(1e-6)
        # scale preds so that the class probas of each sample sum to 1
        y_pred = clean.labeled.h[top] + eps
        y_pred /= y_pred.sum(axis=-1, keepdims=True)
        y0 = T.or_(T.eq(y_true, 0), T.gt(y_true, Q))  # out-of-set or unlabeled
        y0sum = y0.sum() + eps  # number of oos
        cost1 = T.nnet.categorical_crossentropy(y_pred, y_pred)
        cost1 = T.dot(y0, cost1) / y0sum  # average cost per labeled example
        costs.total += self.p.alpha_clean * cost1

    costs.total.name = 'cost_total'

    # Classification error
    mr = MisclassificationRateIV(oos_thr=self.p.oos_thr)
    self.error.clean = mr.apply(y, clean.labeled.h[top]) * np.float32(100.)
    self.error.clean.name = 'error_rate_clean'
    oosr = OOSRateIV()
    self.oos.clean = oosr.apply(y, clean.labeled.h[top]) * np.float32(100.)
    self.oos.clean.name = 'oos_rate_clean'
def get_costs(presoft, args):
    """Return (regularized_cost, unregularized_cost) for either indexed
    (language-model) targets or real-valued 3D targets.

    presoft: pre-softmax activations, (Time x Batch x Features).
    args: needs .dataset, .context, .used_inputs — TODO confirm shape
    conventions against the caller.
    """
    if has_indices(args.dataset):
        # Targets: (Time X Batch)
        y = tensor.lmatrix('targets')
        y_mask = tensor.ones_like(y, dtype=floatX)
        # Zero the first `context` timesteps so they don't count
        y_mask = tensor.set_subtensor(y_mask[:args.context, :],
                                      tensor.zeros_like(
                                          y_mask[:args.context, :],
                                          dtype=floatX))

        time, batch, feat = presoft.shape
        cross_entropy = Softmax().categorical_cross_entropy(
            (y.flatten() * y_mask.reshape((batch * time, ))),
            (presoft.reshape((batch * time, feat)) *
             y_mask.reshape((batch * time, 1))))

        # renormalization: scale back up by the fraction of masked steps
        renormalized_cross_entropy = cross_entropy * (
            tensor.sum(tensor.ones_like(y_mask)) / tensor.sum(y_mask))

        # BPC: Bits Per Character
        unregularized_cost = renormalized_cross_entropy / tensor.log(2)
        unregularized_cost.name = "cross_entropy"
    else:
        # Targets: (Time X Batch X Features)
        y = tensor.tensor3('targets', dtype=floatX)
        y_mask = tensor.ones_like(y[:, :, 0], dtype=floatX)
        y_mask = tensor.set_subtensor(y_mask[:args.context, :],
                                      tensor.zeros_like(
                                          y_mask[:args.context, :],
                                          dtype=floatX))
        if args.used_inputs is not None:
            y_mask = tensor.set_subtensor(y_mask[:args.used_inputs, :],
                                          tensor.zeros_like(
                                              y_mask[:args.used_inputs, :],
                                              dtype=floatX))
        # SquaredError does not work on 3D tensor
        target = (y * y_mask.dimshuffle(0, 1, 'x'))
        # NOTE(review): presoft[:-1] drops the final timestep while the
        # targets keep all of theirs — presumably y is pre-shifted by
        # one step; confirm against the caller.
        values = (presoft[:-1, :, :] * y_mask.dimshuffle(0, 1, 'x'))
        target = target.reshape((target.shape[0] * target.shape[1],
                                 target.shape[2]))
        values = values.reshape((values.shape[0] * values.shape[1],
                                 values.shape[2]))
        unregularized_cost = SquaredError().apply(target, values)

        # renormalization
        unregularized_cost = unregularized_cost * (
            tensor.sum(tensor.ones_like(y_mask)) / tensor.sum(y_mask))
        unregularized_cost.name = "mean_squared_error"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = unregularized_cost + tensor.log(1)
    cost.name = "regularized_cost"

    return cost, unregularized_cost
def apply(self, input_labeled, target_labeled, input_unlabeled):
    """Build the ladder-network graph (plain cross-entropy variant):
    clean + corrupted encoders over [labeled; unlabeled] input, denoising
    decoder, and classification cost/error variables on self.costs and
    self.error."""
    self.layer_counter = 0
    input_dim = self.p.encoder_layers[0]

    # Store the dimension tuples in the same order as layers.
    layers = self.layers
    self.layer_dims = {0: input_dim}

    # Learning rate in a shared variable so schedules can update it
    self.lr = self.shared(self.default_lr, 'learning_rate', role=None)

    self.costs = costs = AttributeDict()
    self.costs.denois = AttributeDict()

    self.act = AttributeDict()
    self.error = AttributeDict()

    top = len(layers) - 1

    # First N rows of each batch are labeled, the rest unlabeled
    N = input_labeled.shape[0]
    self.join = lambda l, u: T.concatenate([l, u], axis=0)
    self.labeled = lambda x: x[:N] if x is not None else x
    self.unlabeled = lambda x: x[N:] if x is not None else x
    self.split_lu = lambda x: (self.labeled(x), self.unlabeled(x))

    input_concat = self.join(input_labeled, input_unlabeled)

    def encoder(input_, path_name, input_noise_std=0, noise_std=[]):
        # One encoder pass; path_name selects the clean/corrupted path.
        h = input_

        logger.info(' 0: noise %g' % input_noise_std)
        if input_noise_std > 0.:
            h = h + self.noise_like(h) * input_noise_std

        d = AttributeDict()
        d.unlabeled = self.new_activation_dict()
        d.labeled = self.new_activation_dict()
        d.labeled.z[0] = self.labeled(h)
        d.unlabeled.z[0] = self.unlabeled(h)
        prev_dim = input_dim
        for i, (spec, _, act_f) in layers[1:]:
            d.labeled.h[i - 1], d.unlabeled.h[i - 1] = self.split_lu(h)
            noise = noise_std[i] if i < len(noise_std) else 0.
            curr_dim, z, m, s, h = self.f(h, prev_dim, spec, i, act_f,
                                          path_name=path_name,
                                          noise_std=noise)
            assert self.layer_dims.get(i) in (None, curr_dim)
            self.layer_dims[i] = curr_dim
            d.labeled.z[i], d.unlabeled.z[i] = self.split_lu(z)
            d.unlabeled.s[i] = s
            d.unlabeled.m[i] = m
            prev_dim = curr_dim
        # Top-layer h; uses the loop's final i
        d.labeled.h[i], d.unlabeled.h[i] = self.split_lu(h)
        return d

    # Clean, supervised
    logger.info('Encoder: clean, labeled')
    clean = self.act.clean = encoder(input_concat, 'clean')

    # Corrupted, supervised
    logger.info('Encoder: corr, labeled')
    corr = self.act.corr = encoder(input_concat, 'corr',
                                   input_noise_std=self.p.super_noise_std,
                                   noise_std=self.p.f_local_noise_std)
    est = self.act.est = self.new_activation_dict()

    # Decoder path in opposite order
    logger.info('Decoder: z_corr -> z_est')
    for i, ((_, spec), l_type, act_f) in layers[::-1]:
        z_corr = corr.unlabeled.z[i]
        z_clean = clean.unlabeled.z[i]
        z_clean_s = clean.unlabeled.s.get(i)
        z_clean_m = clean.unlabeled.m.get(i)
        fspec = layers[i + 1][1][0] if len(layers) > i + 1 else (None, None)

        if i == top:
            ver = corr.unlabeled.h[i]
            ver_dim = self.layer_dims[i]
            top_g = True
        else:
            ver = est.z.get(i + 1)
            ver_dim = self.layer_dims.get(i + 1)
            top_g = False

        z_est = self.g(z_lat=z_corr,
                       z_ver=ver,
                       in_dims=ver_dim,
                       out_dims=self.layer_dims[i],
                       l_type=l_type,
                       num=i,
                       fspec=fspec,
                       top_g=top_g)

        if z_est is not None:
            # Denoising cost
            # Normalize with clean-path batch stats when available
            if z_clean_s:
                z_est_norm = (z_est - z_clean_m) / z_clean_s
            else:
                z_est_norm = z_est

            se = SquaredError('denois' + str(i))
            costs.denois[i] = se.apply(z_est_norm.flatten(2),
                                       z_clean.flatten(2)) \
                / np.prod(self.layer_dims[i], dtype=floatX)
            costs.denois[i].name = 'denois' + str(i)
            denois_print = 'denois %.2f' % self.p.denoising_cost_x[i]
        else:
            denois_print = ''

        # Store references for later use
        est.h[i] = self.apply_act(z_est, act_f)
        est.z[i] = z_est
        est.s[i] = None
        est.m[i] = None

        logger.info(' g%d: %10s, %s, dim %s -> %s' %
                    (i, l_type, denois_print, self.layer_dims.get(i + 1),
                     self.layer_dims.get(i)))

    # Costs
    y = target_labeled.flatten()
    costs.class_clean = CategoricalCrossEntropy().apply(
        y, clean.labeled.h[top])
    costs.class_clean.name = 'cost_class_clean'
    costs.class_corr = CategoricalCrossEntropy().apply(
        y, corr.labeled.h[top])
    costs.class_corr.name = 'cost_class_corr'

    # This will be used for training
    costs.total = costs.class_corr * 1.0
    for i in range(top + 1):
        if costs.denois.get(i) and self.p.denoising_cost_x[i] > 0:
            costs.total += costs.denois[i] * self.p.denoising_cost_x[i]
    costs.total.name = 'cost_total'

    # Classification error
    mr = MisclassificationRate()
    self.error.clean = mr.apply(y, clean.labeled.h[top]) * np.float32(100.)
    self.error.clean.name = 'error_rate_clean'
def train_lstm(train, test, input_dim, hidden_dimension, columns, epochs,
               save_file, execution_name, batch_size, plot):
    """Train a LinearLSTM regressor and return
    (best test MAPE, epochs done).

    Optimizes the MAPE cost (c_test) directly; the squared error `c` is
    monitored only. Checkpoints parameters whenever test_cost improves.
    """
    stream_train = build_stream(train, batch_size, columns)
    stream_test = build_stream(test, batch_size, columns)

    # The train stream will return (TimeSequence, BatchSize, Dimensions) for
    # and the train test will return (TimeSequence, BatchSize, 1)
    x = T.tensor3('x')
    y = T.tensor3('y')
    # NOTE(review): this swaps the first two axes of y via reshape (not a
    # transpose) — presumably to match the prediction layout; confirm.
    y = y.reshape((y.shape[1], y.shape[0], y.shape[2]))

    # input_dim = 6
    # output_dim = 1
    linear_lstm = LinearLSTM(input_dim, 1, hidden_dimension,
                             # print_intermediate=True,
                             print_attrs=['__str__', 'shape'])

    y_hat = linear_lstm.apply(x)
    linear_lstm.initialize()

    c_test = AbsolutePercentageError().apply(y, y_hat)
    c_test.name = 'mape'
    c = SquaredError().apply(y, y_hat)
    c.name = 'cost'

    cg = ComputationGraph(c_test)

    def one_perc_min(current_value, best_value):
        # Treat improvements smaller than 1% as no improvement
        if (1 - best_value / current_value) > 0.01:
            return best_value
        else:
            return current_value

    extensions = []
    extensions.append(DataStreamMonitoring(variables=[c, c_test],
                                           data_stream=stream_test,
                                           prefix='test',
                                           after_epoch=False,
                                           every_n_epochs=100))
    extensions.append(TrainingDataMonitoring(variables=[c_test],
                                             prefix='train',
                                             after_epoch=True))
    extensions.append(FinishAfter(after_n_epochs=epochs))
    # extensions.append(Printing())
    # extensions.append(ProgressBar())
    extensions.append(TrackTheBest('test_mape', choose_best=one_perc_min))
    extensions.append(TrackTheBest('test_cost', choose_best=one_perc_min))
    extensions.append(FinishIfNoImprovementAfter('test_cost_best_so_far',
                                                 epochs=500))

    # Save only parameters, not the whole main loop and only when best_test_cost is updated
    checkpoint = Checkpoint(save_file, save_main_loop=False,
                            after_training=False)
    checkpoint.add_condition(['after_epoch'],
                             predicate=OnLogRecord('test_cost_best_so_far'))
    extensions.append(checkpoint)

    if BOKEH_AVAILABLE and plot:
        extensions.append(Plot(execution_name, channels=[[
            # 'train_cost',
            'test_cost']]))

    step_rule = Adam()
    algorithm = GradientDescent(cost=c_test, parameters=cg.parameters,
                                step_rule=step_rule)
    main_loop = MainLoop(algorithm, stream_train, model=Model(c_test),
                         extensions=extensions)
    main_loop.run()

    test_mape = 0
    if main_loop.log.status.get('best_test_mape', None) is None:
        # No best recorded: reload the checkpointed parameters and
        # evaluate MAPE on the test stream directly
        with open(save_file, 'rb') as f:
            parameters = load_parameters(f)
        model = main_loop.model
        model.set_parameter_values(parameters)
        ev = DatasetEvaluator([c_test])
        test_mape = ev.evaluate(stream_test)['mape']
    else:
        test_mape = main_loop.log.status['best_test_mape']

    return test_mape, main_loop.log.status['epochs_done']
test_dataset = [word_bank.convert_to_vectors_and_labels(sentence) for sentence in test_sentences] # MODEL SETUP textRNN = TextRNN(dim_in=VECTOR_SIZE, dim_hidden=HIDDEN_UNITS, dim_out=VECTOR_SIZE) output = textRNN.run(inputs=x) #get_states_and_output = T.function([x, x_mask], [output]) # COST SETUP #y_hat = np.float32(np.ones((3,1))) labels = np.float32([data[1] for data in dataset]) inputs_data = np.float32([data[0] for data in dataset]) test_labels = np.float32([data[1] for data in test_dataset]) test_inputs_data = np.float32([data[0] for data in test_dataset]) cost = SquaredError().apply(y, output) cost.name = 'MSE_with_regularization' cg = ComputationGraph(cost) #inputs = VariableFilter(roles=[INPUT], bricks=[SimpleRecurrent])(cg.variables) #inputs = [inputs[0]] #cg_dropout = apply_dropout(cg, inputs, 0.5) #fprop_dropout = T.function([cg_dropout.inputs], [cg_dropout.outputs[0]]) #dropped_out = VariableFilter(roles=[DROPOUT])(cg.variables) #inputs_referenced = [var.tag.replacement_of for var in dropped_out] #set(inputs) == set(inputs_referenced) get_states_and_output = T.function([x], [output]) #W = VariableFilter(roles=[WEIGHT])(cg.variables) #W = W
#lstm.weights_init = Orthogonal() lstm.biases_init = Constant(0.) lstm.initialize() #ComputationGraph(encode.apply(x)).get_theano_function()(features_test)[0].shape #ComputationGraph(lstm.apply(encoded)).get_theano_function()(features_test) #ComputationGraph(decode.apply(hiddens[-1])).get_theano_function()(features_test)[0].shape #ComputationGraph(SquaredError().apply(y, y_hat.flatten())).get_theano_function()(features_test, targets_test)[0].shape encoded = encode.apply(x) #hiddens = lstm.apply(encoded, gates.apply(x)) hiddens = lstm.apply(encoded) y_hat = decode.apply(hiddens[-1]) cost = SquaredError().apply(y, y_hat) cost.name = 'cost' #ipdb.set_trace() #ComputationGraph(y_hat).get_theano_function()(features_test)[0].shape #ComputationGraph(cost).get_theano_function()(features_test, targets_test)[0].shape cg = ComputationGraph(cost) #cg = ComputationGraph(hiddens).get_theano_function() #ipdb.set_trace() algorithm = GradientDescent(cost=cost, params=cg.parameters, step_rule=CompositeRule([StepClipping(5.0), Scale(0.01)]))
def __init__(self, vocab_size, embedding_dim, state_dim, att_dim,
             maxout_dim, representation_dim, attention_strategy='content',
             attention_sources='s', readout_sources='sfa',
             memory='none', memory_size=500, seq_len=50,
             init_strategy='last', theano_seed=None, **kwargs):
    """Creates a new decoder brick without embedding.

    Args:
        vocab_size (int): Target language vocabulary size
        embedding_dim (int): Size of feedback embedding layer
        state_dim (int): Number of hidden units
        att_dim (int): Size of attention match vector
        maxout_dim (int): Size of maxout layer
        representation_dim (int): Dimension of source annotations
        attention_strategy (string): Which attention should be used
                                     cf.  ``_initialize_attention``
        attention_sources (string): Defines the sources used by the
                                    attention model 's' for decoder
                                    states, 'f' for feedback
        readout_sources (string): Defines the sources used in the
                                  readout network. 's' for decoder
                                  states, 'f' for feedback, 'a' for
                                  attention (context vector)
        memory (string): Which external memory should be used
                         (cf.  ``_initialize_attention``)
        memory_size (int): Size of the external memory structure
        seq_len (int): Maximum sentence length
        init_strategy (string): How to initialize the RNN state
                                (cf.  ``GRUInitialState``)
        theano_seed: Random seed
    """
    super(NoLookupDecoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.representation_dim = representation_dim
    self.theano_seed = theano_seed

    # Initialize gru with special initial state
    self.transition = GRUInitialState(attended_dim=state_dim,
                                      init_strategy=init_strategy,
                                      dim=state_dim,
                                      activation=Tanh(),
                                      name='decoder')

    # Initialize the attention mechanism; non-positive att_dim falls
    # back to the decoder state dimension.
    att_dim = att_dim if att_dim > 0 else state_dim
    self.attention, src_names = _initialize_attention(
        attention_strategy, seq_len, self.transition, representation_dim,
        att_dim, attention_sources, readout_sources, memory, memory_size)

    # Initialize the readout, note that SoftmaxEmitter emits -1 for
    # initial outputs which is used by LookupFeedBackWMT15
    maxout_dim = maxout_dim if maxout_dim > 0 else state_dim
    readout = Readout(
        source_names=src_names,
        readout_dim=embedding_dim,
        emitter=NoLookupEmitter(initial_output=-1,
                                readout_dim=embedding_dim,
                                cost_brick=SquaredError()),
        #                       cost_brick=CategoricalCrossEntropy()),
        feedback_brick=TrivialFeedback(output_dim=embedding_dim),
        post_merge=InitializableFeedforwardSequence([
            Bias(dim=maxout_dim, name='maxout_bias').apply,
            Maxout(num_pieces=2, name='maxout').apply,
            # A 2-piece maxout halves the dimension, so the following
            # Linear takes maxout_dim // 2 inputs.  Floor division is
            # required: on Python 3, `/` would produce a float dim.
            Linear(input_dim=maxout_dim // 2, output_dim=embedding_dim,
                   use_bias=False, name='softmax0').apply,
            Logistic(name='softmax1').apply]),
        merged_dim=maxout_dim)

    # Build sequence generator accordingly
    self.sequence_generator = SequenceGenerator(
        readout=readout,
        transition=self.transition,
        attention=self.attention,
        fork=Fork([name for name in self.transition.apply.sequences
                   if name != 'mask'],
                  prototype=Linear()))

    self.children = [self.sequence_generator]
def apply(self, input_labeled, target_labeled, input_unlabeled):
    """Build the ladder-network computation graph.

    Runs a clean and a corrupted encoder pass over the concatenation of
    labeled and unlabeled inputs, then a decoder pass that produces
    denoising costs per layer, and finally assembles the classification
    costs and error rates.  Populates ``self.costs``, ``self.act``,
    ``self.error`` and ``self.oos`` as side effects and leaves the
    training objective in ``self.costs.total``.

    NOTE(review): input_labeled/input_unlabeled are assumed to be 2-D
    (examples x features) Theano variables and target_labeled a vector of
    class indices — confirm against the caller.
    """
    self.layer_counter = 0
    input_dim = self.p.encoder_layers[0]

    # Store the dimension tuples in the same order as layers.
    layers = self.layers
    self.layer_dims = {0: input_dim}

    self.lr = self.shared(self.default_lr, 'learning_rate', role=None)

    # Per-layer cost / activation / error containers.
    self.costs = costs = AttributeDict()
    self.costs.denois = AttributeDict()
    self.act = AttributeDict()
    self.error = AttributeDict()
    self.oos = AttributeDict()

    top = len(layers) - 1

    # Labeled examples occupy the first N rows of every concatenated
    # batch; these helpers split any variable back into (labeled,
    # unlabeled) halves.
    N = input_labeled.shape[0]
    self.join = lambda l, u: T.concatenate([l, u], axis=0)
    self.labeled = lambda x: x[:N] if x is not None else x
    self.unlabeled = lambda x: x[N:] if x is not None else x
    self.split_lu = lambda x: (self.labeled(x), self.unlabeled(x))

    input_concat = self.join(input_labeled, input_unlabeled)

    def encoder(input_, path_name, input_noise_std=0, noise_std=[]):
        # One encoder pass ('clean' or 'corr').  Records pre-activations
        # z, batch means m and variances s, and activations h per layer.
        h = input_
        logger.info(' 0: noise %g' % input_noise_std)
        if input_noise_std > 0.:
            h = h + self.noise_like(h) * input_noise_std
        d = AttributeDict()
        d.unlabeled = self.new_activation_dict()
        d.labeled = self.new_activation_dict()
        d.labeled.z[0] = self.labeled(h)
        d.unlabeled.z[0] = self.unlabeled(h)
        prev_dim = input_dim
        for i, (spec, _, act_f) in layers[1:]:
            d.labeled.h[i - 1], d.unlabeled.h[i - 1] = self.split_lu(h)
            # Per-layer corruption noise; 0 beyond the configured list.
            noise = noise_std[i] if i < len(noise_std) else 0.
            curr_dim, z, m, s, h = self.f(h, prev_dim, spec, i, act_f,
                                          path_name=path_name,
                                          noise_std=noise)
            # A layer's dimension must agree between encoder passes.
            assert self.layer_dims.get(i) in (None, curr_dim)
            self.layer_dims[i] = curr_dim
            d.labeled.z[i], d.unlabeled.z[i] = self.split_lu(z)
            d.unlabeled.s[i] = s
            d.unlabeled.m[i] = m
            prev_dim = curr_dim
        # Final activations of the topmost layer (last loop index i).
        d.labeled.h[i], d.unlabeled.h[i] = self.split_lu(h)
        return d

    # Clean, supervised
    logger.info('Encoder: clean, labeled')
    clean = self.act.clean = encoder(input_concat, 'clean')

    # Corrupted, supervised
    logger.info('Encoder: corr, labeled')
    corr = self.act.corr = encoder(input_concat, 'corr',
                                   input_noise_std=self.p.super_noise_std,
                                   noise_std=self.p.f_local_noise_std)
    est = self.act.est = self.new_activation_dict()

    # Decoder path in opposite order
    logger.info('Decoder: z_corr -> z_est')
    for i, ((_, spec), l_type, act_f) in layers[::-1]:
        z_corr = corr.unlabeled.z[i]
        z_clean = clean.unlabeled.z[i]
        z_clean_s = clean.unlabeled.s.get(i)
        z_clean_m = clean.unlabeled.m.get(i)
        # Spec of the layer above, if any, for the combinator g.
        fspec = layers[i+1][1][0] if len(layers) > i+1 else (None, None)

        if i == top:
            # Topmost layer reconstructs from the corrupted encoder output.
            ver = corr.unlabeled.h[i]
            ver_dim = self.layer_dims[i]
            top_g = True
        else:
            # Lower layers reconstruct from the estimate of the layer above.
            ver = est.z.get(i + 1)
            ver_dim = self.layer_dims.get(i + 1)
            top_g = False

        z_est = self.g(z_lat=z_corr,
                       z_ver=ver,
                       in_dims=ver_dim,
                       out_dims=self.layer_dims[i],
                       l_type=l_type,
                       num=i,
                       fspec=fspec,
                       top_g=top_g)

        if z_est is not None:
            # Denoising cost
            if z_clean_s and self.p.zestbn == 'bugfix':
                # Normalize the estimate with the clean path's batch
                # statistics before comparing.
                z_est_norm = (z_est - z_clean_m) / T.sqrt(
                    z_clean_s + np.float32(1e-10))
            elif z_clean_s is None or self.p.zestbn == 'no':
                z_est_norm = z_est
            else:
                assert False, 'Not supported path'

            se = SquaredError('denois' + str(i))
            # Mean over layer dimensions.
            costs.denois[i] = se.apply(z_est_norm.flatten(2),
                                       z_clean.flatten(2)) \
                / np.prod(self.layer_dims[i], dtype=floatX)
            costs.denois[i].name = 'denois' + str(i)
            denois_print = 'denois %.2f' % self.p.denoising_cost_x[i]
        else:
            denois_print = ''

        # Store references for later use
        est.h[i] = self.apply_act(z_est, act_f)
        est.z[i] = z_est
        est.s[i] = None
        est.m[i] = None
        logger.info(' g%d: %10s, %s, dim %s -> %s' % (
            i, l_type, denois_print, self.layer_dims.get(i+1),
            self.layer_dims.get(i)
        ))

    # Costs
    y = target_labeled.flatten()
    # Q: number of in-set classes (top layer width minus one).
    Q = int(self.layer_dims[top][0]) - 1
    logger.info('Q=%d'%Q)

    costs.class_clean = CategoricalCrossEntropyIV(
        Q=Q,
        alpha=self.p.alpha,
        beta=self.p.beta,
        dbeta=self.p.dbeta,
        gamma=self.p.gamma,
        gamma1=self.p.gamma1
    ).apply(y, clean.labeled.h[top])
    costs.class_clean.name = 'cost_class_clean'

    costs.class_corr = CategoricalCrossEntropyIV(
        Q=Q,
        alpha=self.p.alpha,
        beta=self.p.beta,
        dbeta=self.p.dbeta,
        gamma=self.p.gamma,
        gamma1=self.p.gamma1,
    ).apply(y, corr.labeled.h[top])
    costs.class_corr.name = 'cost_class_corr'

    # This will be used for training: corrupted classification cost plus
    # the weighted per-layer denoising costs.
    costs.total = costs.class_corr * 1.0
    for i in range(top + 1):
        if costs.denois.get(i) and self.p.denoising_cost_x[i] > 0:
            costs.total += costs.denois[i] * self.p.denoising_cost_x[i]

    if self.p.alpha_clean:
        # Entropy regularizer on the clean predictions of out-of-set /
        # unlabeled examples.
        y_true = y
        eps = np.float32(1e-6)
        # scale preds so that the class probas of each sample sum to 1
        y_pred = clean.labeled.h[top] + eps
        y_pred /= y_pred.sum(axis=-1, keepdims=True)
        y0 = T.or_(T.eq(y_true, 0), T.gt(y_true, Q))  # out-of-set or unlabeled
        y0sum = y0.sum() + eps  # number of oos
        cost1 = T.nnet.categorical_crossentropy(y_pred, y_pred)
        cost1 = T.dot(y0, cost1) / y0sum  # average cost per labeled example
        costs.total += self.p.alpha_clean * cost1

    costs.total.name = 'cost_total'

    # Classification error
    mr = MisclassificationRateIV(oos_thr=self.p.oos_thr)
    self.error.clean = mr.apply(y, clean.labeled.h[top]) * np.float32(100.)
    self.error.clean.name = 'error_rate_clean'
    oosr = OOSRateIV()
    self.oos.clean = oosr.apply(y, clean.labeled.h[top]) * np.float32(100.)
    self.oos.clean.name = 'oos_rate_clean'
#lstm.weights_init = Orthogonal() lstm.biases_init = Constant(0.) lstm.initialize() #ComputationGraph(encode.apply(x)).get_theano_function()(features_test)[0].shape #ComputationGraph(lstm.apply(encoded)).get_theano_function()(features_test) #ComputationGraph(decode.apply(hiddens[-1])).get_theano_function()(features_test)[0].shape #ComputationGraph(SquaredError().apply(y, y_hat.flatten())).get_theano_function()(features_test, targets_test)[0].shape encoded = encode.apply(x) #hiddens = lstm.apply(encoded, gates.apply(x)) hiddens = lstm.apply(encoded) y_hat = decode.apply(hiddens[-1]) cost = SquaredError().apply(y, y_hat) cost.name = 'cost' #ipdb.set_trace() #ComputationGraph(y_hat).get_theano_function()(features_test)[0].shape #ComputationGraph(cost).get_theano_function()(features_test, targets_test)[0].shape cg = ComputationGraph(cost) #cg = ComputationGraph(hiddens).get_theano_function() #ipdb.set_trace() algorithm = GradientDescent(cost=cost, params=cg.parameters, step_rule=CompositeRule( [StepClipping(5.0),