def _create_model(with_dropout):
    cg = ComputationGraph(gan.compute_losses(x, z))
    if with_dropout:
        inputs = VariableFilter(bricks=gan.discriminator.children[1:],
                                roles=[INPUT])(cg.variables)
        cg = apply_dropout(cg, inputs, 0.5)
        inputs = VariableFilter(bricks=[gan.discriminator],
                                roles=[INPUT])(cg.variables)
        cg = apply_dropout(cg, inputs, 0.2)
    return Model(cg.outputs)
def _create_model(with_dropout):
    cg = ComputationGraph(ali.compute_losses(x, z))
    if with_dropout:
        inputs = VariableFilter(
            bricks=([ali.discriminator.x_discriminator.layers[0],
                     ali.discriminator.z_discriminator.layers[0]]),
            roles=[INPUT])(cg.variables)
        cg = apply_dropout(cg, inputs, 0.2)
        inputs = VariableFilter(
            bricks=(ali.discriminator.x_discriminator.layers[2::3] +
                    ali.discriminator.z_discriminator.layers[2::2] +
                    ali.discriminator.joint_discriminator.layers[::2]),
            roles=[INPUT])(cg.variables)
        cg = apply_dropout(cg, inputs, 0.5)
    return Model(cg.outputs)
def train_base_model(self, train_data, test_data, input_dim):
    x = T.matrix('features')
    y = T.matrix('targets')
    mlp, cost, mis_cost = self.create_base_model(x, y, input_dim)

    cg = ComputationGraph([cost])
    inputs = VariableFilter(roles=[INPUT])(cg.variables)
    cg = apply_dropout(cg, inputs, 0.2)

    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Adam(learning_rate=0.001))
    data_stream = train_data
    data_stream_test = test_data

    monitor = DataStreamMonitoring(variables=[mis_cost],
                                   data_stream=data_stream_test,
                                   prefix="test")
    plot_ext = Plot('F1-measure',
                    channels=[['test_MisclassificationRate']],
                    after_batch=True)

    main_loop = MainLoop(data_stream=data_stream,
                         algorithm=algorithm,
                         extensions=[monitor,
                                     FinishAfter(after_n_epochs=50),
                                     Printing(),
                                     plot_ext])
    main_loop.run()
    return mlp
def build_model(images, labels):
    vgg = VGG(layer='conv4_4')
    vgg.push_initialization_config()
    vgg.initialize()

    tdb = top_direction_block()
    tdb.push_initialization_config()
    tdb.initialize()

    # Construct feedforward sequence
    ss_seq = FeedforwardSequence([vgg.apply, tdb.apply])
    ss_seq.push_initialization_config()
    ss_seq.initialize()

    prediction = ss_seq.apply(images)
    cost = StructuredCost().apply(
        labels, theano.tensor.clip(prediction, 1e-5, 1 - 1e-5))

    cg = ComputationGraph(cost)
    cg_dropout = apply_dropout(
        cg, [VariableFilter(roles=[OUTPUT])(cg.variables)[0]], .5)
    cost_dropout = cg_dropout.outputs[0]

    # define learned parameters
    selector = Selector([ss_seq])
    W = selector.get_parameters()
    parameters = []
    parameters += [v for k, v in W.items()]

    return cost_dropout, parameters
def build_mlp(features_cat, features_int, labels):
    mlp_int = MLP(activations=[Rectifier(), Rectifier()],
                  dims=[19, 50, 50],
                  weights_init=IsotropicGaussian(),
                  biases_init=Constant(0),
                  name='mlp_interval')
    mlp_int.initialize()
    mlp_cat = MLP(activations=[Logistic()],
                  dims=[320, 50],
                  weights_init=IsotropicGaussian(),
                  biases_init=Constant(0),
                  name='mlp_categorical')
    mlp_cat.initialize()

    mlp = MLP(activations=[Rectifier(), None],
              dims=[50, 50, 1],
              weights_init=IsotropicGaussian(),
              biases_init=Constant(0))
    mlp.initialize()

    gated = mlp_cat.apply(features_cat) * mlp_int.apply(features_int)
    prediction = mlp.apply(gated)
    cost = MAPECost().apply(prediction, labels)

    cg = ComputationGraph(cost)
    print cg.variables

    cg_dropout1 = apply_dropout(
        cg,
        [VariableFilter(roles=[OUTPUT])(cg.variables)[1],
         VariableFilter(roles=[OUTPUT])(cg.variables)[3]], .2)
    cost_dropout1 = cg_dropout1.outputs[0]

    return cost_dropout1, cg_dropout1.parameters, cost
def apply_dropout(self, dropout, variables=None):
    if dropout and dropout > 0:
        if variables is None:
            var_filter = VariableFilter(theano_name_regex='linear.*input_')
            variables = var_filter(self.cg.variables)
        self.cg = apply_dropout(self.cg, variables, dropout)
        self._cost = self.cg.outputs[0]
def build_mlp(features_int, features_cat, labels, labels_mean):
    inputs = tensor.concatenate([features_int, features_cat], axis=1)

    mlp = MLP(activations=[Rectifier(), Rectifier(), Rectifier(), None],
              dims=[337, 800, 1200, 1],
              weights_init=IsotropicGaussian(),
              biases_init=Constant(1))
    mlp.initialize()

    prediction = mlp.apply(inputs)
    cost = MAPECost().apply(prediction, labels, labels_mean)

    cg = ComputationGraph(cost)
    # cg_dropout0 = apply_dropout(cg, [VariableFilter(roles=[INPUT])(cg.variables)[1]], .2)
    cg_dropout1 = apply_dropout(
        cg,
        [VariableFilter(roles=[OUTPUT])(cg.variables)[1],
         VariableFilter(roles=[OUTPUT])(cg.variables)[3],
         VariableFilter(roles=[OUTPUT])(cg.variables)[5]], .2)
    cost_dropout1 = cg_dropout1.outputs[0]

    return cost_dropout1, cg_dropout1.parameters, cost  # cost, cg.parameters, cost
def build_mlp(features_car_cat, features_car_int, features_nocar_cat,
              features_nocar_int, features_cp, features_hascar, means, labels):
    features = tensor.concatenate([features_hascar,
                                   means['cp'][features_cp[:, 0]],
                                   means['dep'][features_cp[:, 1]]], axis=1)

    mlp = MLP(activations=[Rectifier(), Rectifier(), None],
              dims=[5, 50, 50, 1],
              weights_init=IsotropicGaussian(.1),
              biases_init=Constant(0),
              name='mlp')
    mlp.initialize()

    prediction = mlp.apply(features)
    cost = MAPECost().apply(labels, prediction)

    cg = ComputationGraph(cost)
    input_var = VariableFilter(roles=[INPUT])(cg.variables)
    print input_var

    cg_dropout1 = apply_dropout(cg, [input_var[3], input_var[5]], .4)
    cost_dropout1 = cg_dropout1.outputs[0]

    return prediction, cost_dropout1, cg_dropout1.parameters, cost
def build_mlp(features_car_cat, features_car_int, features_nocar_cat,
              features_nocar_int, features_cp, features_hascar, means, labels):
    prediction, _, _, _, = \
        build_mlp_onlyloc(features_car_cat, features_car_int,
                          features_nocar_cat, features_nocar_int,
                          features_cp, features_hascar, means, labels)

    mlp_crm = MLP(activations=[None],
                  dims=[1, 1],
                  weights_init=IsotropicGaussian(.1),
                  biases_init=Constant(0),
                  name='mlp_crm')
    mlp_crm.initialize()
    crm = features_nocar_int[:, 0][:, None]
    prediction = prediction * mlp_crm.apply(crm)

    cost = MAPECost().apply(labels, prediction)

    cg = ComputationGraph(cost)
    input_var = VariableFilter(roles=[INPUT])(cg.variables)
    print input_var

    cg_dropout = apply_dropout(cg, [input_var[7], input_var[5]], .4)
    cost_dropout = cg_dropout.outputs[0]

    return prediction, cost_dropout, cg_dropout.parameters, cost
def _apply_dropout(self, outputs, *args, **kwargs):
    variables = [self.word_embed.W, self.hashtag_embed.W]
    cgs = ComputationGraph(outputs)
    cg_dropouts = apply_dropout(cgs, variables,
                                drop_prob=self.config.dropout_prob,
                                seed=123).outputs
    return cg_dropouts
def build_mlp(features_car_cat, features_car_int, features_nocar_cat,
              features_nocar_int, features_cp, features_hascar, means, labels):
    mlp_car = MLP(activations=[Rectifier(), Rectifier(), None],
                  dims=[8 + 185, 200, 200, 1],
                  weights_init=IsotropicGaussian(.1),
                  biases_init=Constant(0),
                  name='mlp_interval_car')
    mlp_car.initialize()
    mlp_nocar = MLP(activations=[Rectifier(), Rectifier(), None],
                    dims=[5 + 135, 200, 200, 1],
                    weights_init=IsotropicGaussian(.1),
                    biases_init=Constant(0),
                    name='mlp_interval_nocar')
    mlp_nocar.initialize()

    feature_car = tensor.concatenate((features_car_cat, features_car_int),
                                     axis=1)
    feature_nocar = tensor.concatenate(
        (features_nocar_cat, features_nocar_int), axis=1)

    prediction = mlp_nocar.apply(feature_nocar)
    # gating with the last feature : does the dude own a car
    prediction += (tensor.addbroadcast(features_hascar, 1) *
                   mlp_car.apply(feature_car))

    prediction_loc, _, _, _, = \
        build_mlp_onlyloc(features_car_cat, features_car_int,
                          features_nocar_cat, features_nocar_int,
                          features_cp, features_hascar, means, labels)
    prediction += prediction_loc

    # add crm
    mlp_crm = MLP(activations=[None],
                  dims=[1, 1],
                  weights_init=IsotropicGaussian(.1),
                  biases_init=Constant(0),
                  name='mlp_crm')
    mlp_crm.initialize()
    crm = features_nocar_int[:, 0][:, None]
    prediction = prediction * mlp_crm.apply(crm)

    cost = MAPECost().apply(labels, prediction)

    cg = ComputationGraph(cost)
    input_var = VariableFilter(roles=[INPUT])(cg.variables)
    print input_var

    cg_dropout1 = apply_dropout(cg, [input_var[6], input_var[7]], .4)
    cost_dropout1 = cg_dropout1.outputs[0]

    return prediction, cost_dropout1, cg_dropout1.parameters, cost
def test_apply_dropout_custom_divisor():
    x = tensor.vector()
    y = tensor.vector()
    z = x - y
    cg = ComputationGraph([z])
    scaled_dropped_cg = apply_dropout(cg, [y], 0.8, seed=2, custom_divisor=2.5)

    x_ = numpy.array([9., 8., 9.], dtype=theano.config.floatX)
    y_ = numpy.array([4., 5., 6.], dtype=theano.config.floatX)

    assert_allclose(
        scaled_dropped_cg.outputs[0].eval({x: x_, y: y_}),
        x_ - (y_ * MRG_RandomStreams(2).binomial((3,), p=0.2).eval() / 2.5))
def test_apply_dropout():
    x = tensor.vector()
    y = tensor.vector()
    z = x * y
    cg = ComputationGraph([z])
    dropped_cg = apply_dropout(cg, [x], 0.4, seed=1)

    x_ = numpy.array([5., 6., 7.], dtype=theano.config.floatX)
    y_ = numpy.array([1., 2., 3.], dtype=theano.config.floatX)

    assert_allclose(
        dropped_cg.outputs[0].eval({x: x_, y: y_}),
        x_ * y_ * MRG_RandomStreams(1).binomial((3,), p=0.6).eval() / 0.6)
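These two tests pin down the semantics of apply_dropout: the chosen variable is multiplied by a Bernoulli mask with keep probability 1 - drop_prob and, unless custom_divisor is given, rescaled by 1 / (1 - drop_prob), so the original graph can be reused unchanged at evaluation time. A minimal sketch of that train/test split, using only Theano and Blocks as in the tests above (the graph and variable names are illustrative, not from any of the projects listed here):

import theano
from theano import tensor
from blocks.graph import ComputationGraph, apply_dropout

x = tensor.vector('x')
cg = ComputationGraph([2.0 * x])
dropped_cg = apply_dropout(cg, [x], 0.5, seed=1)

# Train on the stochastic, rescaled graph; evaluate on the original graph.
# No extra rescaling is needed at test time because kept units were already
# divided by (1 - drop_prob) in the dropped graph.
train_fn = theano.function([x], dropped_cg.outputs[0])
test_fn = theano.function([x], cg.outputs[0])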
def dropout(cg):
    """Create dropout computation graph.

    Parameters
    ----------
    cg : ComputationGraph
        origin computation graph

    Returns
    -------
    dropout_cg : ComputationGraph
        dropped out computation graph
    """
    inputs = VariableFilter(roles=[INPUT])(cg.variables)
    dropout_cg = apply_dropout(cg, inputs, 0.5)
    return dropout_cg
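A hedged usage sketch of this helper: the MLP, cost, and variable names below are illustrative rather than taken from the original project. The clean graph is built once, the dropped graph is derived from it for training, and the clean outputs remain available for monitoring.

from theano import tensor
from blocks.bricks import MLP, Rectifier, Softmax
from blocks.bricks.cost import CategoricalCrossEntropy
from blocks.graph import ComputationGraph
from blocks.initialization import IsotropicGaussian, Constant

x = tensor.matrix('features')
y = tensor.lmatrix('targets')
mlp = MLP([Rectifier(), Softmax()], [784, 100, 10],
          weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
mlp.initialize()
cost = CategoricalCrossEntropy().apply(y.flatten(), mlp.apply(x))

cg = ComputationGraph([cost])
train_cg = dropout(cg)      # every INPUT-role variable dropped with p=0.5
train_cost = train_cg.outputs[0]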
def create_training_computation_graphs():
    x = tensor.tensor4('features')
    y = tensor.imatrix('targets')

    convnet, mlp = create_model_bricks()
    y_hat = mlp.apply(convnet.apply(x).flatten(ndim=2))
    cost = BinaryCrossEntropy().apply(y, y_hat)
    accuracy = 1 - tensor.neq(y > 0.5, y_hat > 0.5).mean()
    cg = ComputationGraph([cost, accuracy])

    # Create a graph which uses batch statistics for batch normalization
    # as well as dropout on selected variables
    bn_cg = apply_batch_normalization(cg)
    bricks_to_drop = ([convnet.layers[i] for i in (5, 11, 17)] +
                      [mlp.application_methods[1].brick])
    variables_to_drop = VariableFilter(
        roles=[OUTPUT], bricks=bricks_to_drop)(bn_cg.variables)
    bn_dropout_cg = apply_dropout(bn_cg, variables_to_drop, 0.5)

    return cg, bn_dropout_cg
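A training loop built on a graph like this one usually also needs updates for the batch-normalization population statistics. A hedged sketch of that step, assuming Blocks' get_batch_normalization_updates and an illustrative decay constant of 0.05:

from blocks.graph import get_batch_normalization_updates

cg, bn_dropout_cg = create_training_computation_graphs()
# Pairs of (population statistic, minibatch statistic) collected from the
# batch-normalized training graph, turned into exponential moving averages.
pop_updates = get_batch_normalization_updates(bn_dropout_cg)
extra_updates = [(pop, 0.05 * batch + 0.95 * pop)
                 for pop, batch in pop_updates]
# extra_updates would then be attached to the training algorithm,
# e.g. via GradientDescent's add_updates().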
def __init__(self, config, vocab_size): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='question_embed') embed.weights_init = IsotropicGaussian(0.01) # Calculate question encoding (concatenate layer1) qembed = embed.apply(question) qlstms, qhidden_list = make_bidir_lstm_stack( qembed, config.embed_size, question_mask.astype(theano.config.floatX), config.question_lstm_size, config.question_skip_connections, 'q') bricks = bricks + qlstms if config.question_skip_connections: qenc_dim = 2 * sum(config.question_lstm_size) qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list], axis=1) else: qenc_dim = 2 * config.question_lstm_size[-1] qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]], axis=1) qenc.name = 'qenc' # Calculate context encoding (concatenate layer1) cembed = embed.apply(context) clstms, chidden_list = make_bidir_lstm_stack( cembed, config.embed_size, context_mask.astype(theano.config.floatX), config.ctx_lstm_size, config.ctx_skip_connections, 'ctx') bricks = bricks + clstms if config.ctx_skip_connections: cenc_dim = 2 * sum(config.ctx_lstm_size) #2 : fw & bw cenc = tensor.concatenate(chidden_list, axis=2) else: cenc_dim = 2 * config.question_lstm_size[-1] cenc = tensor.concatenate(chidden_list[-2:], axis=2) cenc.name = 'cenc' # Attention mechanism MLP attention_mlp = MLP(dims=config.attention_mlp_hidden + [1], activations=config.attention_mlp_activations[1:] + [Identity()], name='attention_mlp') attention_qlinear = Linear(input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0], name='attq') attention_clinear = Linear(input_dim=cenc_dim, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attc') bricks += [attention_mlp, attention_qlinear, attention_clinear] layer1 = Tanh().apply( attention_clinear.apply( cenc.reshape((cenc.shape[0] * cenc.shape[1], cenc.shape[2] ))).reshape((cenc.shape[0], cenc.shape[1], config.attention_mlp_hidden[0])) + attention_qlinear.apply(qenc)[None, :, :]) layer1.name = 'layer1' att_weights = attention_mlp.apply( layer1.reshape( (layer1.shape[0] * layer1.shape[1], layer1.shape[2]))) att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1])) att_weights = tensor.nnet.sigmoid(att_weights.T).T att_weights.name = 'att_weights' att_target = tensor.eq( tensor.tile(answer[None, :, :], (context.shape[0], 1, 1)), tensor.tile(context[:, None, :], (1, answer.shape[0], 1))).sum(axis=1).clip(0, 1) cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) * context_mask).sum() / context_mask.sum() self.predictions = tensor.gt(att_weights, 0.1) * context # Apply dropout cg = ComputationGraph([cost]) if config.w_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.w_noise) if config.dropout > 0: cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout) [cost_reg] = cg.outputs # Other stuff cost.name = 'cost' cost_reg.name = 'cost_reg' self.sgd_cost = cost_reg self.monitor_vars = [[cost_reg]] self.monitor_vars_valid = [[cost_reg]] # 
    # Initialize bricks
    embed.initialize()
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()
def __init__(self, config, vocab_size): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') ans_indices = tensor.imatrix('ans_indices') # n_steps * n_samples ans_indices_mask = tensor.imatrix('ans_indices_mask') context_bag = tensor.eq(context[:, :, None], tensor.arange(vocab_size)).sum(axis=1).clip( 0, 1) bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) ans_indices = ans_indices.dimshuffle(1, 0) ans_indices_mask = ans_indices_mask.dimshuffle(1, 0) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='question_embed') embed.weights_init = IsotropicGaussian(0.01) # embeddings_initial_value = init_embedding_table(filename='embeddings/vocab_embeddings.txt') # embed.weights_init = Constant(embeddings_initial_value) # Calculate question encoding (concatenate layer1) qembed = embed.apply(question) qlstms, qhidden_list = make_bidir_lstm_stack( qembed, config.embed_size, question_mask.astype(theano.config.floatX), config.question_lstm_size, config.question_skip_connections, 'q') bricks = bricks + qlstms if config.question_skip_connections: qenc_dim = 2 * sum(config.question_lstm_size) qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list], axis=1) else: qenc_dim = 2 * config.question_lstm_size[-1] qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]], axis=1) qenc.name = 'qenc' #embed size: 200, lstm_size = 256 #qenc: length * batch_size * (2*lstm_size) # Calculate context encoding (concatenate layer1) cembed = embed.apply(context) cqembed = tensor.concatenate( [ cembed, tensor.extra_ops.repeat( qenc[None, :, :], cembed.shape[0], axis=0) ], axis=2 ) #length * batch_size * (embed+2*lstm_size) this is what goes into encoder clstms, chidden_list = make_bidir_lstm_stack( cqembed, config.embed_size + qenc_dim, context_mask.astype(theano.config.floatX), config.ctx_lstm_size, config.ctx_skip_connections, 'ctx') bricks = bricks + clstms if config.ctx_skip_connections: cenc_dim = 2 * sum(config.ctx_lstm_size) #2 : fw & bw cenc = tensor.concatenate(chidden_list, axis=2) else: cenc_dim = 2 * config.question_lstm_size[-1] cenc = tensor.concatenate(chidden_list[-2:], axis=2) cenc.name = 'cenc' #cenc: length * batch_size * (2*lstm_size) #pointer networks decoder LSTM and Attention parameters params = init_params(data_dim=config.decoder_data_dim, lstm_dim=config.decoder_lstm_output_dim) tparams = init_tparams(params) self.theano_params = [] add_role(tparams['lstm_de_W'], WEIGHT) add_role(tparams['lstm_de_U'], WEIGHT) add_role(tparams['lstm_de_b'], BIAS) add_role(tparams['ptr_v'], WEIGHT) add_role(tparams['ptr_W1'], WEIGHT) add_role(tparams['ptr_W2'], WEIGHT) self.theano_params = tparams.values() # for p in tparams.values(): # add_role(p, WEIGHT) # self.theano_params.append(p) #n_steps = length , n_samples = batch_size n_steps = ans_indices.shape[0] n_samples = ans_indices.shape[1] preds, generations = ptr_network( tparams, cqembed, context_mask.astype(theano.config.floatX), ans_indices, ans_indices_mask.astype(theano.config.floatX), config.decoder_lstm_output_dim, cenc) self.generations = generations idx_steps = tensor.outer(tensor.arange(n_steps, dtype='int64'), 
tensor.ones((n_samples, ), dtype='int64')) idx_samples = tensor.outer(tensor.ones((n_steps, ), dtype='int64'), tensor.arange(n_samples, dtype='int64')) probs = preds[idx_steps, ans_indices, idx_samples] # probs *= y_mask off = 1e-8 if probs.dtype == 'float16': off = 1e-6 # probs += (1 - y_mask) # change unmasked position to 1, since log(1) = 0 probs += off # probs_printed = theano.printing.Print('this is probs')(probs) cost = -tensor.log(probs) cost *= ans_indices_mask cost = cost.sum(axis=0) / ans_indices_mask.sum(axis=0) cost = cost.mean() # Apply dropout cg = ComputationGraph([cost]) if config.w_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.w_noise) if config.dropout > 0: cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout) [cost_reg] = cg.outputs # Other stuff cost.name = 'cost' cost_reg.name = 'cost_reg' self.sgd_cost = cost_reg self.monitor_vars = [[cost_reg]] self.monitor_vars_valid = [[cost_reg]] # Initialize bricks embed.initialize() for brick in bricks: brick.weights_init = config.weights_init brick.biases_init = config.biases_init brick.initialize()
def main(name, epochs, batch_size, learning_rate, dim, mix_dim, old_model_name, max_length, bokeh, GRU, dropout, depth, max_grad, step_method, epsilon, sample, skip, uniform, top): #---------------------------------------------------------------------- datasource = name def shnum(x): """ Convert a positive float into a short tag-usable string E.g.: 0 -> 0, 0.005 -> 53, 100 -> 1-2 """ return '0' if x <= 0 else '%s%d' % (("%e"%x)[0], -np.floor(np.log10(x))) jobname = "%s-%dX%dm%dd%dr%sb%de%s" % (datasource, depth, dim, mix_dim, int(dropout*10), shnum(learning_rate), batch_size, shnum(epsilon)) if max_length != 600: jobname += '-L%d'%max_length if GRU: jobname += 'g' if max_grad != 5.: jobname += 'G%g'%max_grad if step_method != 'adam': jobname += step_method if skip: jobname += 'D' assert depth > 1 if top: jobname += 'T' assert depth > 1 if uniform > 0.: jobname += 'u%d'%int(uniform*100) if debug: jobname += ".debug" if sample: print("Sampling") else: print("\nRunning experiment %s" % jobname) if old_model_name: print("starting from model %s"%old_model_name) #---------------------------------------------------------------------- transitions = [GatedRecurrent(dim=dim) if GRU else LSTM(dim=dim) for _ in range(depth)] if depth > 1: transition = RecurrentStack(transitions, name="transition", fast=True, skip_connections=skip or top) if skip: source_names=['states'] + ['states_%d'%d for d in range(1,depth)] else: source_names=['states_%d'%(depth-1)] else: transition = transitions[0] transition.name = "transition" source_names=['states'] emitter = SketchEmitter(mix_dim=mix_dim, epsilon=epsilon, name="emitter") readout = Readout( readout_dim=emitter.get_dim('inputs'), source_names=source_names, emitter=emitter, name="readout") normal_inputs = [name for name in transition.apply.sequences if 'mask' not in name] fork = Fork(normal_inputs, prototype=Linear(use_bias=True)) generator = SequenceGenerator(readout=readout, transition=transition, fork=fork) # Initialization settings if uniform > 0.: generator.weights_init = Uniform(width=uniform*2.) else: generator.weights_init = OrthogonalGlorot() generator.biases_init = Constant(0) # Build the cost computation graph [steps, batch_size, 3] x = T.tensor3('features', dtype=floatX) if debug: x.tag.test_value = np.ones((max_length,batch_size,3)).astype(floatX) x = x[:max_length,:,:] # has to be after setting test_value cost = generator.cost(x) cost.name = "sequence_log_likelihood" # Give an idea of what's going on model = Model(cost) params = model.get_params() logger.info("Parameters:\n" + pprint.pformat( [(key, value.get_value().shape) for key, value in params.items()], width=120)) model_size = 0 for v in params.itervalues(): s = v.get_value().shape model_size += s[0] * (s[1] if len(s) > 1 else 1) logger.info("Total number of parameters %d"%model_size) #------------------------------------------------------------ extensions = [] if old_model_name == 'continue': extensions.append(LoadFromDump(jobname)) elif old_model_name: # or you can just load the weights without state using: old_params = LoadFromDump(old_model_name).manager.load_parameters() model.set_param_values(old_params) else: # Initialize parameters for brick in model.get_top_bricks(): brick.initialize() if sample: assert old_model_name and old_model_name != 'continue' Sample(generator, steps=max_length, path=old_model_name).do(None) exit(0) #------------------------------------------------------------ # Define the training algorithm. 
cg = ComputationGraph(cost) if dropout > 0.: from blocks.roles import INPUT, OUTPUT dropout_target = VariableFilter(roles=[OUTPUT], bricks=transitions, name_regex='states')(cg.variables) print('# dropout %d' % len(dropout_target)) cg = apply_dropout(cg, dropout_target, dropout) opt_cost = cg.outputs[0] else: opt_cost = cost if step_method == 'adam': step_rule = Adam(learning_rate) elif step_method == 'rmsprop': step_rule = RMSProp(learning_rate, decay_rate=0.95) elif step_method == 'adagrad': step_rule = AdaGrad(learning_rate) elif step_method == 'adadelta': step_rule = AdaDelta() elif step_method == 'scale': step_rule = Scale(learning_rate) else: raise Exception('Unknown sttep method %s'%step_method) step_rule = CompositeRule([StepClipping(max_grad), step_rule]) algorithm = GradientDescent( cost=opt_cost, params=cg.parameters, step_rule=step_rule) #------------------------------------------------------------ observables = [cost] # Fetch variables useful for debugging (energies,) = VariableFilter( applications=[generator.readout.readout], name_regex="output")(cg.variables) min_energy = named_copy(energies.min(), "min_energy") max_energy = named_copy(energies.max(), "max_energy") observables += [min_energy, max_energy] # (activations,) = VariableFilter( # applications=[generator.transition.apply], # name=generator.transition.apply.states[0])(cg.variables) # mean_activation = named_copy(abs(activations).mean(), # "mean_activation") # observables.append(mean_activation) observables += [algorithm.total_step_norm, algorithm.total_gradient_norm] for name, param in params.items(): observables.append(named_copy( param.norm(2), name + "_norm")) observables.append(named_copy( algorithm.gradients[param].norm(2), name + "_grad_norm")) #------------------------------------------------------------ datasource_fname = os.path.join(fuel.config.data_path, datasource, datasource+'.hdf5') train_ds = H5PYDataset(datasource_fname, #max_length=max_length, which_set='train', sources=('features',), load_in_memory=True) train_stream = DataStream(train_ds, iteration_scheme=ShuffledScheme( train_ds.num_examples, batch_size)) test_ds = H5PYDataset(datasource_fname, #max_length=max_length, which_set='test', sources=('features',), load_in_memory=True) test_stream = DataStream(test_ds, iteration_scheme=SequentialScheme( test_ds.num_examples, batch_size)) train_stream = Mapping(train_stream, _transpose) test_stream = Mapping(test_stream, _transpose) def stream_stats(ds, label): itr = ds.get_epoch_iterator(as_dict=True) batch_count = 0 examples_count = 0 for batch in itr: batch_count += 1 examples_count += batch['features'].shape[1] print('%s #batch %d #examples %d' % (label, batch_count, examples_count)) stream_stats(train_stream, 'train') stream_stats(test_stream, 'test') extensions += [Timing(every_n_batches=10), TrainingDataMonitoring( observables, prefix="train", every_n_batches=10), DataStreamMonitoring( [cost], # without dropout test_stream, prefix="test", on_resumption=True, after_epoch=False, # by default this is True every_n_batches=100), # all monitored data is ready so print it... 
# (next steps may take more time and we want to see the # results as soon as possible so print as soon as you can) Printing(every_n_batches=10), # perform multiple dumps at different intervals # so if one of them breaks (has nan) we can hopefully # find a model from few batches ago in the other Dump(jobname, every_n_batches=11), Dump(jobname+'.test', every_n_batches=100), Sample(generator, steps=max_length, path=jobname+'.test', every_n_batches=100), ProgressBar(), FinishAfter(after_n_epochs=epochs) # This shows a way to handle NaN emerging during # training: simply finish it. .add_condition("after_batch", _is_nan), ] if bokeh: from blocks.extensions.plot import Plot extensions.append(Plot( 'sketch', channels=[ ['cost'],])) # Construct the main loop and start training! main_loop = MainLoop( model=model, data_stream=train_stream, algorithm=algorithm, extensions=extensions ) main_loop.run()
## initialize the model
dpm = model.DiffusionModel(spatial_width, n_colors,
                           uniform_noise=uniform_noise, **model_args)
dpm.initialize()

## set up optimization
features = T.matrix('features', dtype=theano.config.floatX)
cost = dpm.cost(features)
blocks_model = blocks.model.Model(cost)
cg_nodropout = ComputationGraph(cost)
if args.dropout_rate > 0:
    # DEBUG this triggers an error on my machine
    # apply dropout to all the input variables
    inputs = VariableFilter(roles=[INPUT])(cg_nodropout.variables)
    # dropconnect
    # inputs = VariableFilter(roles=[PARAMETER])(cg_nodropout.variables)
    cg = apply_dropout(cg_nodropout, inputs, args.dropout_rate)
else:
    cg = cg_nodropout
step_compute = RMSProp(learning_rate=args.lr, max_scaling=1e10)
algorithm = GradientDescent(step_rule=CompositeRule([RemoveNotFinite(),
                                                     step_compute]),
                            parameters=cg.parameters, cost=cost)
extension_list = []
extension_list.append(
    SharedVariableModifier(step_compute.learning_rate,
                           extensions.decay_learning_rate,
                           after_batch=False,
                           every_n_batches=batches_per_epoch))
extension_list.append(FinishAfter(after_n_epochs=100001))

## logging of test set performance
predict = top_mlp.apply(conv_out)

# ---------------------------------------------------------------
# Building computational graph
# ---------------------------------------------------------------
cost = CategoricalCrossEntropy().apply(y.flatten(), predict).copy(name='cost')
error = MisclassificationRate().apply(y.flatten(), predict)
error_rate = error.copy(name='error_rate')
error_rate2 = error.copy(name='error_rate2')

cg = ComputationGraph([cost, error_rate])
inputs = VariableFilter(roles=[INPUT])(cg.variables)
linear_inputs_index = [-10, -8, 6]
linear_inputs = list(itemgetter(*linear_inputs_index)(inputs))
cg_dropout = apply_dropout(cg, linear_inputs, 0.5)

# ---------------------------------------------------------------
# Set ports listeners for Fuel data servers
# ---------------------------------------------------------------
data_valid_stream = ServerDataStream(('image_features', 'targets'), False,
                                     port=3040)
data_train_stream = ServerDataStream(('image_features', 'targets'), False,
                                     port=3041)

# ---------------------------------------------------------------
# Training settings
# ---------------------------------------------------------------
# algorithm = GradientDescent(cost=cost, parameters=cg_dropout.parameters,
#                             step_rule=Adam())
algorithm = GradientDescent(cost=cost, parameters=cg_dropout.parameters,
                            step_rule=Scale(learning_rate=learning_rate))

save_to = 'Glorot__overfeat_4conv_1full_bn.pkl'
def run_training(config, tr_stream, dev_stream=None, use_bokeh=True): # Monitoring extensions try: from blocks_extras.extensions.plot import Plot BOKEH_AVAILABLE = True except ImportError: BOKEH_AVAILABLE = False print('Bokeh avalablity: ' + str(BOKEH_AVAILABLE)) logger = logging.getLogger(__name__) # Create Theano variables logger.info('Creating theano variables') x = T.tensor3('features', dtype=config.params['data_dtype']) x_mask = T.tensor3('features_mask', dtype=config.params['mask_dtype']) y = T.matrix('targets', dtype=config.params['data_dtype']) y_mask = T.matrix('targets_mask', dtype=config.params['mask_dtype']) # Construct model logger.info('Building baseline model') baseline_model = BaselineModel(config.params) baseline_model.initialize() cost = baseline_model.cost(subword_id_input_=x, subword_id_input_mask_=x_mask, subword_id_target_=y, subword_id_target_mask_=y_mask) logger.info('Creating computational graph') cg = ComputationGraph(cost) # apply dropout for regularization if config.params['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog logger.info('Applying dropout') dropout_inputs = [x for x in cg.intermediary_variables if x.name == 'maxout_apply_output'] print(cg.intermediary_variables) print(cg.variables) print(cg.inputs) print(cg.parameters) print(dropout_inputs) cg = apply_dropout(cg, dropout_inputs, config.params['dropout']) logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config.params['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True) #CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])] ] # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( Plot('Baseline model', channels=[['baselinemodel_cost_cost']], after_batch=True)) # Set up training algorithm logger.info("Initializing training algorithm") algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=CompositeRule([StepClipping(config.params['step_clipping']), eval(config.params['step_rule'])()]) ) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop( model=baseline_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions ) # Train main_loop.run() print('DONE TRAINING')
def main(job_id, params, config_file='params.ec'): config = ConfigParser.ConfigParser() config.readfp(open('./configs/{}'.format(config_file))) pr = pprint.PrettyPrinter(indent=4) pr.pprint(config) net_name = config.get('hyperparams', 'net_name', 'adni') struct_name = net_name.split('_')[0] max_epoch = int(config.get('hyperparams', 'max_iter', 100)) base_lr = float(config.get('hyperparams', 'base_lr', 0.01)) train_batch = int(config.get('hyperparams', 'train_batch', 256)) valid_batch = int(config.get('hyperparams', 'valid_batch', 512)) test_batch = int(config.get('hyperparams', 'valid_batch', 512)) W_sd = float(config.get('hyperparams', 'W_sd', 0.01)) W_mu = float(config.get('hyperparams', 'W_mu', 0.0)) b_sd = float(config.get('hyperparams', 'b_sd', 0.01)) b_mu = float(config.get('hyperparams', 'b_mu', 0.0)) hidden_units = int(config.get('hyperparams', 'hidden_units', 32)) input_dropout_ratio = float(config.get('hyperparams', 'input_dropout_ratio', 0.2)) dropout_ratio = float(config.get('hyperparams', 'dropout_ratio', 0.2)) weight_decay = float(config.get('hyperparams', 'weight_decay', 0.001)) max_norm = float(config.get('hyperparams', 'max_norm', 100.0)) solver = config.get('hyperparams', 'solver_type', 'rmsprop') data_file = config.get('hyperparams', 'data_file') side = config.get('hyperparams', 'side', 'b') input_dim = input_dims[struct_name] # Spearmint optimization parameters: if params: base_lr = float(params['base_lr'][0]) dropout_ratio = float(params['dropout_ratio'][0]) hidden_units = params['hidden_units'][0] weight_decay = params['weight_decay'][0] if 'adagrad' in solver: solver_type = CompositeRule([AdaGrad(learning_rate=base_lr), VariableClipping(threshold=max_norm)]) else: solver_type = CompositeRule([RMSProp(learning_rate=base_lr), VariableClipping(threshold=max_norm)]) data_file = config.get('hyperparams', 'data_file') if 'b' in side: train = H5PYDataset(data_file, which_set='train') valid = H5PYDataset(data_file, which_set='valid') test = H5PYDataset(data_file, which_set='test') x_l = tensor.matrix('l_features') x_r = tensor.matrix('r_features') x = tensor.concatenate([x_l, x_r], axis=1) else: train = H5PYDataset(data_file, which_set='train', sources=['{}_features'.format(side), 'targets']) valid = H5PYDataset(data_file, which_set='valid', sources=['{}_features'.format(side), 'targets']) test = H5PYDataset(data_file, which_set='test', sources=['{}_features'.format(side), 'targets']) x = tensor.matrix('{}_features'.format(side)) y = tensor.lmatrix('targets') # Define a feed-forward net with an input, two hidden layers, and a softmax output: model = MLP(activations=[ Rectifier(name='h1'), Rectifier(name='h2'), Softmax(name='output'), ], dims=[ input_dim[side], hidden_units, hidden_units, 2], weights_init=IsotropicGaussian(std=W_sd, mean=W_mu), biases_init=IsotropicGaussian(b_sd, b_mu)) # Don't forget to initialize params: model.initialize() # y_hat is the output of the neural net with x as its inputs y_hat = model.apply(x) # Define a cost function to optimize, and a classification error rate. 
# Also apply the outputs from the net and corresponding targets: cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat) error = MisclassificationRate().apply(y.flatten(), y_hat) error.name = 'error' # This is the model: before applying dropout model = Model(cost) # Need to define the computation graph for the cost func: cost_graph = ComputationGraph([cost]) # This returns a list of weight vectors for each layer W = VariableFilter(roles=[WEIGHT])(cost_graph.variables) # Add some regularization to this model: cost += weight_decay * l2_norm(W) cost.name = 'entropy' # computational graph with l2 reg cost_graph = ComputationGraph([cost]) # Apply dropout to inputs: inputs = VariableFilter([INPUT])(cost_graph.variables) dropout_inputs = [input for input in inputs if input.name.startswith('linear_')] dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]], input_dropout_ratio) dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:], dropout_ratio) dropout_cost = dropout_graph.outputs[0] dropout_cost.name = 'dropout_entropy' # Learning Algorithm (notice: we use the dropout cost for learning): algo = GradientDescent( step_rule=solver_type, params=dropout_graph.parameters, cost=dropout_cost) # algo.step_rule.learning_rate.name = 'learning_rate' # Data stream used for training model: training_stream = Flatten( DataStream.default_stream( dataset=train, iteration_scheme=ShuffledScheme( train.num_examples, batch_size=train_batch))) training_monitor = TrainingDataMonitoring([dropout_cost, aggregation.mean(error), aggregation.mean(algo.total_gradient_norm)], after_batch=True) # Use the 'valid' set for validation during training: validation_stream = Flatten( DataStream.default_stream( dataset=valid, iteration_scheme=ShuffledScheme( valid.num_examples, batch_size=valid_batch))) validation_monitor = DataStreamMonitoring( variables=[cost, error], data_stream=validation_stream, prefix='validation', after_epoch=True) test_stream = Flatten( DataStream.default_stream( dataset=test, iteration_scheme=ShuffledScheme( test.num_examples, batch_size=test_batch))) test_monitor = DataStreamMonitoring( variables=[error], data_stream=test_stream, prefix='test', after_training=True) plotting = Plot('{}_{}'.format(net_name, side), channels=[ ['dropout_entropy'], ['error', 'validation_error'], ], after_batch=False) # Checkpoint class used to save model and log: stamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H:%M') checkpoint = Checkpoint('./models/{}/{}/{}'.format(struct_name, side, stamp), save_separately=['model', 'log'], every_n_epochs=1) # Home-brewed class for early stopping when we detect we have started to overfit: # And by that I mean if the means of the val error and training error over the # previous 'epochs' is greater than the 'threshold', we are overfitting. early_stopper = FinishIfOverfitting(error_name='error', validation_name='validation_error', threshold=0.05, epochs=5, burn_in=100) # The main loop will train the network and output reports, etc main_loop = MainLoop( data_stream=training_stream, model=model, algorithm=algo, extensions=[ validation_monitor, training_monitor, plotting, FinishAfter(after_n_epochs=max_epoch), early_stopper, Printing(), ProgressBar(), checkpoint, test_monitor, ]) main_loop.run() ve = float(main_loop.log.last_epoch_row['validation_error']) te = float(main_loop.log.last_epoch_row['error']) spearmint_loss = ve + abs(te - ve) print 'Spearmint Loss: {}'.format(spearmint_loss) return spearmint_loss
def main(mode, config, use_bokeh=False): # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder( config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) decoder = Decoder( config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2) if mode == "train": # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') sampling_input = tensor.lmatrix('input') # Get training and development set streams tr_stream = get_tr_stream(**config) dev_stream = get_dev_stream(**config) # Get cost of the model cost = decoder.cost( encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask) logger.info('Creating computational graph') cg = ComputationGraph(cost) # Initialize model logger.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog logger.info('Applying dropout') dropout_inputs = [x for x in cg.intermediary_variables if x.name == 'maxout_apply_output'] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # Apply weight noise for regularization if config['weight_noise_ff'] > 0.0: logger.info('Applying weight noise to ff layers') enc_params = Selector(encoder.lookup).get_params().values() enc_params += Selector(encoder.fwd_fork).get_params().values() enc_params += Selector(encoder.back_fork).get_params().values() dec_params = Selector( decoder.sequence_generator.readout).get_params().values() dec_params += Selector( decoder.sequence_generator.fork).get_params().values() dec_params += Selector(decoder.state_init).get_params().values() cg = apply_noise( cg, enc_params+dec_params, config['weight_noise_ff']) # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge(Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.items(): logger.info(' {:15}: {}'.format(value.get_value().shape, name)) logger.info("Total number of parameters: {}" .format(len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] # Set up beam search and sampling computation graphs if necessary if config['hook_samples'] >= 1 or config['bleu_script'] is not None: logger.info("Building sampling model") sampling_representation = encoder.apply( sampling_input, tensor.ones(sampling_input.shape)) generated 
= decoder.generate( sampling_input, sampling_representation) search_model = Model(generated) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # Add sampling if config['hook_samples'] >= 1: logger.info("Building sampler") extensions.append( Sampler(model=search_model, data_stream=tr_stream, hook_samples=config['hook_samples'], every_n_batches=config['sampling_freq'], src_vocab_size=config['src_vocab_size'])) # Add early stopping based on bleu if config['bleu_script'] is not None: logger.info("Building bleu validator") extensions.append( BleuValidator(sampling_input, samples=samples, config=config, model=search_model, data_stream=dev_stream, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( Plot('Cs-En', channels=[['decoder_cost_cost']], after_batch=True)) # Set up training algorithm logger.info("Initializing training algorithm") algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=CompositeRule([StepClipping(config['step_clipping']), eval(config['step_rule'])()]) ) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop( model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions ) # Train! main_loop.run() elif mode == 'translate': # Create Theano variables logger.info('Creating theano variables') sampling_input = tensor.lmatrix('source') # Get test set stream test_stream = get_dev_stream( config['test_set'], config['src_vocab'], config['src_vocab_size'], config['unk_id']) ftrans = open(config['test_set'] + '.trans.out', 'w') # Helper utilities sutils = SamplingBase() unk_idx = config['unk_id'] src_eos_idx = config['src_vocab_size'] - 1 trg_eos_idx = config['trg_vocab_size'] - 1 # Get beam search logger.info("Building sampling model") sampling_representation = encoder.apply( sampling_input, tensor.ones(sampling_input.shape)) generated = decoder.generate(sampling_input, sampling_representation) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs beam_search = BeamSearch(samples=samples) logger.info("Loading the model..") model = Model(generated) loader = LoadNMT(config['saveto']) loader.set_model_parameters(model, loader.load_parameters()) # Get target vocabulary trg_vocab = _ensure_special_tokens( pickle.load(open(config['trg_vocab'], 'rb')), bos_idx=0, eos_idx=trg_eos_idx, unk_idx=unk_idx) trg_ivocab = {v: k for k, v in trg_vocab.items()} logger.info("Started translation: ") total_cost = 0.0 for i, line in enumerate(test_stream.get_epoch_iterator()): seq = sutils._oov_to_unk( line[0], config['src_vocab_size'], unk_idx) input_ = numpy.tile(seq, (config['beam_size'], 1)) # draw sample, checking to ensure we don't get an empty string back trans, costs = \ beam_search.search( input_values={sampling_input: input_}, max_length=3*len(seq), eol_symbol=src_eos_idx, ignore_first_eol=True) # normalize costs according to the sequence lengths if config['normalized_bleu']: lengths = numpy.array([len(s) for s in trans]) costs = costs / lengths best = numpy.argsort(costs)[0] try: total_cost += costs[best] trans_out = trans[best] # convert idx to words trans_out = sutils._idx_to_word(trans_out, trg_ivocab) except ValueError: logger.info( "Can NOT find a translation 
for line: {}".format(i+1)) trans_out = '<UNK>' print(trans_out, file=ftrans) if i != 0 and i % 100 == 0: logger.info( "Translated {} lines of test set...".format(i)) logger.info("Total cost of the test: {}".format(total_cost)) ftrans.close()
train_stream = stream.train(req_vars)
valid_stream = stream.valid(req_vars)

cost = model.cost(**inputs)
cg = ComputationGraph(cost)
monitored = set([cost] + VariableFilter(roles=[roles.COST])(cg.variables))

valid_monitored = monitored
if hasattr(model, 'valid_cost'):
    valid_cost = model.valid_cost(**inputs)
    valid_cg = ComputationGraph(valid_cost)
    valid_monitored = set([valid_cost] +
                          VariableFilter(roles=[roles.COST])(valid_cg.variables))

if hasattr(config, 'dropout') and config.dropout < 1.0:
    cg = apply_dropout(cg, config.dropout_inputs(cg), config.dropout)
if hasattr(config, 'noise') and config.noise > 0.0:
    cg = apply_noise(cg, config.noise_inputs(cg), config.noise)
cost = cg.outputs[0]
cg = Model(cost)

logger.info('# Parameter shapes:')
parameters_size = 0
for value in cg.parameters:
    logger.info('    %20s %s' % (value.get_value().shape, value.name))
    parameters_size += reduce(operator.mul, value.get_value().shape, 1)
logger.info('Total number of parameters: %d in %d matrices' %
            (parameters_size, len(cg.parameters)))

if hasattr(config, 'step_rule'):
    step_rule = config.step_rule
else:
cost_graph = ComputationGraph([cost])

# This returns a list of weight vectors for each layer
W = VariableFilter(roles=[WEIGHT])(cost_graph.variables)

# Add some regularization to this model:
cost += weight_decay * l2_norm(W)
cost.name = 'entropy'

# computational graph with l2 reg
cost_graph = ComputationGraph([cost])

# Apply dropout to inputs:
inputs = VariableFilter([INPUT])(cost_graph.variables)
dropout_inputs = [input for input in inputs
                  if input.name.startswith('linear_')]
dropout_graph = apply_dropout(cost_graph, dropout_inputs, dropout_ratio)
dropout_cost = dropout_graph.outputs[0]
dropout_cost.name = 'dropout_entropy'

# Learning Algorithm:
algo = GradientDescent(
    step_rule=solver_type,
    params=dropout_graph.parameters,
    cost=dropout_cost)

# Data stream used for training model:
training_stream = Flatten(
    DataStream.default_stream(
        dataset=train,
        iteration_scheme=ShuffledScheme(
            train.num_examples,
def main(save_to, num_epochs):
    mlp = MLP([Tanh(), Tanh(), Softmax()], [784, 100, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = mlp.apply(tensor.flatten(x, outdim=2))
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    cg = ComputationGraph([cost, error_rate])
    cost.name = 'final_cost'
    test_cost = cost

    for_dropout = VariableFilter(
        roles=[INPUT], bricks=mlp.linear_transformations[1:])(cg.variables)
    dropout_graph = apply_dropout(cg, for_dropout, 0.5)
    dropout_graph = apply_dropout(dropout_graph, [x], 0.1)
    dropout_cost, dropout_error_rate = dropout_graph.outputs

    mnist_train = MNIST(("train",))
    mnist_test = MNIST(("test",))

    algorithm = GradientDescent(
        cost=dropout_cost, parameters=cg.parameters,
        step_rule=Scale(learning_rate=0.1))
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs),
                  DataStreamMonitoring(
                      [cost, error_rate],
                      Flatten(
                          DataStream.default_stream(
                              mnist_test,
                              iteration_scheme=SequentialScheme(
                                  mnist_test.num_examples, 500)),
                          which_sources=('features',)),
                      prefix="test"),
                  TrainingDataMonitoring(
                      [dropout_cost, dropout_error_rate,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      after_epoch=True),
                  Checkpoint(save_to),
                  Printing()]

    if BLOCKS_EXTRAS_AVAILABLE:
        extensions.append(Plot(
            'MNIST example',
            channels=[
                ['test_final_cost',
                 'test_misclassificationrate_apply_error_rate'],
                ['train_total_gradient_norm']]))

    main_loop = MainLoop(
        algorithm,
        Flatten(
            DataStream.default_stream(
                mnist_train,
                iteration_scheme=SequentialScheme(
                    mnist_train.num_examples, 50)),
            which_sources=('features',)),
        model=Model(dropout_cost),
        extensions=extensions)

    main_loop.run()
def main_run(_config, _log): from collections import namedtuple c = namedtuple("Config", _config.keys())(*_config.values()) _log.info("Running with" + str(_config)) import theano from theano import tensor as T import numpy as np from dataset import IMDBText, GloveTransformer from blocks.initialization import Uniform, Constant, IsotropicGaussian, NdarrayInitialization, Identity, Orthogonal from blocks.bricks.recurrent import LSTM, SimpleRecurrent, GatedRecurrent from blocks.bricks.parallel import Fork from blocks.bricks import Linear, Sigmoid, Tanh, Rectifier from blocks import bricks from blocks.extensions import Printing, Timing from blocks.extensions.monitoring import DataStreamMonitoring, TrainingDataMonitoring from blocks.extensions.plot import Plot from plot import PlotHistogram from blocks.algorithms import GradientDescent, Adam, Scale, StepClipping, CompositeRule, AdaDelta from blocks.graph import ComputationGraph, apply_dropout from blocks.main_loop import MainLoop from blocks.model import Model from cuboid.algorithms import AdaM, NAG from cuboid.extensions import EpochProgress from fuel.streams import DataStream, ServerDataStream from fuel.transformers import Padding from fuel.schemes import ShuffledScheme from Conv1D import Conv1D, MaxPooling1D from schemes import BatchwiseShuffledScheme from bricks import WeightedSigmoid, GatedRecurrentFull from multiprocessing import Process import fuel import logging from initialization import SumInitialization from transformers import DropSources global train_p global test_p x = T.tensor3("features") # m = T.matrix('features_mask') y = T.imatrix("targets") # x = x+m.mean()*0 dropout_variables = [] embedding_size = 300 glove_version = "glove.6B.300d.txt" # embedding_size = 50 # glove_version = "vectors.6B.50d.txt" gloveMapping = Linear( input_dim=embedding_size, output_dim=c.rnn_input_dim, weights_init=Orthogonal(), # weights_init = IsotropicGaussian(c.wstd), biases_init=Constant(0.0), name="gloveMapping", ) gloveMapping.initialize() o = gloveMapping.apply(x) o = Rectifier(name="gloveRec").apply(o) dropout_variables.append(o) summed_mapped_glove = o.sum(axis=1) # take out the sequence glove_out = Linear( input_dim=c.rnn_input_dim, output_dim=1.0, weights_init=IsotropicGaussian(c.wstd), biases_init=Constant(0.0), name="mapping_to_output", ) glove_out.initialize() deeply_sup_0 = glove_out.apply(summed_mapped_glove) deeply_sup_probs = Sigmoid(name="deeply_sup_softmax").apply(deeply_sup_0) input_dim = c.rnn_input_dim hidden_dim = c.rnn_dim gru = GatedRecurrentFull( hidden_dim=hidden_dim, activation=Tanh(), # activation=bricks.Identity(), gate_activation=Sigmoid(), state_to_state_init=SumInitialization([Identity(1.0), IsotropicGaussian(c.wstd)]), state_to_reset_init=IsotropicGaussian(c.wstd), state_to_update_init=IsotropicGaussian(c.wstd), input_to_state_transform=Linear( input_dim=input_dim, output_dim=hidden_dim, weights_init=IsotropicGaussian(c.wstd), biases_init=Constant(0.0), ), input_to_update_transform=Linear( input_dim=input_dim, output_dim=hidden_dim, weights_init=IsotropicGaussian(c.wstd), # biases_init=Constant(-2.0)), biases_init=Constant(-1.0), ), input_to_reset_transform=Linear( input_dim=input_dim, output_dim=hidden_dim, weights_init=IsotropicGaussian(c.wstd), # biases_init=Constant(-3.0)) biases_init=Constant(-2.0), ), ) gru.initialize() rnn_in = o.dimshuffle(1, 0, 2) # rnn_in = o # rnn_out = gru.apply(rnn_in, mask=m.T) rnn_out = gru.apply(rnn_in) state_to_state = gru.rnn.state_to_state state_to_state.name = "state_to_state" # o = 
rnn_out[-1, :, :] o = rnn_out[-1] # o = rnn_out[:, -1, :] # o = rnn_out.mean(axis=1) # print rnn_last_out.eval({ # x: np.ones((3, 101, 300), dtype=theano.config.floatX), # m: np.ones((3, 101), dtype=theano.config.floatX)}) # raw_input() # o = rnn_out.mean(axis=1) dropout_variables.append(o) score_layer = Linear( input_dim=hidden_dim, output_dim=1, weights_init=IsotropicGaussian(std=c.wstd), biases_init=Constant(0.0), name="linear2", ) score_layer.initialize() o = score_layer.apply(o) probs = Sigmoid().apply(o) # probs = deeply_sup_probs cost = -(y * T.log(probs) + (1 - y) * T.log(1 - probs)).mean() # cost_deeply_sup0 = - (y * T.log(deeply_sup_probs) + (1-y) * T.log(1 - deeply_sup_probs)).mean() # cost += cost_deeply_sup0 * c.deeply_factor cost.name = "cost" misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5)).mean() misclassification.name = "misclassification" # print rnn_in.shape.eval( # {x : np.ones((45, 111, embedding_size), dtype=theano.config.floatX), # }) # print rnn_out.shape.eval( # {x : np.ones((45, 111, embedding_size), dtype=theano.config.floatX), # m : np.ones((45, 111), dtype=theano.config.floatX)}) # print (m).sum(axis=1).shape.eval({ # m : np.ones((45, 111), dtype=theano.config.floatX)}) # print (m).shape.eval({ # m : np.ones((45, 111), dtype=theano.config.floatX)}) # raw_input() # ================= cg = ComputationGraph([cost]) cg = apply_dropout(cg, variables=dropout_variables, drop_prob=0.5) params = cg.parameters algorithm = GradientDescent( cost=cg.outputs[0], params=params, step_rule=CompositeRule( [ StepClipping(threshold=4), Adam(learning_rate=0.002, beta1=0.1, beta2=0.001), # NAG(lr=0.1, momentum=0.9), # AdaDelta(), ] ), ) # ======== print "setting up data" ports = { "gpu0_train": 5557, "gpu0_test": 5558, "cuda0_train": 5557, "cuda0_test": 5558, "opencl0:0_train": 5557, "opencl0:0_test": 5558, "gpu1_train": 5559, "gpu1_test": 5560, } # batch_size = 16 # batch_size = 32 batch_size = 40 def start_server(port, which_set): fuel.server.logger.setLevel("WARN") dataset = IMDBText(which_set, sorted=True) n_train = dataset.num_examples # scheme = ShuffledScheme(examples=n_train, batch_size=batch_size) scheme = BatchwiseShuffledScheme(examples=n_train, batch_size=batch_size) stream = DataStream(dataset=dataset, iteration_scheme=scheme) print "loading glove" glove = GloveTransformer(glove_version, data_stream=stream) padded = Padding( data_stream=glove, # mask_sources=('features',) mask_sources=("features",), ) padded = DropSources(padded, ["features_mask"]) fuel.server.start_server(padded, port=port, hwm=20) train_port = ports[theano.config.device + "_train"] train_p = Process(target=start_server, args=(train_port, "train")) train_p.start() test_port = ports[theano.config.device + "_test"] test_p = Process(target=start_server, args=(test_port, "test")) test_p.start() # train_stream = ServerDataStream(('features', 'features_mask', 'targets'), port=train_port) # test_stream = ServerDataStream(('features', 'features_mask', 'targets'), port=test_port) train_stream = ServerDataStream(("features", "targets"), port=train_port) test_stream = ServerDataStream(("features", "targets"), port=test_port) print "setting up model" # ipdb.set_trace() n_examples = 25000 print "Batches per epoch", n_examples // (batch_size + 1) batches_extensions = 100 monitor_rate = 50 # ====== model = Model(cg.outputs[0]) extensions = [] extensions.append(EpochProgress(batch_per_epoch=n_examples // batch_size + 1)) extensions.append(TrainingDataMonitoring([cost, misclassification], 
prefix="train", every_n_batches=monitor_rate)) extensions.append( DataStreamMonitoring( [cost, misclassification], data_stream=test_stream, prefix="test", after_epoch=True, before_first_epoch=False, ) ) extensions.append(Timing()) extensions.append(Printing()) # extensions.append(Plot("norms", channels=[['train_lstm_norm', 'train_pre_norm']], after_epoch=True)) # extensions.append(Plot(theano.config.device+"_result", channels=[['test_misclassification', 'train_misclassification']], after_epoch=True)) # extensions.append(PlotHistogram( # channels=['train_state_to_state'], # bins=50, # every_n_batches=30)) extensions.append( Plot( theano.config.device + "_result", channels=[["train_cost"], ["train_misclassification"]], every_n_batches=monitor_rate, ) ) main_loop = MainLoop(model=model, data_stream=train_stream, algorithm=algorithm, extensions=extensions) main_loop.run()
def build_submodel(input_shape, output_dim, L_dim_conv_layers, L_filter_size, L_pool_size, L_activation_conv, L_dim_full_layers, L_activation_full, L_exo_dropout_conv_layers, L_exo_dropout_full_layers, L_endo_dropout_conv_layers, L_endo_dropout_full_layers, L_border_mode=None, L_filter_step=None, L_pool_step=None): # TO DO : target size and name of the features x = T.tensor4('features') y = T.imatrix('targets') assert len(input_shape) == 3, "input_shape must be a 3d tensor" num_channels = input_shape[0] image_size = tuple(input_shape[1:]) print image_size print num_channels prediction = output_dim # CONVOLUTION output_conv = x output_dim = num_channels*np.prod(image_size) conv_layers = [] assert len(L_dim_conv_layers) == len(L_filter_size) if L_filter_step is None: L_filter_step = [None] * len(L_dim_conv_layers) assert len(L_dim_conv_layers) == len(L_pool_size) if L_pool_step is None: L_pool_step = [None] * len(L_dim_conv_layers) assert len(L_dim_conv_layers) == len(L_pool_step) assert len(L_dim_conv_layers) == len(L_activation_conv) if L_border_mode is None: L_border_mode = ["valid"] * len(L_dim_conv_layers) assert len(L_dim_conv_layers) == len(L_border_mode) assert len(L_dim_conv_layers) == len(L_endo_dropout_conv_layers) assert len(L_dim_conv_layers) == len(L_exo_dropout_conv_layers) # regarding the batch dropout : the dropout is applied on the filter # which is equivalent to the output dimension # you have to look at the dropout_rate of the next layer # that is why we need to have the first dropout value of L_exo_dropout_full_layers # the first value has to be 0.0 in this context, and we'll # assume that it is, but let's have an assert assert L_exo_dropout_conv_layers[0] == 0.0, "L_exo_dropout_conv_layers[0] has to be 0.0 in this context. There are ways to make it work, of course, but we don't support this with this scripts." 
# here modifitication of L_exo_dropout_conv_layers L_exo_dropout_conv_layers = L_exo_dropout_conv_layers[1:] + [L_exo_dropout_full_layers[0]] if len(L_dim_conv_layers): for (num_filters, filter_size, filter_step, pool_size, pool_step, activation_str, border_mode, dropout, index) in zip(L_dim_conv_layers, L_filter_size, L_filter_step, L_pool_size, L_pool_step, L_activation_conv, L_border_mode, L_exo_dropout_conv_layers, xrange(len(L_dim_conv_layers)) ): # convert filter_size and pool_size in tuple filter_size = tuple(filter_size) if filter_step is None: filter_step = (1, 1) else: filter_step = tuple(filter_step) if pool_size is None: pool_size = (0,0) else: pool_size = tuple(pool_size) # TO DO : leaky relu if activation_str.lower() == 'rectifier': activation = Rectifier().apply elif activation_str.lower() == 'tanh': activation = Tanh().apply elif activation_str.lower() in ['sigmoid', 'logistic']: activation = Logistic().apply elif activation_str.lower() in ['id', 'identity']: activation = Identity().apply else: raise Exception("unknown activation function : %s", activation_str) assert 0.0 <= dropout and dropout < 1.0 num_filters = num_filters - int(num_filters*dropout) print "border_mode : %s" % border_mode # filter_step # http://blocks.readthedocs.org/en/latest/api/bricks.html#module-blocks.bricks.conv kwargs = {} if filter_step is None or filter_step == (1,1): pass else: # there's a bit of a mix of names because `Convolutional` takes # a "step" argument, but `ConvolutionActivation` takes "conv_step" argument kwargs['conv_step'] = filter_step if (pool_size[0] == 0 and pool_size[1] == 0): layer_conv = ConvolutionalActivation(activation=activation, filter_size=filter_size, num_filters=num_filters, border_mode=border_mode, name="layer_%d" % index, **kwargs) else: if pool_step is None: pass else: kwargs['pooling_step'] = tuple(pool_step) layer_conv = ConvolutionalLayer(activation=activation, filter_size=filter_size, num_filters=num_filters, border_mode=border_mode, pooling_size=pool_size, name="layer_%d" % index, **kwargs) conv_layers.append(layer_conv) convnet = ConvolutionalSequence(conv_layers, num_channels=num_channels, image_size=image_size, weights_init=Uniform(width=0.1), biases_init=Constant(0.0), name="conv_section") convnet.push_allocation_config() convnet.initialize() output_dim = np.prod(convnet.get_dim('output')) output_conv = convnet.apply(output_conv) output_conv = Flattener().apply(output_conv) # FULLY CONNECTED output_mlp = output_conv full_layers = [] assert len(L_dim_full_layers) == len(L_activation_full) assert len(L_dim_full_layers) + 1 == len(L_endo_dropout_full_layers) assert len(L_dim_full_layers) + 1 == len(L_exo_dropout_full_layers) # reguarding the batch dropout : the dropout is applied on the filter # which is equivalent to the output dimension # you have to look at the dropout_rate of the next layer # that is why we throw away the first value of L_exo_dropout_full_layers L_exo_dropout_full_layers = L_exo_dropout_full_layers[1:] pre_dim = output_dim print "When constructing the model, the output_dim of the conv section is %d." 
% output_dim if len(L_dim_full_layers): for (dim, activation_str, dropout, index) in zip(L_dim_full_layers, L_activation_full, L_exo_dropout_full_layers, range(len(L_dim_conv_layers), len(L_dim_conv_layers)+ len(L_dim_full_layers)) ): # TO DO : leaky relu if activation_str.lower() == 'rectifier': activation = Rectifier().apply elif activation_str.lower() == 'tanh': activation = Tanh().apply elif activation_str.lower() in ['sigmoid', 'logistic']: activation = Logistic().apply elif activation_str.lower() in ['id', 'identity']: activation = Identity().apply else: raise Exception("unknown activation function : %s" % activation_str) assert 0.0 <= dropout and dropout < 1.0 dim = dim - int(dim*dropout) print "When constructing the fully-connected section, we apply dropout %f to add an MLP going from pre_dim %d to dim %d." % (dropout, pre_dim, dim) layer_full = MLP(activations=[activation], dims=[pre_dim, dim], weights_init=Uniform(width=0.1), biases_init=Constant(0.0), name="layer_%d" % index) layer_full.initialize() full_layers.append(layer_full) pre_dim = dim for layer in full_layers: output_mlp = layer.apply(output_mlp) output_dim = L_dim_full_layers[-1] - int(L_dim_full_layers[-1]*L_exo_dropout_full_layers[-1]) # COST FUNCTION output_layer = Linear(output_dim, prediction, weights_init=Uniform(width=0.1), biases_init=Constant(0.0), name="layer_"+str(len(L_dim_conv_layers)+ len(L_dim_full_layers)) ) output_layer.initialize() full_layers.append(output_layer) y_pred = output_layer.apply(output_mlp) y_hat = Softmax().apply(y_pred) # SOFTMAX and log likelihood y_pred = Softmax().apply(y_pred) # be careful. one version expects the output of a softmax; the other expects just the # output of the network cost = CategoricalCrossEntropy().apply(y.flatten(), y_pred) #cost = Softmax().categorical_cross_entropy(y.flatten(), y_pred) cost.name = "cost" # Misclassification error_rate_brick = MisclassificationRate() error_rate = error_rate_brick.apply(y.flatten(), y_hat) error_rate.name = "error_rate" # put names D_params, D_kind = build_params(x, T.matrix(), conv_layers, full_layers) # test computation graph cg = ComputationGraph(cost) # DROPOUT L_endo_dropout = L_endo_dropout_conv_layers + L_endo_dropout_full_layers cg_dropout = cg for (index, drop_rate) in enumerate(L_endo_dropout): # re-filter on the current graph: apply_dropout returns a new ComputationGraph, so the INPUT variables must be looked up again after each transformation inputs = VariableFilter(roles=[INPUT])(cg_dropout.variables) for input_ in inputs: m = re.match(r"layer_(\d+)_apply.*", input_.name) if m and index == int(m.group(1)): if drop_rate < 0.0001: print "Skipped applying dropout on %s because the dropout rate was under 0.0001." % input_.name break else: cg_dropout = apply_dropout(cg_dropout, [input_], drop_rate) print "Applied dropout %f on %s." % (drop_rate, input_.name) break cg = cg_dropout return (cg, error_rate, cost, D_params, D_kind)
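The endo-dropout loop above re-filters the INPUT variables from the current graph on every pass because apply_dropout returns a new ComputationGraph whose variables are clones of the originals. A reduced sketch of that per-layer chaining on an invented two-layer stack that reuses the layer_<i> naming convention:

import re
import theano.tensor as T
from blocks.bricks import MLP, Rectifier, Softmax
from blocks.bricks.cost import CategoricalCrossEntropy
from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph, apply_dropout
from blocks.initialization import Constant, Uniform
from blocks.roles import INPUT

x = T.matrix('features')
y = T.imatrix('targets')
h = x
for index, (n_in, n_out, act) in enumerate([(784, 500, Rectifier()),
                                            (500, 10, Softmax())]):
    layer = MLP(activations=[act], dims=[n_in, n_out],
                weights_init=Uniform(width=0.1), biases_init=Constant(0.0),
                name="layer_%d" % index)
    layer.initialize()
    h = layer.apply(h)
cost = CategoricalCrossEntropy().apply(y.flatten(), h)
cost.name = "cost"

L_endo_dropout = [0.2, 0.5]                    # one drop rate per layer
cg = ComputationGraph(cost)
for index, drop_rate in enumerate(L_endo_dropout):
    # look the INPUT variables up again on the current graph
    inputs = VariableFilter(roles=[INPUT])(cg.variables)
    for input_ in inputs:
        m = re.match(r"layer_(\d+)_apply.*", input_.name)
        if m and index == int(m.group(1)):
            cg = apply_dropout(cg, [input_], drop_rate)
            break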
def __init__(self, ref_data, output_dim): ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32), name='ref_data') # Construct the model j = tensor.lvector('j') x = tensor.fmatrix('x') y = tensor.ivector('y') last_outputs = [] s_dropout_vars = [] r_dropout_vars = [] i_dropout_vars = [] penalties = [] for i in range(nparts): fs = numpy.random.binomial(1, part_r_proba, size=(ref_data.shape[1],)) input_dim = int(fs.sum()) fs_sh = theano.shared(fs) r = ref_data_sh[j, :][:, fs_sh.nonzero()[0]] mlp0 = MLP(activations=activation_functions_0, dims=[input_dim] + hidden_dims_0, name='enc%d'%i) mlp0r = MLP(activations=[None], dims=[hidden_dims_0[-1], input_dim], name='dec%d'%i) mlp1 = MLP(activations=activation_functions_1, dims=[hidden_dims_0[-1]] + hidden_dims_1 + [n_inter], name='inter_gen_%d'%i) mlp2 = MLP(activations=activation_functions_2 + [None], dims=[n_inter] + hidden_dims_2 + [output_dim], name='end_mlp_%d'%i) encod = mlp0.apply(r) rprime = mlp0r.apply(encod) inter_weights = mlp1.apply(encod) ibias = Bias(n_inter, name='inter_bias_%d'%i) inter = ibias.apply(tensor.dot(x, inter_weights)) inter = inter_act_fun.apply(inter) out = mlp2.apply(inter) penalties.append(tensor.sqrt(((rprime - r)**2).sum(axis=1)).mean()[None]) last_outputs.append(out) r_dropout_vars.append(r) s_dropout_vars = s_dropout_vars + ( VariableFilter(bricks=[Tanh], name='output') (ComputationGraph([inter_weights])) ) i_dropout_vars.append(inter) # Initialize parameters for brick in [mlp0, mlp0r, mlp1, mlp2, ibias]: brick.weights_init = IsotropicGaussian(0.01) brick.biases_init = Constant(0.001) brick.initialize() final = tensor.concatenate([x[:, :, None] for x in last_outputs], axis=2).mean(axis=2) cost = Softmax().categorical_cross_entropy(y, final) confidence = Softmax().apply(final) pred = final.argmax(axis=1) error_rate = tensor.neq(y, pred).mean() # apply regularization cg = ComputationGraph([cost, error_rate]) if w_noise_std != 0: # - apply noise on weight variables weight_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, weight_vars, w_noise_std) if s_dropout != 0: cg = apply_dropout(cg, s_dropout_vars, s_dropout) if r_dropout != 0: cg = apply_dropout(cg, r_dropout_vars, r_dropout) if i_dropout != 0: cg = apply_dropout(cg, i_dropout_vars, i_dropout) [cost_reg, error_rate_reg] = cg.outputs cost_reg = cost_reg + reconstruction_penalty * tensor.concatenate(penalties, axis=0).sum() self.cost = cost self.cost_reg = cost_reg self.error_rate = error_rate self.error_rate_reg = error_rate_reg self.pred = pred self.confidence = confidence
def __init__(self, ref_data, output_dim): input_dim = ref_data.shape[1] ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32), name="ref_data") # Construct the model j = tensor.lvector("j") r = ref_data_sh[j, :] x = tensor.fmatrix("x") y = tensor.ivector("y") # input_dim must be nr mlp0 = MLP(activations=activation_functions_0, dims=[input_dim] + hidden_dims_0, name="e0") mlp0vs = MLP(activations=[None], dims=[hidden_dims_0[-1], input_dim], name="de0") mlp1 = MLP( activations=activation_functions_1, dims=[hidden_dims_0[-1]] + hidden_dims_1 + [n_inter], name="inter_gen" ) mlp2 = MLP( activations=activation_functions_2 + [None], dims=[n_inter] + hidden_dims_2 + [output_dim], name="end_mlp" ) encod = mlp0.apply(r) rprime = mlp0vs.apply(encod) inter_weights = mlp1.apply(encod) ibias = Bias(n_inter) ibias.biases_init = Constant(0) ibias.initialize() inter = inter_act_fun.apply(ibias.apply(tensor.dot(x, inter_weights))) final = mlp2.apply(inter) cost = Softmax().categorical_cross_entropy(y, final) confidence = Softmax().apply(final) pred = final.argmax(axis=1) error_rate = tensor.neq(y, pred).mean() # Initialize parameters for brick in [mlp0, mlp0vs, mlp1, mlp2]: brick.weights_init = IsotropicGaussian(0.01) brick.biases_init = Constant(0.001) brick.initialize() # apply regularization cg = ComputationGraph([cost, error_rate]) if r_dropout != 0: # - dropout on input vector r : r_dropout cg = apply_dropout(cg, [r], r_dropout) if s_dropout != 0: # - dropout on intermediate layers of first mlp : s_dropout s_dropout_vars = list( set(VariableFilter(bricks=[Tanh], name="output")(ComputationGraph([inter_weights]))) - set([inter_weights]) ) cg = apply_dropout(cg, s_dropout_vars, s_dropout) if i_dropout != 0: # - dropout on input to second mlp : i_dropout cg = apply_dropout(cg, [inter], i_dropout) if a_dropout != 0: # - dropout on hidden layers of second mlp : a_dropout a_dropout_vars = list( set(VariableFilter(bricks=[Tanh], name="output")(ComputationGraph([final]))) - set([inter_weights]) - set(s_dropout_vars) ) cg = apply_dropout(cg, a_dropout_vars, a_dropout) if w_noise_std != 0: # - apply noise on weight variables weight_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, weight_vars, w_noise_std) [cost_reg, error_rate_reg] = cg.outputs # add reconstruction penalty for AE part penalty_val = tensor.sqrt(((r - rprime) ** 2).sum(axis=1)).mean() cost_reg = cost_reg + reconstruction_penalty * penalty_val self.cost = cost self.cost_reg = cost_reg self.error_rate = error_rate self.error_rate_reg = error_rate_reg self.pred = pred self.confidence = confidence
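Both constructors above pick their s_dropout and a_dropout targets by filtering Tanh outputs out of a sub-graph and then trimming the candidate set with set differences. A toy sketch of that selection step (the three-layer Tanh MLP and the stand-in cost are invented):

import theano.tensor as T
from blocks.bricks import MLP, Tanh
from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph, apply_dropout
from blocks.initialization import Constant, IsotropicGaussian

x = T.fmatrix('x')
mlp = MLP(activations=[Tanh(), Tanh(), Tanh()], dims=[30, 40, 40, 20],
          weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.001))
mlp.initialize()
inter_weights = mlp.apply(x)
cost = (inter_weights ** 2).sum()              # stand-in for the real cost
cost.name = 'cost'

# every Tanh activation feeding inter_weights, minus inter_weights itself,
# mirroring the guard used in the constructors above
s_dropout_vars = list(
    set(VariableFilter(bricks=[Tanh], name='output')(
        ComputationGraph([inter_weights]).variables))
    - set([inter_weights]))

cg = ComputationGraph([cost])
cg = apply_dropout(cg, s_dropout_vars, 0.5)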
def construct_model(input_dim, out_dim): # Construct the model r = tensor.fmatrix('r') x = tensor.fmatrix('x') y = tensor.ivector('y') nx = x.shape[0] nj = x.shape[1] # also is r.shape[0] nr = r.shape[1] # r is nj x nr # x is nx x nj # y is nx # r_rep is nx x nj x nr r_rep = r[None, :, :].repeat(axis=0, repeats=nx) # x3 is nx x nj x 1 x3 = x[:, :, None] # concat is nx x nj x (nr + 1) concat = tensor.concatenate([r_rep, x3], axis=2) # Change concat from Batch x Time x Features to T X B x F mlp_input = concat.dimshuffle(1, 0, 2) if use_ensembling: # Split time dimension into batches of size num_feats # Join that dimension with the B dimension ens_shape = (num_feats, mlp_input.shape[0]/num_feats, mlp_input.shape[1]) mlp_input = mlp_input.reshape(ens_shape + (input_dim+1,)) mlp_input = mlp_input.reshape((ens_shape[0], ens_shape[1] * ens_shape[2], input_dim+1)) mlp = MLP(dims=[input_dim+1] + mlp_hidden_dims, activations=[activation_function for _ in mlp_hidden_dims], name='mlp') lstm_bot_linear = Linear(input_dim=mlp_hidden_dims[-1], output_dim=4 * lstm_hidden_dim, name="lstm_input_linear") lstm = LSTM(dim=lstm_hidden_dim, activation=activation_function, name="hidden_recurrent") lstm_top_linear = Linear(input_dim=lstm_hidden_dim, output_dim=out_dim, name="out_linear") rnn_input = mlp.apply(mlp_input) pre_rnn = lstm_bot_linear.apply(rnn_input) states = lstm.apply(pre_rnn)[0] activations = lstm_top_linear.apply(states) if use_ensembling: activations = activations.reshape(ens_shape + (out_dim,)) # Unsplit batches (ensembling) activations = tensor.mean(activations, axis=1) # Mean over time activations = tensor.mean(activations, axis=0) cost = Softmax().categorical_cross_entropy(y, activations) pred = activations.argmax(axis=1) error_rate = tensor.neq(y, pred).mean() # Initialize parameters for brick in (mlp, lstm_bot_linear, lstm, lstm_top_linear): brick.weights_init = IsotropicGaussian(0.01) brick.biases_init = Constant(0.) brick.initialize() # apply dropout and noise; each transform returns a new graph, so keep and reuse the result cg = ComputationGraph([cost, error_rate]) cg = apply_dropout(cg, [rnn_input], dropout) noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, noise_std) [cost_reg, error_rate_reg] = cg.outputs return cost_reg, error_rate_reg, cost, error_rate
def start(self): x = T.matrix('features', config.floatX) y = T.imatrix('targets') self.x = x DIMS = [108*5, 1000, 1000, 1000, 1000, 1943] NUMS = [1, 1, 1, 1, 1, 1] FUNCS = [ Rectifier, Rectifier, Rectifier, Rectifier, # Rectifier, # Maxout(num_pieces=5), # Maxout(num_pieces=5), # Maxout(num_pieces=5), # SimpleRecurrent, # SimpleRecurrent, # SimpleRecurrent, Softmax, ] def lllistool(i, inp, func): l = Linear(input_dim=DIMS[i], output_dim=DIMS[i+1] * NUMS[i+1], weights_init=IsotropicGaussian(std=DIMS[i]**(-0.5)), biases_init=IsotropicGaussian(std=DIMS[i]**(-0.5)), name='Lin{}'.format(i)) l.initialize() func.name='Fun{}'.format(i) if func == SimpleRecurrent: gong = func(dim=DIMS[i+1], activation=Rectifier(), weights_init=IsotropicGaussian(std=(DIMS[i]+DIMS[i+1])**(-0.5))) else: gong = func() ret = gong.apply(l.apply(inp)) return ret oup = x for i in range(len(DIMS)-1): oup = lllistool(i, oup, FUNCS[i]) y_hat = oup self.y_hat_prob = y_hat cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat).astype(config.floatX) cg = ComputationGraph(cost) orig_cg = cg ips = VariableFilter(roles=[INPUT])(cg.variables) ops = VariableFilter(roles=[OUTPUT])(cg.variables) cg = apply_dropout(cg, ips[0:2:1], 0.2) cg = apply_dropout(cg, ips[2:-2:1], 0.5) cost = cg.outputs[0] cost.name = 'cost' # mps = theano.shared(np.array([ph2id(ph48239(id2ph(t))) for t in range(48)])) mps = theano.shared(np.array([ph2id(state239(t)) for t in range(1943)])) z_hat = T.argmax(y_hat, axis=1) y39,_ = scan(fn=lambda t: mps[t], outputs_info=None, sequences=[y.flatten()]) y_hat39,_ = scan(fn=lambda t: mps[t], outputs_info=None, sequences=[z_hat]) self.y_hat39 = y_hat39 lost01 = (T.sum(T.neq(y_hat39, y39)) / y39.shape[0]).astype(config.floatX) lost01.name = '0/1 loss' lost23 = (T.sum(T.neq(y_hat39, y39)) / y39.shape[0]).astype(config.floatX) #lost23 = MisclassificationRate().apply(y39, y_hat39).astype(config.floatX) lost23.name = '2/3 loss' Ws = VariableFilter(roles=[WEIGHT])(cg.variables) norms = sum(w.norm(2) for w in Ws) norms.name = 'norms' path = pjoin(PATH['fuel'], pfx+'_train.hdf5') data = H5PYDataset(path, which_set='train', load_in_memory=True, subset=slice(0, 100000)) # data = H5PYDataset(path, which_set='train', load_in_memory=True) data_v = H5PYDataset(pjoin(PATH['fuel'], pfx+'_validate.hdf5'), which_set='validate', load_in_memory=True) num = data.num_examples data_stream = DataStream(data, iteration_scheme=ShuffledScheme( num, batch_size=128)) data_stream_v = DataStream(data_v, iteration_scheme=SequentialScheme( data_v.num_examples, batch_size=128)) algo = GradientDescent(cost=cost, params=cg.parameters, step_rule=CompositeRule([Momentum(0.002, 0.9)])) monitor = DataStreamMonitoring( variables=[cost, lost01, norms], data_stream=data_stream) monitor_v = DataStreamMonitoring( variables=[lost23], data_stream=data_stream_v) plt = Plot('AlpAlpAlp', channels=[['0/1 loss', '2/3 loss']], after_epoch=True) main_loop = MainLoop(data_stream = data_stream, algorithm=algo, extensions=[monitor, monitor_v, FinishAfter(after_n_epochs=2000), Printing(), plt]) main_loop.run()
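The acoustic model above implements the classic recipe of a small dropout rate on the input layer and 0.5 on the hidden layers by slicing the ordered list of INPUT variables (ips[0:2:1] and ips[2:-2:1]). A sketch of the same idea that selects by brick instead of by position, so the order of the filtered list does not matter; the dimensions mirror the DIMS above but the rest is invented:

import theano.tensor as T
from blocks.bricks import MLP, Rectifier, Softmax
from blocks.bricks.cost import CategoricalCrossEntropy
from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph, apply_dropout
from blocks.initialization import Constant, IsotropicGaussian
from blocks.roles import INPUT

x = T.matrix('features')
y = T.imatrix('targets')
mlp = MLP(activations=[Rectifier(), Rectifier(), Softmax()],
          dims=[540, 1000, 1000, 1943],
          weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.0))
mlp.initialize()
cost = CategoricalCrossEntropy().apply(y.flatten(), mlp.apply(x))
cost.name = 'cost'

cg = ComputationGraph(cost)
# 20% dropout on whatever feeds the first layer, 50% on the rest,
# re-filtering from the transformed graph before the second call
first = VariableFilter(roles=[INPUT],
                       bricks=[mlp.linear_transformations[0]])(cg.variables)
cg = apply_dropout(cg, first, 0.2)
rest = VariableFilter(roles=[INPUT],
                      bricks=mlp.linear_transformations[1:])(cg.variables)
cg = apply_dropout(cg, rest, 0.5)
cost = cg.outputs[0]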
def train(args, trial=11, no_valid=False): # Creating unique strings to save for experiments. data_valid = "data/"+args.data_name+"_trial_"+str(trial)+"_valid_size_"+str(args.train_size)+\ "_transitions_"+str(args.transitions) data_test = data_valid.replace("_valid_size", "_test_size") # If we want validation set to match modData of test set if modDataValid == 1: data_valid = data_valid.replace("_trial_", "_" + modData + "_trial_") data_test = data_test.replace("_trial_", "_" + modData + "_trial_") # By default, it is m0 data_train = "data/"+args.data_name+"_trial_"+str(trial)+"_train_size_"+str(args.train_size)+\ "_transitions_"+str(args.transitions) subStr = "rnn_type_"+args.rnn_type + "_trial_"+str(trial) + "_hiddenSize_"+str(args.hidden_size)+\ "_numLayers_"+str(args.num_layers)+ \ "_dropout_"+str(args.dropout)+"_train_size_"+str(args.train_size) + "_transitions_"+str(args.transitions)+\ "_novalid_"+str(args.no_valid) if modData == "m1": data_train = data_train.replace("_trial_", "_m1_trial_") subStr = subStr.replace("_trial_", "_m1_trial_") elif modData == "m3": data_train = data_train.replace("_trial_", "_m3_trial_") subStr = subStr.replace("_trial_", "_m3_trial_") data_valid = "data/"+args.data_name+"_m3_trial_"+str(trial)+"_valid_size_"+str(args.train_size)+\ "_transitions_"+str(args.transitions) data_test = "data/"+args.data_name+"_m3_trial_"+str(trial)+"_test_size_"+str(args.train_size)+\ "_transitions_"+str(args.transitions) print("on test: " + subStr) # Perform folder prefixing prefix_path = models_folder + args.data_name + "/" + subStr +"_tgrad_"+str(args.truncate_gradient)+\ "_boost_"+bStr(args.boosting) load_path2 = prefix + load_path save_path2 = prefix + save_path last_path2 = prefix + last_path plots_output2 = plots_output + args.data_name + "/" + subStr +"_tgrad_"+str(args.truncate_gradient)+\ "_boost_"+bStr(args.boosting) # obtain vocabulary size ix_to_char, char_to_ix, vocab_size = get_metadata( data_test.replace("_test", "")) print("vocab_size: " + str(vocab_size)) # Get train, valid, test streams sharedDataTrain, train_stream = get_stream_inGPU(data_train, sharedName='sharedData') train_streamCopy = copy.deepcopy(train_stream) sharedDataValid, dev_stream = get_stream_inGPU(data_valid, sharedName='sharedData') valid_streamCopy = copy.deepcopy(dev_stream) sharedDataTest, test_stream = get_stream_inGPU(data_test, sharedName='sharedData') test_streamCopy = copy.deepcopy(test_stream) # Create dummy sums sharedMRRSUM = shared(np.array(0.0, dtype=theano.config.floatX)) sharedTOTSUM = shared(np.array(0.0, dtype=theano.config.floatX)) sharedSUMVARs = { 'sharedMRRSUM': sharedMRRSUM, 'sharedTOTSUM': sharedTOTSUM } # Initialize batches batch_index_From = T.scalar('int_stream_From', dtype='int32') batch_index_To = T.scalar('int_stream_To', dtype='int32') # Index theano variables x = sharedDataTrain['x'][:, batch_index_From:batch_index_To] x.name = 'x' x_mask = sharedDataTrain['x_mask'][:, batch_index_From:batch_index_To] x_mask.name = 'x_mask' x_mask_o = sharedDataTrain['x_mask_o'][:, batch_index_From:batch_index_To] x_mask_o.name = 'x_mask_o' x_mask_o_mask = sharedDataTrain[ 'x_mask_o_mask'][:, batch_index_From:batch_index_To] x_mask_o_mask.name = 'x_mask_o_mask' y = sharedDataTrain['y'][:, batch_index_From:batch_index_To] y.name = 'y' y_mask = sharedDataTrain['y_mask'][:, batch_index_From:batch_index_To] y_mask.name = 'y_mask' y_mask_o = sharedDataTrain['y_mask_o'][:, batch_index_From:batch_index_To] y_mask_o.name = 'y_mask_o' y_mask_o_mask = sharedDataTrain[ 
'y_mask_o_mask'][:, batch_index_From:batch_index_To] y_mask_o_mask.name = 'y_mask_o_mask' lens = sharedDataTrain['lens'][:, batch_index_From:batch_index_To] lens.name = 'lens' # Generate temp shared vars tempSharedData = {} tempSharedData[theano.config.floatX] = [ shared(np.array([[0], [0]], dtype=theano.config.floatX)), shared(np.array([[0], [0]], dtype=theano.config.floatX)), shared(np.array([[0], [0]], dtype=theano.config.floatX)), shared(np.array([[0], [0]], dtype=theano.config.floatX)), shared(np.array([[0], [0]], dtype=theano.config.floatX)), shared(np.array([[0], [0]], dtype=theano.config.floatX)) ] tempSharedData['uint8'] = [ shared(np.array([[0], [0]], dtype='uint8')), shared(np.array([[0], [0]], dtype='uint8')), shared(np.array([[0], [0]], dtype='uint8')) ] # Final mask is due to the generated mask and the input mask x_mask_final = x_mask * x_mask_o * x_mask_o_mask y_mask_final = y_mask * y_mask_o * y_mask_o_mask # Build neural network linear_output, cost = nn_fprop( x, x_mask_final, y, y_mask_final, lens, vocab_size, hidden_size, num_layers, rnn_type, boosting=boosting, scan_kwargs={'truncate_gradient': truncate_gradient}) # Keep a constant in gpu memory constant1 = shared(np.float32(1.0)) cost_int, ymasksum = RR_cost(y, linear_output, y_mask_final, constant1) # Validation calculations fRR = function(inputs=[ theano.In(batch_index_From, borrow=True), theano.In(batch_index_To, borrow=True) ], updates=[(sharedMRRSUM, sharedMRRSUM + cost_int), (sharedTOTSUM, sharedTOTSUM + ymasksum)]) # COST cg = ComputationGraph(cost) if dropout > 0: # Apply dropout only to the non-recurrent inputs (Zaremba et al. 2015) inputs = VariableFilter(theano_name_regex=r'.*apply_input.*')( cg.variables) cg = apply_dropout(cg, inputs, dropout) cost = cg.outputs[0] # Learning algorithm step_rules = [ RMSProp(learning_rate=rmsPropLearnRate, decay_rate=decay_rate), StepClipping(step_clipping) ] algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule(step_rules)) # Extensions # This is for tracking our best result trackbest = track_best('valid_MRR', save_path2, last_path2, num_epochs, nepochs, maxIterations, epsilon, tempSharedData) if onlyPlots: prefixes = ["train_cross", "valid_cross", "test_cross"] gradient_norm = aggregation.mean(algorithm.total_gradient_norm) step_norm = aggregation.mean(algorithm.total_step_norm) monitored_vars = [cost, gradient_norm, step_norm] #this is faster train_monitor = myTrainingDataMonitoring( variables=monitored_vars, prefix=prefixes[0], after_batch=True, saveEveryXIteration=saveEveryXIteration) #train_monitor = DataStreamMonitoringPlot(variables=[cost], # data_stream=train_streamCopy, prefix=prefixes[0], sharedDataTrain=sharedDataTrain, sharedDataActualTest=sharedDataTrain, after_batch=True, saveEveryXIteration = saveEveryXIteration) valid_monitor = DataStreamMonitoringPlot( variables=[cost], data_stream=valid_streamCopy, prefix=prefixes[1], sharedDataTrain=sharedDataTrain, sharedDataActualTest=sharedDataValid, after_batch=True, saveEveryXIteration=saveEveryXIteration) test_monitor = DataStreamMonitoringPlot( variables=[cost], data_stream=test_streamCopy, prefix=prefixes[2], sharedDataTrain=sharedDataTrain, sharedDataActualTest=sharedDataTest, after_batch=True, saveEveryXIteration=saveEveryXIteration) trackbest = [trackbest[0], trackbest[2], trackbest[3], trackbest[4]] plot = Plot('Live Plotting', saveFolder=plots_output2, channels=[ 'train_cross_cost', 'valid_cross_cost', 'test_cross_cost' ], numProcesses=numProcesses, 
saveEveryXIteration=saveEveryXIteration, after_batch=True) extensions = [ train_monitor, valid_monitor, test_monitor, plot, Printing(), ProgressBar(), ] + trackbest else: dev_monitor = myDataStreamMonitoring(after_epoch=True, before_epoch=False, data_stream=dev_stream, prefix="valid", fRR=fRR, sharedVars=sharedSUMVARs, sharedDataTrain=sharedDataTrain, sharedDataValid=sharedDataValid) extensions = [ dev_monitor, Printing(), ProgressBar(), ] + trackbest if learning_rate_decay not in (0, 1): extensions.append( SharedVariableModifier(step_rules[0].learning_rate, lambda n, lr: np.cast[theano.config.floatX] (learning_rate_decay * lr), after_epoch=True, after_batch=False)) print 'number of parameters in the model: ' + str( T.sum([p.size for p in cg.parameters]).eval()) # Finally build the main loop and train the model main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm, model=Model(cost), extensions=extensions) main_loop.run()
train_stream = get_stream(hdf5_file, 'train', batch_size) dev_stream = get_stream(hdf5_file, 'dev', batch_size) # MODEL x = tensor.matrix('features', dtype='uint8') y = tensor.matrix('targets', dtype='uint8') y_hat, cost = nn_fprop(x, y, vocab_size, hidden_size, num_layers, model) # COST cg = ComputationGraph(cost) if dropout > 0: # Apply dropout only to the non-recurrent inputs (Zaremba et al. 2015) inputs = VariableFilter(theano_name_regex=r'.*apply_input.*')(cg.variables) cg = apply_dropout(cg, inputs, dropout) cost = cg.outputs[0] # Learning algorithm step_rules = [RMSProp(learning_rate=learning_rate, decay_rate=decay_rate), StepClipping(step_clipping)] algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule(step_rules)) # Extensions gradient_norm = aggregation.mean(algorithm.total_gradient_norm) step_norm = aggregation.mean(algorithm.total_step_norm) monitored_vars = [cost, gradient_norm, step_norm] dev_monitor = DataStreamMonitoring(variables=[cost], after_epoch=True, before_first_epoch=True, data_stream=dev_stream, prefix="dev")
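Both this snippet and the train() function above use the same trick for Zaremba-style regularization of a recurrent network: a theano_name_regex filter on '.*apply_input.*' picks up only the copies Blocks makes of feed-forward application inputs, leaving the recurrent state and cell connections untouched. A reduced sketch with a single Linear-into-LSTM stack (dimensions and names are made up):

import theano.tensor as T
from blocks.bricks import Linear, Tanh
from blocks.bricks.recurrent import LSTM
from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph, apply_dropout
from blocks.initialization import Constant, IsotropicGaussian

x = T.tensor3('features')                      # (time, batch, feature)
to_lstm = Linear(input_dim=50, output_dim=4 * 100, name='to_lstm')
lstm = LSTM(dim=100, activation=Tanh(), name='lstm')
for brick in (to_lstm, lstm):
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0.0)
    brick.initialize()
states, cells = lstm.apply(to_lstm.apply(x))
cost = states[-1].sum()                        # stand-in for the real cost
cost.name = 'cost'

cg = ComputationGraph(cost)
# matches copies such as 'to_lstm_apply_input_' and 'lstm_apply_inputs',
# but not the recurrent states/cells variables inside the scan
non_recurrent = VariableFilter(theano_name_regex=r'.*apply_input.*')(cg.variables)
cg = apply_dropout(cg, non_recurrent, 0.5)
cost = cg.outputs[0]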
def main(): # set para config = getattr(configurations, "get_config_cs2en")() logger.info("Model options:\n{}".format(pprint.pformat(config))) tr_stream = get_tr_stream(**config) # Create Theano variables logger.info("Creating theano variables") source_sentence0 = tensor.lmatrix("source0") source_sentence_mask0 = tensor.matrix("source0_mask") target_sentence0 = tensor.lmatrix("target0") target_sentence_mask0 = tensor.matrix("target0_mask") source_sentence1 = tensor.lmatrix("source1") source_sentence_mask1 = tensor.matrix("source1_mask") target_sentence1 = tensor.lmatrix("target1") target_sentence_mask1 = tensor.matrix("target1_mask") source_sentence2 = tensor.lmatrix("source2") source_sentence_mask2 = tensor.matrix("source2_mask") target_sentence2 = tensor.lmatrix("target2") target_sentence_mask2 = tensor.matrix("target2_mask") sampling_input0 = tensor.lmatrix("input0") sampling_input1 = tensor.lmatrix("input1") sampling_input2 = tensor.lmatrix("input2") sampling_hstates0 = tensor.fmatrix("hstates0") sampling_hstates1 = tensor.fmatrix("hstates1") sampling_hstates2 = tensor.fmatrix("hstates2") sampling_lastrep0 = tensor.tensor3("lastrep0") sampling_lastrep1 = tensor.tensor3("lastrep1") hstates = theano.shared(value=numpy.zeros((config["enc_nhids"]), dtype=theano.config.floatX), name="hstates") # Get vocab sources = get_attr_rec(tr_stream, "data_stream") src_vocab = sources.data_streams[0].dataset.dictionary trg_vocab = sources.data_streams[1].dataset.dictionary # Construct model logger.info("Building PoemModel") block0 = PoemBlock(config=config, blockid="block0", name="poemblock0") block1 = PoemBlock(config=config, blockid="block1", name="poemblock1") block2 = PoemBlock(config=config, blockid="block2", name="poemblock2") cost0, hsta0, rep0 = block0.cost( source_sentence0, source_sentence_mask0, source_sentence_mask1, source_sentence_mask0, target_sentence0, target_sentence_mask0, hstates, lastrep0=None, lastrep1=None, ) cost1, hsta1, rep1 = block1.cost( source_sentence1, source_sentence_mask0, source_sentence_mask1, source_sentence_mask1, target_sentence1, target_sentence_mask1, hsta0, lastrep0=rep0, lastrep1=None, ) cost2, hsta2, rep2 = block2.cost( source_sentence2, source_sentence_mask0, source_sentence_mask1, source_sentence_mask2, target_sentence2, target_sentence_mask2, hsta1, lastrep0=rep0, lastrep1=rep1, ) cost = cost0 + cost1 + cost2 cost.name = "total_cost" logger.info("Creating computational graph") cg = ComputationGraph(cost) # Initialize model logger.info("Initializing model") block0.set_initw(IsotropicGaussian(config["weight_scale"])) block0.set_initb(Constant(0)) block0.push_initialization_config() block0.set_specialinit(Orthogonal(), Orthogonal()) block0.initialize() block1.set_initw(IsotropicGaussian(config["weight_scale"])) block1.set_initb(Constant(0)) block1.push_initialization_config() block1.set_specialinit(Orthogonal(), Orthogonal()) block1.initialize() block2.set_initw(IsotropicGaussian(config["weight_scale"])) block2.set_initb(Constant(0)) block2.push_initialization_config() block2.set_specialinit(Orthogonal(), Orthogonal()) block2.initialize() # apply dropout for regularization if config["dropout"] < 1.0: # dropout is applied to the output of maxout in ghog logger.info("Applying dropout") dropout_inputs = [x for x in cg.intermediary_variables if x.name == "maxout_apply_output"] cg = apply_dropout(cg, dropout_inputs, config["dropout"]) # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in 
Counter(shapes).most_common(): logger.info(" {:15}: {}".format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names param_dict = Selector(block0).get_parameters() logger.info("Parameter names: ") for name, value in param_dict.items(): logger.info(" {:15}: {}".format(value.get_value().shape, name)) logger.info("Total number of parameters: {}".format(len(param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # logger.info(cg.auxiliary_variables) # logger.info("______________________________") """ weights = "" for va in cg.auxiliary_variables: if va.name == "sequence_generator_block0_cost_matrix_weighted_averages": weights = va weightsize = weights.shape weightsize.name = "weightsize" states = "" for va in cg.auxiliary_variables: if va.name == "sequence_generator_block0_cost_matrix_states": states = va statesize = states.shape statesize.name = "statesize" rep = "" for va in cg.auxiliary_variables: if va.name == "poemblock0_cost_block0hstatesRepeat": rep = va repsize = rep.shape repsize.name = "repsize" """ # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config["finish_after"]), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config["saveto"], every_n_batches=config["save_freq"]), ] # Set up training algorithm logger.info("Initializing training algorithm") algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=CompositeRule([StepClipping(config["step_clipping"]), eval(config["step_rule"])()]), ) # Reload model if necessary if config["reload"]: extensions.append(LoadNMT(config["saveto"])) # Add sampling if config["hook_samples"] >= 1: logger.info("Building sampler") generated0 = block0.mygenerate(sampling_input0, sampling_hstates0) search_model0 = Model(generated0) generated1 = block1.mygenerate(sampling_input1, sampling_hstates1, sampling_lastrep0) search_model1 = Model(generated1) generated2 = block2.mygenerate(sampling_input2, sampling_hstates2, sampling_lastrep0, sampling_lastrep1) search_model2 = Model(generated2) extensions.append( Sampler( config=config, model0=search_model0, model1=search_model1, model2=search_model2, data_stream=tr_stream, hook_samples=config["hook_samples"], every_n_batches=config["sampling_freq"], src_vocab_size=config["src_vocab_size"], ) ) logger.info("End of building sampler") # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions) # Train! main_loop.run()
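The poem model above, like most Blocks NMT setups here, finds its dropout targets by plain name matching on cg.intermediary_variables rather than by role, looking for the maxout output of the readout. A toy sketch of that lookup; the Linear-plus-Maxout pair is an invented stand-in for the decoder readout:

import theano.tensor as T
from blocks.bricks import Linear, Maxout
from blocks.graph import ComputationGraph, apply_dropout
from blocks.initialization import Constant, IsotropicGaussian

x = T.matrix('features')
pre = Linear(input_dim=20, output_dim=40, name='pre',
             weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.0))
pre.initialize()
maxout = Maxout(num_pieces=2, name='maxout')
hidden = maxout.apply(pre.apply(x))            # theano name 'maxout_apply_output'
cost = hidden.sum()                            # stand-in for the real cost
cost.name = 'cost'

cg = ComputationGraph(cost)
dropout_inputs = [v for v in cg.intermediary_variables
                  if v.name == 'maxout_apply_output']
cg = apply_dropout(cg, dropout_inputs, 0.5)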
train_stream = stream.train(req_vars) valid_stream = stream.valid(req_vars) cost = model.cost(**inputs) cg = ComputationGraph(cost) monitored = set([cost] + VariableFilter(roles=[roles.COST])(cg.variables)) valid_monitored = monitored if hasattr(model, 'valid_cost'): valid_cost = model.valid_cost(**inputs) valid_cg = ComputationGraph(valid_cost) valid_monitored = set([valid_cost] + VariableFilter( roles=[roles.COST])(valid_cg.variables)) if hasattr(config, 'dropout') and config.dropout < 1.0: cg = apply_dropout(cg, config.dropout_inputs(cg), config.dropout) if hasattr(config, 'noise') and config.noise > 0.0: cg = apply_noise(cg, config.noise_inputs(cg), config.noise) cost = cg.outputs[0] cg = Model(cost) logger.info('# Parameter shapes:') parameters_size = 0 for value in cg.parameters: logger.info(' %20s %s' % (value.get_value().shape, value.name)) parameters_size += reduce(operator.mul, value.get_value().shape, 1) logger.info('Total number of parameters: %d in %d matrices' % (parameters_size, len(cg.parameters))) if hasattr(config, 'step_rule'): step_rule = config.step_rule
def main(exp_config, source_vocab, target_vocab, dev_stream, use_bokeh=True): # def setup_model_and_stream(exp_config, source_vocab, target_vocab): # def setup_model_and_stream(exp_config, source_vocab, target_vocab): train_encoder, train_decoder, theano_sampling_source_input, theano_sampling_context_input, generated, masked_stream = setup_model_and_stream( exp_config, source_vocab, target_vocab) cost = create_model(train_encoder, train_decoder, exp_config.get('imt_smoothing_constant', 0.005)) # Set up training model logger.info("Building model") train_model = Model(cost) # Set the parameters from a trained models (.npz file) logger.info("Loading parameters from model: {}".format( exp_config['saved_parameters'])) # Note the brick delimeter='-' is here for legacy reasons because blocks changed the serialization API param_values = LoadNMT.load_parameter_values( exp_config['saved_parameters'], brick_delimiter=exp_config.get('brick_delimiter', None)) LoadNMT.set_model_parameters(train_model, param_values) logger.info('Creating computational graph') cg = ComputationGraph(cost) # GRAPH TRANSFORMATIONS FOR BETTER TRAINING if exp_config.get('l2_regularization', False) is True: l2_reg_alpha = exp_config['l2_regularization_alpha'] logger.info( 'Applying l2 regularization with alpha={}'.format(l2_reg_alpha)) model_weights = VariableFilter(roles=[WEIGHT])(cg.variables) for W in model_weights: cost = cost + (l2_reg_alpha * (W**2).sum()) # why do we need to rename the cost variable? Where did the original name come from? cost.name = 'decoder_cost_cost' cg = ComputationGraph(cost) # apply dropout for regularization # Note dropout variables are hard-coded here if exp_config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog # this is the probability of dropping out, so you probably want to make it <=0.5 logger.info('Applying dropout') dropout_inputs = [ x for x in cg.intermediary_variables if x.name == 'maxout_apply_output' ] cg = apply_dropout(cg, dropout_inputs, exp_config['dropout']) # create the training directory, and copy this config there if directory doesn't exist if not os.path.isdir(exp_config['saveto']): os.makedirs(exp_config['saveto']) # TODO: mv the actual config file once we switch to .yaml for min-risk shutil.copy(exp_config['config_file'], exp_config['saveto']) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=exp_config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(exp_config['saveto'], every_n_batches=exp_config['save_freq']) ] # Set up beam search and sampling computation graphs if necessary # TODO: change the if statement here if exp_config['hook_samples'] >= 1 or exp_config['bleu_script'] is not None: logger.info("Building sampling model") search_model = Model(generated) _, samples = VariableFilter( bricks=[train_decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs # Add sampling -- TODO: sampling is broken for min-risk #if config['hook_samples'] >= 1: # logger.info("Building sampler") # extensions.append( # Sampler(model=search_model, data_stream=tr_stream, # hook_samples=config['hook_samples'], # every_n_batches=config['sampling_freq'], # src_vocab_size=config['src_vocab_size'])) # Add early stopping based on bleu # TODO: use multimodal meteor and BLEU validator # TODO: add 'validator' key to IMT config # Add early stopping based on bleu if exp_config.get('bleu_script', None) is not None: 
logger.info("Building bleu validator") extensions.append( BleuValidator(theano_sampling_source_input, theano_sampling_context_input, samples=samples, config=exp_config, model=search_model, data_stream=dev_stream, src_vocab=source_vocab, trg_vocab=target_vocab, normalize=exp_config['normalized_bleu'], every_n_batches=exp_config['bleu_val_freq'])) if exp_config.get('imt_f1_validation', False) is not False: logger.info("Building imt F1 validator") extensions.append( IMT_F1_Validator(theano_sampling_source_input, theano_sampling_context_input, samples=samples, config=exp_config, model=search_model, data_stream=dev_stream, src_vocab=source_vocab, trg_vocab=target_vocab, normalize=exp_config['normalized_bleu'], every_n_batches=exp_config['bleu_val_freq'])) # Add early stopping based on Meteor # if exp_config.get('meteor_directory', None) is not None: # logger.info("Building meteor validator") # extensions.append( # MeteorValidator(theano_sampling_source_input, theano_sampling_context_input, # samples=samples, # config=config, # model=search_model, data_stream=dev_stream, # src_vocab=src_vocab, # trg_vocab=trg_vocab, # normalize=config['normalized_bleu'], # every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if exp_config['reload']: extensions.append(LoadNMT(exp_config['saveto'])) # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( Plot(exp_config['model_save_directory'], channels=[[ 'decoder_cost_cost', 'validation_set_imt_f1_score', 'validation_set_bleu_score', 'validation_set_meteor_score' ]], every_n_batches=10)) # Set up training algorithm logger.info("Initializing training algorithm") # if there is l2_regularization, dropout or random noise, we need to use the output of the modified graph # WORKING: try to catch and fix nan if exp_config['dropout'] < 1.0: if exp_config.get('nan_guard', False): from theano.compile.nanguardmode import NanGuardMode algorithm = GradientDescent(cost=cg.outputs[0], parameters=cg.parameters, step_rule=CompositeRule([ StepClipping( exp_config['step_clipping']), eval(exp_config['step_rule'])() ]), on_unused_sources='warn', theano_func_kwargs={ 'mode': NanGuardMode(nan_is_error=True, inf_is_error=True) }) else: algorithm = GradientDescent(cost=cg.outputs[0], parameters=cg.parameters, step_rule=CompositeRule([ StepClipping( exp_config['step_clipping']), eval(exp_config['step_rule'])() ]), on_unused_sources='warn') else: algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule([ StepClipping( exp_config['step_clipping']), eval(exp_config['step_rule'])() ]), on_unused_sources='warn') # enrich the logged information extensions.append(Timing(every_n_batches=100)) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=train_model, algorithm=algorithm, data_stream=masked_stream, extensions=extensions) # Train! main_loop.run()
def train(config, save_path, bokeh_name, params, bokeh_server, test_tag, use_load_ext, load_log, fast_start, validation_epochs, validation_batches, per_epochs, per_batches): root_path, extension = os.path.splitext(save_path) data = Data(**config['data']) # Build the main brick and initialize all parameters. recognizer = SpeechRecognizer( data.recordings_source, data.labels_source, data.eos_label, data.num_features, data.num_labels, name="recognizer", data_prepend_eos=data.prepend_eos, character_map=data.character_map, **config["net"]) for brick_path, attribute_dict in sorted( config['initialization'].items(), key=lambda (k, v): -k.count('/')): for attribute, value in attribute_dict.items(): brick, = Selector(recognizer).select(brick_path).bricks setattr(brick, attribute, value) brick.push_initialization_config() recognizer.initialize() # Separate attention_params to be handled differently # when regularization is applied attention = recognizer.generator.transition.attention attention_params = Selector(attention).get_parameters().values() logger.info( "Initialization schemes for all bricks.\n" "Works well only in my branch with __repr__ added to all them,\n" "there is an issue #463 in Blocks to do that properly.") def show_init_scheme(cur): result = dict() for attr in dir(cur): if attr.endswith('_init'): result[attr] = getattr(cur, attr) for child in cur.children: result[child.name] = show_init_scheme(child) return result logger.info(pprint.pformat(show_init_scheme(recognizer))) if params: logger.info("Load parameters from " + params) recognizer.load_params(params) if test_tag: tensor.TensorVariable.__str__ = tensor.TensorVariable.__repr__ __stream = data.get_stream("train") __data = next(__stream.get_epoch_iterator(as_dict=True)) recognizer.recordings.tag.test_value = __data[data.recordings_source] recognizer.recordings_mask.tag.test_value = __data[data.recordings_source + '_mask'] recognizer.labels.tag.test_value = __data[data.labels_source] recognizer.labels_mask.tag.test_value = __data[data.labels_source + '_mask'] theano.config.compute_test_value = 'warn' batch_cost = recognizer.get_cost_graph().sum() batch_size = named_copy(recognizer.recordings.shape[1], "batch_size") # Assumes constant batch size. `aggregation.mean` is not used because # of Blocks #514. cost = batch_cost / batch_size cost.name = "sequence_log_likelihood" logger.info("Cost graph is built") # Fetch variables useful for debugging. # It is important not to use any aggregation schemes here, # as it's currently impossible to spread the effect of # regularization on their variables, see Blocks #514. 
cost_cg = ComputationGraph(cost) r = recognizer energies, = VariableFilter( applications=[r.generator.readout.readout], name="output_0")( cost_cg) bottom_output, = VariableFilter( applications=[r.bottom.apply], name="output")( cost_cg) attended, = VariableFilter( applications=[r.generator.transition.apply], name="attended")( cost_cg) attended_mask, = VariableFilter( applications=[r.generator.transition.apply], name="attended_mask")( cost_cg) weights, = VariableFilter( applications=[r.generator.evaluate], name="weights")( cost_cg) max_recording_length = named_copy(r.recordings.shape[0], "max_recording_length") # To exclude subsampling related bugs max_attended_mask_length = named_copy(attended_mask.shape[0], "max_attended_mask_length") max_attended_length = named_copy(attended.shape[0], "max_attended_length") max_num_phonemes = named_copy(r.labels.shape[0], "max_num_phonemes") min_energy = named_copy(energies.min(), "min_energy") max_energy = named_copy(energies.max(), "max_energy") mean_attended = named_copy(abs(attended).mean(), "mean_attended") mean_bottom_output = named_copy(abs(bottom_output).mean(), "mean_bottom_output") weights_penalty = named_copy(monotonicity_penalty(weights, r.labels_mask), "weights_penalty") weights_entropy = named_copy(entropy(weights, r.labels_mask), "weights_entropy") mask_density = named_copy(r.labels_mask.mean(), "mask_density") cg = ComputationGraph([ cost, weights_penalty, weights_entropy, min_energy, max_energy, mean_attended, mean_bottom_output, batch_size, max_num_phonemes, mask_density]) # Regularization. It is applied explicitly to all variables # of interest, it could not be applied to the cost only as it # would not have effect on auxiliary variables, see Blocks #514. reg_config = config['regularization'] regularized_cg = cg if reg_config.get('dropout'): logger.info('apply dropout') regularized_cg = apply_dropout(cg, [bottom_output], 0.5) if reg_config.get('noise'): logger.info('apply noise') noise_subjects = [p for p in cg.parameters if p not in attention_params] regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise']) regularized_cost = regularized_cg.outputs[0] regularized_weights_penalty = regularized_cg.outputs[1] # Model is weird class, we spend lots of time arguing with Bart # what it should be. However it can already nice things, e.g. # one extract all the parameters from the computation graphs # and give them hierahical names. This help to notice when a # because of some bug a parameter is not in the computation # graph. model = SpeechModel(regularized_cost) params = model.get_parameter_dict() logger.info("Parameters:\n" + pprint.pformat( [(key, params[key].get_value().shape) for key in sorted(params.keys())], width=120)) # Define the training algorithm. 
train_conf = config['training'] clipping = StepClipping(train_conf['gradient_threshold']) clipping.threshold.name = "gradient_norm_threshold" rule_names = train_conf.get('rules', ['momentum']) core_rules = [] if 'momentum' in rule_names: logger.info("Using scaling and momentum for training") core_rules.append(Momentum(train_conf['scale'], train_conf['momentum'])) if 'adadelta' in rule_names: logger.info("Using AdaDelta for training") core_rules.append(AdaDelta(train_conf['decay_rate'], train_conf['epsilon'])) max_norm_rules = [] if reg_config.get('max_norm', False): logger.info("Apply MaxNorm") maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters) if reg_config.get('max_norm_exclude_lookup', False): maxnorm_subjects = [v for v in maxnorm_subjects if not isinstance(get_brick(v), LookupTable)] logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat([name for name, p in params.items() if p in maxnorm_subjects])) logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([name for name, p in params.items() if not p in maxnorm_subjects])) max_norm_rules = [ Restrict(VariableClipping(reg_config['max_norm'], axis=0), maxnorm_subjects)] algorithm = GradientDescent( cost=regularized_cost + reg_config.get("penalty_coof", .0) * regularized_weights_penalty / batch_size + reg_config.get("decay", .0) * l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2, parameters=params.values(), step_rule=CompositeRule( [clipping] + core_rules + max_norm_rules + # Parameters are not changed at all # when nans are encountered. [RemoveNotFinite(0.0)])) # More variables for debugging: some of them can be added only # after the `algorithm` object is created. observables = regularized_cg.outputs observables += [ algorithm.total_step_norm, algorithm.total_gradient_norm, clipping.threshold] for name, param in params.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements ** 0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5 step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' observables.append(stats) def attach_aggregation_schemes(variables): # Aggregation specification has to be factored out as a separate # function as it has to be applied at the very last stage # separately to training and validation observables. result = [] for var in variables: if var.name == 'weights_penalty': result.append(named_copy(aggregation.mean(var, batch_size), 'weights_penalty_per_recording')) elif var.name == 'weights_entropy': result.append(named_copy(aggregation.mean( var, recognizer.labels_mask.sum()), 'weights_entropy_per_label')) else: result.append(var) return result # Build main loop. 
logger.info("Initialize extensions") extensions = [] if use_load_ext and params: extensions.append(Load(params, load_iteration_state=True, load_log=True)) if load_log and params: extensions.append(LoadLog(params)) extensions += [ Timing(after_batch=True), CGStatistics(), #CodeVersion(['lvsr']), ] extensions.append(TrainingDataMonitoring( [observables[0], algorithm.total_gradient_norm, algorithm.total_step_norm, clipping.threshold, max_recording_length, max_attended_length, max_attended_mask_length], after_batch=True)) average_monitoring = TrainingDataMonitoring( attach_aggregation_schemes(observables), prefix="average", every_n_batches=10) extensions.append(average_monitoring) validation = DataStreamMonitoring( attach_aggregation_schemes([cost, weights_entropy, weights_penalty]), data.get_stream("valid"), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=validation_epochs, every_n_batches=validation_batches, after_training=False) extensions.append(validation) recognizer.init_beam_search(10) per = PhonemeErrorRate(recognizer, data.get_dataset("valid")) per_monitoring = DataStreamMonitoring( [per], data.get_stream("valid", batches=False, shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=per_epochs, every_n_batches=per_batches, after_training=False) extensions.append(per_monitoring) track_the_best_per = TrackTheBest( per_monitoring.record_name(per)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_likelihood = TrackTheBest( validation.record_name(cost)).set_conditions( before_first_epoch=True, after_epoch=True) extensions += [track_the_best_likelihood, track_the_best_per] extensions.append(AdaptiveClipping( algorithm.total_gradient_norm.name, clipping, train_conf['gradient_threshold'], decay_rate=0.998, burnin_period=500)) extensions += [ SwitchOffLengthFilter(data.length_filter, after_n_batches=train_conf.get('stop_filtering')), FinishAfter(after_n_batches=train_conf['num_batches'], after_n_epochs=train_conf['num_epochs']) .add_condition(["after_batch"], _gradient_norm_is_none), # Live plotting: requires launching `bokeh-server` # and allows to see what happens online. 
Plot(bokeh_name if bokeh_name else os.path.basename(save_path), [# Plot 1: training and validation costs [average_monitoring.record_name(regularized_cost), validation.record_name(cost)], # Plot 2: gradient norm, [average_monitoring.record_name(algorithm.total_gradient_norm), average_monitoring.record_name(clipping.threshold)], # Plot 3: phoneme error rate [per_monitoring.record_name(per)], # Plot 4: training and validation mean weight entropy [average_monitoring._record_name('weights_entropy_per_label'), validation._record_name('weights_entropy_per_label')], # Plot 5: training and validation monotonicity penalty [average_monitoring._record_name('weights_penalty_per_recording'), validation._record_name('weights_penalty_per_recording')]], every_n_batches=10, server_url=bokeh_server), Checkpoint(save_path, before_first_epoch=not fast_start, after_epoch=True, every_n_batches=train_conf.get('save_every_n_batches'), save_separately=["model", "log"], use_cpickle=True) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_per.notification_name), (root_path + "_best" + extension,)) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_likelihood.notification_name), (root_path + "_best_ll" + extension,)), ProgressBar(), Printing(every_n_batches=1, attribute_filter=PrintingFilterList() )] # Save the config into the status log = TrainingLog() log.status['_config'] = repr(config) main_loop = MainLoop( model=model, log=log, algorithm=algorithm, data_stream=data.get_stream("train"), extensions=extensions) main_loop.run()
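Unlike the other examples, the recognizer above selects its single dropout target with the applications= argument of VariableFilter, pulling out the output of r.bottom.apply before dropping it at 0.5. A reduced sketch of that selection, with a lone Linear brick standing in for the recognizer's bottom network:

import theano.tensor as T
from blocks.bricks import Linear, Tanh
from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph, apply_dropout
from blocks.initialization import Constant, IsotropicGaussian

features = T.matrix('features')
bottom = Linear(input_dim=120, output_dim=60, name='bottom',
                weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.0))
bottom.initialize()
cost = Tanh().apply(bottom.apply(features)).sum()   # stand-in for the real cost
cost.name = 'cost'

cg = ComputationGraph(cost)
bottom_output, = VariableFilter(applications=[bottom.apply],
                                name='output')(cg.variables)
regularized_cg = apply_dropout(cg, [bottom_output], 0.5)
regularized_cost = regularized_cg.outputs[0]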
# COST AND ERROR MEASURE cost = Softmax().categorical_cross_entropy(label, output).mean() cost.name = 'cost' error_rate = tensor.neq(tensor.argmax(output, axis=1), label).mean() error_rate.name = 'error_rate' # REGULARIZATION cg = ComputationGraph([cost, error_rate]) if weight_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, weight_noise) if dropout > 0: cg = apply_dropout(cg, [eeg1, eeg2, data1, data2] + VariableFilter(name='output', bricks=fc.linear_transformations[:-1])(cg), dropout) # for vfilter, p in dropout_locs: # cg = apply_dropout(cg, vfilter(cg), p) [cost_reg, error_rate_reg] = cg.outputs # INITIALIZATION for brick in [conv_eeg, maxpool_eeg, conv_eeg2, maxpool_eeg2, conv, maxpool, conv2, maxpool2, fc]: brick.weights_init = weights_init brick.biases_init = biases_init brick.initialize() # ========================================================================================== # THE INFRASTRUCTURE # ==========================================================================================
def main(mode, save_to, num_epochs, load_params=None, feature_maps=None, mlp_hiddens=None, conv_sizes=None, pool_sizes=None, stride=None, repeat_times=None, batch_size=None, num_batches=None, algo=None, test_set=None, valid_examples=None, dropout=None, max_norm=None, weight_decay=None, batch_norm=None): if feature_maps is None: feature_maps = [20, 50, 50] if mlp_hiddens is None: mlp_hiddens = [500] if conv_sizes is None: conv_sizes = [5, 5, 5] if pool_sizes is None: pool_sizes = [2, 2, 2] if repeat_times is None: repeat_times = [1, 1, 1] if batch_size is None: batch_size = 500 if valid_examples is None: valid_examples = 2500 if stride is None: stride = 1 if test_set is None: test_set = 'test' if algo is None: algo = 'rmsprop' if batch_norm is None: batch_norm = False image_size = (128, 128) output_size = 2 if (len(feature_maps) != len(conv_sizes) or len(feature_maps) != len(pool_sizes) or len(feature_maps) != len(repeat_times)): raise ValueError("OMG, inconsistent arguments") # Use ReLUs everywhere and softmax for the final prediction conv_activations = [Rectifier() for _ in feature_maps] mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()] convnet = LeNet(conv_activations, 3, image_size, stride=stride, filter_sizes=zip(conv_sizes, conv_sizes), feature_maps=feature_maps, pooling_sizes=zip(pool_sizes, pool_sizes), repeat_times=repeat_times, top_mlp_activations=mlp_activations, top_mlp_dims=mlp_hiddens + [output_size], border_mode='full', batch_norm=batch_norm, weights_init=Glorot(), biases_init=Constant(0)) # We push initialization config to set different initialization schemes # for convolutional layers. convnet.initialize() logging.info("Input dim: {} {} {}".format( *convnet.children[0].get_dim('input_'))) for i, layer in enumerate(convnet.layers): if isinstance(layer, Activation): logging.info("Layer {} ({})".format( i, layer.__class__.__name__)) else: logging.info("Layer {} ({}) dim: {} {} {}".format( i, layer.__class__.__name__, *layer.get_dim('output'))) single_x = tensor.tensor3('image_features') x = tensor.tensor4('image_features') single_y = tensor.lvector('targets') y = tensor.lmatrix('targets') # Training with batch_normalization(convnet): probs = convnet.apply(x) cost = (CategoricalCrossEntropy().apply(y.flatten(), probs) .copy(name='cost')) error_rate = (MisclassificationRate().apply(y.flatten(), probs) .copy(name='error_rate')) cg = ComputationGraph([cost, error_rate]) extra_updates = [] if batch_norm: # batch norm: logger.debug("Apply batch norm") pop_updates = get_batch_normalization_updates(cg) # p stands for population mean # m stands for minibatch alpha = 0.005 extra_updates = [(p, m * alpha + p * (1 - alpha)) for p, m in pop_updates] population_statistics = [p for p, m in extra_updates] if dropout: relu_outputs = VariableFilter(bricks=[Rectifier], roles=[OUTPUT])(cg) cg = apply_dropout(cg, relu_outputs, dropout) cost, error_rate = cg.outputs if weight_decay: logger.debug("Apply weight decay {}".format(weight_decay)) cost += weight_decay * l2_norm(cg.parameters) cost.name = 'cost' # Validation valid_probs = convnet.apply_5windows(single_x) valid_cost = (CategoricalCrossEntropy().apply(single_y, valid_probs) .copy(name='cost')) valid_error_rate = (MisclassificationRate().apply( single_y, valid_probs).copy(name='error_rate')) model = Model([cost, error_rate]) if load_params: logger.info("Loaded params from {}".format(load_params)) with open(load_params, 'r') as src: model.set_parameter_values(load_parameters(src)) # Training stream with random cropping train = 
DogsVsCats(("train",), subset=slice(None, 25000 - valid_examples, None)) train_str = DataStream( train, iteration_scheme=ShuffledScheme(train.num_examples, batch_size)) train_str = add_transformers(train_str, random_crop=True) # Validation stream without cropping valid = DogsVsCats(("train",), subset=slice(25000 - valid_examples, None, None)) valid_str = DataStream( valid, iteration_scheme=SequentialExampleScheme(valid.num_examples)) valid_str = add_transformers(valid_str) if mode == 'train': directory, _ = os.path.split(sys.argv[0]) env = dict(os.environ) env['THEANO_FLAGS'] = 'floatX=float32' port = numpy.random.randint(1025, 10000) server = subprocess.Popen( [directory + '/server.py', str(25000 - valid_examples), str(batch_size), str(port)], env=env, stderr=subprocess.STDOUT) train_str = ServerDataStream( ('image_features', 'targets'), produces_examples=False, port=port) save_to_base, save_to_extension = os.path.splitext(save_to) # Train with simple SGD if algo == 'rmsprop': step_rule = RMSProp(decay_rate=0.999, learning_rate=0.0003) elif algo == 'adam': step_rule = Adam() else: assert False if max_norm: conv_params = VariableFilter(bricks=[Convolutional], roles=[WEIGHT])(cg) linear_params = VariableFilter(bricks=[Linear], roles=[WEIGHT])(cg) step_rule = CompositeRule( [step_rule, Restrict(VariableClipping(max_norm, axis=0), linear_params), Restrict(VariableClipping(max_norm, axis=(1, 2, 3)), conv_params)]) algorithm = GradientDescent( cost=cost, parameters=model.parameters, step_rule=step_rule) algorithm.add_updates(extra_updates) # `Timing` extension reports time for reading data, aggregating a batch # and monitoring; # `ProgressBar` displays a nice progress bar during training. extensions = [Timing(every_n_batches=100), FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches), DataStreamMonitoring( [valid_cost, valid_error_rate], valid_str, prefix="valid"), TrainingDataMonitoring( [cost, error_rate, aggregation.mean(algorithm.total_gradient_norm)], prefix="train", after_epoch=True), TrackTheBest("valid_error_rate"), Checkpoint(save_to, save_separately=['log'], parameters=cg.parameters + (population_statistics if batch_norm else []), before_training=True, after_epoch=True) .add_condition( ['after_epoch'], OnLogRecord("valid_error_rate_best_so_far"), (save_to_base + '_best' + save_to_extension,)), Printing(every_n_batches=100)] model = Model(cost) main_loop = MainLoop( algorithm, train_str, model=model, extensions=extensions) try: main_loop.run() finally: server.terminate() elif mode == 'test': classify = theano.function([single_x], valid_probs.argmax()) test = DogsVsCats((test_set,)) test_str = DataStream( test, iteration_scheme=SequentialExampleScheme(test.num_examples)) test_str = add_transformers(test_str) correct = 0 with open(save_to, 'w') as dst: print("id", "label", sep=',', file=dst) for index, example in enumerate(test_str.get_epoch_iterator()): image = example[0] prediction = classify(image) print(index + 1, classify(image), sep=',', file=dst) if len(example) > 1 and prediction == example[1]: correct += 1 print(correct / float(test.num_examples)) else: assert False
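The LeNet script above drops every Rectifier output and then unpacks both monitored quantities from the transformed graph, relying on apply_dropout keeping the outputs in their original order. A small MLP sketch of the same filter (the image size, layer dims, and the 0.5 rate are placeholders):

import theano.tensor as T
from blocks.bricks import MLP, Rectifier, Softmax
from blocks.bricks.cost import CategoricalCrossEntropy, MisclassificationRate
from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph, apply_dropout
from blocks.initialization import Constant, IsotropicGaussian
from blocks.roles import OUTPUT

x = T.matrix('features')
y = T.lmatrix('targets')
mlp = MLP(activations=[Rectifier(), Rectifier(), Softmax()],
          dims=[3 * 128 * 128, 500, 500, 2],
          weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.0))
mlp.initialize()
probs = mlp.apply(x)
cost = CategoricalCrossEntropy().apply(y.flatten(), probs).copy(name='cost')
error_rate = MisclassificationRate().apply(y.flatten(), probs).copy(name='error_rate')

cg = ComputationGraph([cost, error_rate])
relu_outputs = VariableFilter(bricks=[Rectifier], roles=[OUTPUT])(cg.variables)
cg = apply_dropout(cg, relu_outputs, 0.5)
cost, error_rate = cg.outputs                  # same order as before the transform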
def __init__(self, ref_data, output_dim): input_dim = ref_data.shape[1] ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32), name='ref_data') rng = RandomStreams() ae_bricks = [] ae_input = ref_data_sh ae_costs = [] for i, (idim, odim) in enumerate(zip([input_dim] + ae_dims[:-1], ae_dims)): ae_mlp = MLP(activations=[ae_activations[i]], dims=[idim, odim], name='enc%i'%i) enc = ae_mlp.apply(ae_input) enc_n = ae_mlp.apply(ae_input + rng.normal(size=ae_input.shape, std=ae_f_noise_std)) ae_mlp_dec = MLP(activations=[ae_activations[i]], dims=[odim, idim], name='dec%i'%i) dec = ae_mlp_dec.apply(enc_n) cost = tensor.sqrt(((ae_input - dec) ** 2).sum(axis=1)).mean() + \ ae_l1_pen * abs(enc).sum(axis=1).mean() ae_costs.append(cost) ae_input = enc ae_bricks = ae_bricks + [ae_mlp, ae_mlp_dec] self.ae_costs = ae_costs ref_data_enc = ae_input # Construct the model j = tensor.lvector('j') r = ref_data_enc[j, :] x = tensor.fmatrix('x') y = tensor.ivector('y') # input_dim must be nr mlp = MLP(activations=activation_functions, dims=[ae_dims[-1]] + hidden_dims + [n_inter], name='inter_gen') mlp2 = MLP(activations=activation_functions_2 + [None], dims=[n_inter] + hidden_dims_2 + [output_dim], name='end_mlp') inter_weights = mlp.apply(r) if inter_bias == None: ibias = Bias(n_inter) ibias.biases_init = Constant(0) ibias.initialize() inter = ibias.apply(tensor.dot(x, inter_weights)) else: inter = tensor.dot(x, inter_weights) - inter_bias inter = inter_act_fun.apply(inter) final = mlp2.apply(inter) cost = Softmax().categorical_cross_entropy(y, final) confidence = Softmax().apply(final) pred = final.argmax(axis=1) # error_rate = tensor.neq(y, pred).mean() ber = balanced_error_rate.ber(y, pred) # Initialize parameters for brick in ae_bricks + [mlp, mlp2]: brick.weights_init = IsotropicGaussian(0.01) brick.biases_init = Constant(0.001) brick.initialize() # apply regularization cg = ComputationGraph([cost, ber]) if r_dropout != 0: # - dropout on input vector r : r_dropout cg = apply_dropout(cg, [r], r_dropout) if x_dropout != 0: cg = apply_dropout(cg, [x], x_dropout) if s_dropout != 0: # - dropout on intermediate layers of first mlp : s_dropout s_dropout_vars = list(set(VariableFilter(bricks=[Tanh], name='output') (ComputationGraph([inter_weights]))) - set([inter_weights])) cg = apply_dropout(cg, s_dropout_vars, s_dropout) if i_dropout != 0: # - dropout on input to second mlp : i_dropout cg = apply_dropout(cg, [inter], i_dropout) if a_dropout != 0: # - dropout on hidden layers of second mlp : a_dropout a_dropout_vars = list(set(VariableFilter(bricks=[Tanh], name='output') (ComputationGraph([final]))) - set([inter_weights]) - set(s_dropout_vars)) cg = apply_dropout(cg, a_dropout_vars, a_dropout) if r_noise_std != 0: cg = apply_noise(cg, [r], r_noise_std) if w_noise_std != 0: # - apply noise on weight variables weight_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, weight_vars, w_noise_std) [cost_reg, ber_reg] = cg.outputs if s_l1pen != 0: s_weights = VariableFilter(bricks=mlp.linear_transformations, roles=[WEIGHT])(cg) cost_reg = cost_reg + s_l1pen * sum(abs(w).sum() for w in s_weights) if i_l1pen != 0: cost_reg = cost_reg + i_l1pen * abs(inter).sum() if a_l1pen != 0: a_weights = VariableFilter(bricks=mlp2.linear_transformations, roles=[WEIGHT])(cg) cost_reg = cost_reg + a_l1pen * sum(abs(w).sum() for w in a_weights) self.cost = cost self.cost_reg = cost_reg self.ber = ber self.ber_reg = ber_reg self.pred = pred self.confidence = confidence
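# A minimal sketch of the regularization pipeline used above, on a small
# stand-in MLP with hypothetical sizes and penalty weights: dropout on chosen
# intermediate variables, Gaussian noise on all WEIGHT-role variables, the
# regularized outputs read back from cg.outputs in the same order they were
# passed to ComputationGraph, and an L1 penalty added to the regularized cost.
from theano import tensor
from blocks.bricks import MLP, Tanh, Softmax
from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph, apply_dropout, apply_noise
from blocks.initialization import IsotropicGaussian, Constant
from blocks.roles import WEIGHT

x = tensor.matrix('x')
y = tensor.ivector('y')
mlp = MLP(activations=[Tanh(), None], dims=[100, 50, 10],
          weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
mlp.initialize()
out = mlp.apply(x)
cost = Softmax().categorical_cross_entropy(y, out)
error_rate = tensor.neq(y, out.argmax(axis=1)).mean()

cg = ComputationGraph([cost, error_rate])
hidden = VariableFilter(bricks=[Tanh], name='output')(cg.variables)
cg = apply_dropout(cg, hidden, 0.5)
cg = apply_noise(cg, VariableFilter(roles=[WEIGHT])(cg.variables), 0.01)
[cost_reg, error_rate_reg] = cg.outputs

# Extra penalties are added to the regularized cost, not injected into the graph.
weights = VariableFilter(bricks=mlp.linear_transformations,
                         roles=[WEIGHT])(cg.variables)
cost_reg = cost_reg + 1e-4 * sum(abs(w).sum() for w in weights)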
def __init__(self): inp = tensor.tensor3('input') inp = inp.dimshuffle(1,0,2) target = tensor.matrix('target') target = target.reshape((target.shape[0],)) product = tensor.lvector('product') missing = tensor.eq(inp, 0) train_input_mean = 1470614.1 train_input_std = 3256577.0 trans_1 = tensor.concatenate((inp[1:,:,:],tensor.zeros((1,inp.shape[1],inp.shape[2]))), axis=0) trans_2 = tensor.concatenate((tensor.zeros((1,inp.shape[1],inp.shape[2])), inp[:-1,:,:]), axis=0) inp = tensor.switch(missing,(trans_1+trans_2)/2, inp) lookup = LookupTable(length = 352, dim=4*hidden_dim) product_embed= lookup.apply(product) salut = tensor.concatenate((inp, missing),axis =2) linear = Linear(input_dim=input_dim+1, output_dim=4*hidden_dim, name="lstm_in") inter = linear.apply(salut) inter = inter + product_embed[None,:,:] lstm = LSTM(dim=hidden_dim, activation=activation_function, name="lstm") hidden, cells = lstm.apply(inter) linear2= Linear(input_dim = hidden_dim, output_dim = out_dim, name="ouput_linear") pred = linear2.apply(hidden[-1])*train_input_std + train_input_mean pred = pred.reshape((product.shape[0],)) cost = tensor.mean(abs((pred-target)/target)) # Initialize all bricks for brick in [linear, linear2, lstm, lookup]: brick.weights_init = IsotropicGaussian(0.1) brick.biases_init = Constant(0.) brick.initialize() # Apply noise and dropout cg = ComputationGraph([cost]) if w_noise_std > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, w_noise_std) if i_dropout > 0: cg = apply_dropout(cg, [hidden], i_dropout) [cost_reg] = cg.outputs cost_reg += 1e-20 if cost_reg is not cost: self.cost = cost self.cost_reg = cost_reg cost_reg.name = 'cost_reg' cost.name = 'cost' self.sgd_cost = cost_reg self.monitor_vars = [[cost, cost_reg]] else: self.cost = cost cost.name = 'cost' self.sgd_cost = cost self.monitor_vars = [[cost]] self.pred = pred pred.name = 'pred'
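# A minimal sketch of the missing-value imputation above: zero entries in the
# time-major input are treated as missing and replaced by the mean of their
# temporal neighbours, obtained by shifting the sequence one step in each
# direction and zero-padding the ends.
from theano import tensor

inp = tensor.tensor3('input')  # (time, batch, features)
missing = tensor.eq(inp, 0)
pad = tensor.zeros((1, inp.shape[1], inp.shape[2]))
next_step = tensor.concatenate((inp[1:, :, :], pad), axis=0)   # value at t+1
prev_step = tensor.concatenate((pad, inp[:-1, :, :]), axis=0)  # value at t-1
imputed = tensor.switch(missing, (next_step + prev_step) / 2, inp)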
def set_up_predictor(self, nmt_model_path): """Initializes the predictor with the given NMT model. Code following ``blocks.machine_translation.main``. """ self.src_vocab_size = self.config['src_vocab_size'] self.trgt_vocab_size = self.config['trg_vocab_size'] # Create Theano variables logging.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') sampling_input = tensor.lmatrix('input') # Construct model logging.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder(self.config['src_vocab_size'], self.config['enc_embed'], self.config['enc_nhids']) decoder = Decoder(self.config['trg_vocab_size'], self.config['dec_embed'], self.config['dec_nhids'], self.config['enc_nhids'] * 2) cost = decoder.cost( encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask) logging.info('Creating computational graph') cg = ComputationGraph(cost) # Initialize model (TODO: really necessary?) logging.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( self.config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() # Apply dropout for regularization (TODO: remove?) if self.config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog logging.info('Applying dropout') dropout_inputs = [x for x in cg.intermediary_variables if x.name == 'maxout_apply_output'] cg = apply_dropout(cg, dropout_inputs, self.config['dropout']) # Apply weight noise for regularization (TODO: remove?) 
if self.config['weight_noise_ff'] > 0.0: logging.info('Applying weight noise to ff layers') enc_params = Selector(encoder.lookup).get_params().values() enc_params += Selector(encoder.fwd_fork).get_params().values() enc_params += Selector(encoder.back_fork).get_params().values() dec_params = Selector( decoder.sequence_generator.readout).get_params().values() dec_params += Selector( decoder.sequence_generator.fork).get_params().values() dec_params += Selector(decoder.state_init).get_params().values() cg = apply_noise(cg, enc_params+dec_params, self.config['weight_noise_ff']) # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logging.debug("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logging.debug(' {:15}: {}'.format(shape, count)) logging.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge(Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logging.debug("Parameter names: ") for name, value in enc_dec_param_dict.items(): logging.debug(' {:15}: {}'.format(value.get_value().shape, name)) logging.info("Total number of parameters: {}" .format(len(enc_dec_param_dict))) # Set up training model logging.info("Building model") # Set extensions logging.info("Initializing extensions") # Set up beam search and sampling computation graphs if necessary logging.info("Building sampling model") sampling_representation = encoder.apply( sampling_input, tensor.ones(sampling_input.shape)) generated = decoder.generate(sampling_input, sampling_representation) search_model = Model(generated) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs # Follows blocks.machine_translation.BleuValidator.__init__ self.source_sentence = sampling_input self.samples = samples self.model = search_model self.normalize = True self.verbose = self.config.get('val_set_out', None) # Reload model if necessary if self.config['reload']: loader = LoadNMT(nmt_model_path, self.config['saveto'], search_model) loader.load_weights() self.best_models = [] self.val_bleu_curve = [] self.search_algorithm = MyopticSearch(samples=samples) self.search_algorithm.compile()
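# A minimal stand-in (hypothetical Linear/Maxout readout, not the NMT decoder)
# for the dropout trick used above: the maxout output is located purely by its
# Theano name among cg.intermediary_variables and replaced by a dropped-out
# copy, leaving everything upstream and downstream untouched.
from theano import tensor
from blocks.bricks import Linear, Maxout, Softmax
from blocks.graph import ComputationGraph, apply_dropout
from blocks.initialization import IsotropicGaussian, Constant

x = tensor.matrix('x')
y = tensor.ivector('y')
pre = Linear(input_dim=100, output_dim=200, name='pre',
             weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
maxout = Maxout(num_pieces=2, name='maxout')
readout = Linear(input_dim=100, output_dim=10, name='readout',
                 weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
pre.initialize()
readout.initialize()

cost = Softmax().categorical_cross_entropy(
    y, readout.apply(maxout.apply(pre.apply(x))))
cg = ComputationGraph(cost)

dropout_inputs = [v for v in cg.intermediary_variables
                  if v.name == 'maxout_apply_output']
cg = apply_dropout(cg, dropout_inputs, 0.5)
cost = cg.outputs[0]  # train on the regularized cost, as in the snippet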
def main(num_epochs, feature_maps=None, mlp_hiddens=None, conv_sizes=None, pool_sizes=None, batch_size=500, num_batches=None): ############# Architecture ############# if feature_maps is None: feature_maps = [20, 50] if mlp_hiddens is None: mlp_hiddens = [500] if conv_sizes is None: conv_sizes = [5, 5] if pool_sizes is None: pool_sizes = [2, 2] image_size = (32, 32) batch_size = 50 output_size = 2 learningRate = 0.1 num_epochs = 10 num_batches = None delta = 0.01 drop_prob = 0.5 weight_noise = 0.75 # Use ReLUs everywhere and softmax for the final prediction conv_activations = [Rectifier() for _ in feature_maps] mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()] convnet = LeNet(conv_activations, 3, image_size, filter_sizes=zip(conv_sizes, conv_sizes), feature_maps=feature_maps, pooling_sizes=zip(pool_sizes, pool_sizes), top_mlp_activations=mlp_activations, top_mlp_dims=mlp_hiddens + [output_size], border_mode='full', weights_init=Uniform(width=.2), biases_init=Constant(0)) # We push initialization config to set different initialization schemes # for convolutional layers. convnet.push_initialization_config() convnet.layers[0].weights_init = Uniform(width=.2) convnet.layers[1].weights_init = Uniform(width=.09) convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08) convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11) convnet.initialize() logging.info( "Input dim: {} {} {}".format(*convnet.children[0].get_dim('input_'))) for i, layer in enumerate(convnet.layers): if isinstance(layer, Activation): logging.info("Layer {} ({})".format(i, layer.__class__.__name__)) else: logging.info("Layer {} ({}) dim: {} {} {}".format( i, layer.__class__.__name__, *layer.get_dim('output'))) x = tensor.tensor4('image_features') y = tensor.lmatrix('targets') probs = (convnet.apply(x)).copy(name='probs') # Computational Graph just for cost for drop_out and noise application cg_probs = ComputationGraph([probs]) inputs = VariableFilter(roles=[INPUT])(cg_probs.variables) weights = VariableFilter(roles=[FILTER, WEIGHT])(cg_probs.variables) ############# Regularization ############# #regularization = 0 logger.info('Applying regularization') regularization = delta * sum([(W**2).mean() for W in weights]) probs.name = "reg_probs" ############# Guaussian Noise ############# logger.info('Applying Gaussian noise') cg_train = apply_noise(cg_probs, weights, weight_noise) ############# Dropout ############# logger.info('Applying dropout') cg_probs = apply_dropout(cg_probs, inputs, drop_prob) dropped_out = VariableFilter(roles=[DROPOUT])(cg_probs.variables) inputs_referenced = [var.tag.replacement_of for var in dropped_out] set(inputs) == set(inputs_referenced) ############# Batch normalization ############# # recalculate probs after dropout and noise and regularization: probs = cg_probs.outputs[0] + regularization cost = (CategoricalCrossEntropy().apply(y.flatten(), probs).copy(name='cost')) error_rate = (MisclassificationRate().apply(y.flatten(), probs).copy(name='error_rate')) cg = ComputationGraph([probs, cost, error_rate]) cg = apply_batch_normalization(cg) ########### Loading images ##################### from fuel.datasets.dogs_vs_cats import DogsVsCats from fuel.streams import DataStream, ServerDataStream from fuel.schemes import ShuffledScheme from fuel.transformers.image import RandomFixedSizeCrop, MinimumImageDimensions, Random2DRotation from fuel.transformers import Flatten, Cast, ScaleAndShift def create_data(data): stream = DataStream(data, 
iteration_scheme=ShuffledScheme( data.num_examples, batch_size)) stream_downscale = MinimumImageDimensions( stream, image_size, which_sources=('image_features', )) stream_rotate = Random2DRotation(stream_downscale, which_sources=('image_features', )) stream_max = ScikitResize(stream_rotate, image_size, which_sources=('image_features', )) stream_scale = ScaleAndShift(stream_max, 1. / 255, 0, which_sources=('image_features', )) stream_cast = Cast(stream_scale, dtype='float32', which_sources=('image_features', )) #stream_flat = Flatten(stream_scale, which_sources=('image_features',)) return stream_cast stream_data_train = create_data( DogsVsCats(('train', ), subset=slice(0, 20))) stream_data_test = create_data( DogsVsCats(('train', ), subset=slice(20, 30))) # Train with simple SGD algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Scale(learning_rate=learningRate)) #algorithm = GradientDescent(cost=cost, parameters=cg.parameters,step_rule=Adam(0.001)) #algorithm.add_updates(extra_updates) # `Timing` extension reports time for reading data, aggregating a batch and monitoring; # `ProgressBar` displays a nice progress bar during training. extensions = [] extensions.append(Timing()) extensions.append( FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches)) extensions.append( DataStreamMonitoring([cost, error_rate], stream_data_test, prefix="valid")) extensions.append( TrainingDataMonitoring([ cost, error_rate, aggregation.mean(algorithm.total_gradient_norm) ], prefix="train", after_epoch=True)) #extensions.append(Checkpoint(save_to)) extensions.append(ProgressBar()) extensions.append(Printing()) logger.info("Building the model") model = Model(cost) main_loop = MainLoop(algorithm, stream_data_train, model=model, extensions=extensions) main_loop.run()
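# A minimal sketch of the sanity check attempted above (which, as written, is
# a bare comparison with no effect and would normally be an assertion): after
# apply_dropout, every replacement variable carries the DROPOUT role and
# remembers the variable it replaced in var.tag.replacement_of.
from theano import tensor
from blocks.bricks import MLP, Rectifier, Softmax
from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph, apply_dropout
from blocks.initialization import IsotropicGaussian, Constant
from blocks.roles import DROPOUT, INPUT

x = tensor.matrix('features')
mlp = MLP([Rectifier(), Softmax()], [784, 100, 10],
          weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
mlp.initialize()
cg = ComputationGraph(mlp.apply(x))

inputs = VariableFilter(roles=[INPUT])(cg.variables)
cg_dropout = apply_dropout(cg, inputs, 0.5)

dropped = VariableFilter(roles=[DROPOUT])(cg_dropout.variables)
replaced = [v.tag.replacement_of for v in dropped]
assert set(inputs) == set(replaced)  # every requested input was replaced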
def main(config, tr_stream, dev_stream, use_bokeh=False, the_task=None, the_track=None): config['the_task'] = the_task # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') sampling_input = tensor.lmatrix('input') # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder( # end_embed is dimension of word embedding matrix in encoder; enc_nhids number of hidden units in encoder GRU config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) decoder = Decoder( config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2, config['use_attention'], cost_type=config['error_fct']) cost = decoder.cost( encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask) testVar = decoder.getTestVar( encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask) logger.info('Creating computational graph') cg = ComputationGraph(cost) # Initialize model logger.info('Initializing model') my_rng = numpy.random.RandomState(config['rng_value']) if config['identity_init']: encoder.weights_init = decoder.weights_init = Identity() else: encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.rng = decoder.rng = my_rng encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() encoder.bidir.prototype.rng = my_rng decoder.transition.weights_init = Orthogonal() decoder.transition.rng = my_rng encoder.initialize() decoder.initialize() # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog logger.info('Applying dropout') dropout_inputs = [x for x in cg.intermediary_variables if x.name == 'maxout_apply_output'] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # Apply weight noise for regularization if config['weight_noise_ff'] > 0.0: logger.info('Applying weight noise to ff layers') enc_params = Selector(encoder.lookup).get_params().values() enc_params += Selector(encoder.fwd_fork).get_params().values() enc_params += Selector(encoder.back_fork).get_params().values() dec_params = Selector( decoder.sequence_generator.readout).get_params().values() dec_params += Selector( decoder.sequence_generator.fork).get_params().values() dec_params += Selector(decoder.state_init).get_params().values() cg = apply_noise(cg, enc_params+dec_params, config['weight_noise_ff'], seed=my_rng) cost = cg.outputs[0] # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge(Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.items(): logger.info(' {:15}: {}'.format(value.get_value().shape, name)) logger.info("Total number of parameters: {}" .format(len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # Set extensions 
logger.info("Initializing extensions") # this is ugly code and done, because I am not sure if the order of the extensions is important if 'track2' in config['saveto']: # less epochs for track 2, because of more data if config['early_stopping']: extensions = [ FinishAfter(after_n_epochs=config['finish_after']/2), #FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] else: extensions = [ FinishAfter(after_n_epochs=config['finish_after']/2), #FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] else: if config['early_stopping']: extensions = [ FinishAfter(after_n_epochs=config['finish_after']), #FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] else: extensions = [ FinishAfter(after_n_epochs=config['finish_after']), #FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] # Set up beam search and sampling computation graphs if necessary if config['hook_samples'] >= 1: logger.info("Building sampling model") sampling_representation = encoder.apply( sampling_input, tensor.ones(sampling_input.shape)) generated = decoder.generate(sampling_input, sampling_representation) search_model = Model(generated) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs # Add sampling if config['hook_samples'] >= 1: logger.info("Building sampler") extensions.append( Sampler(model=search_model, data_stream=tr_stream, hook_samples=config['hook_samples'], #every_n_batches=1, every_n_batches=config['sampling_freq'], src_vocab_size=8)) #src_vocab_size=config['src_vocab_size'])) # Add early stopping based on bleu if config['val_set'] is not None: logger.info("Building accuracy validator") extensions.append( AccuracyValidator(sampling_input, samples=samples, config=config, model=search_model, data_stream=dev_stream, after_training=True, #after_epoch=True)) every_n_epochs=5)) else: logger.info("No validation set given for this language") # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # Set up training algorithm logger.info("Initializing training algorithm") algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=CompositeRule([StepClipping(config['step_clipping']), eval(config['step_rule'])()]) ) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop( model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions ) # Train! main_loop.run()
def __init__(self, ref_data, output_dim): input_dim = ref_data.shape[1] ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32), name='ref_data') # Construct the model j = tensor.lvector('j') r = ref_data_sh[j, :] x = tensor.fmatrix('x') y = tensor.ivector('y') # input_dim must be nr mlp0 = MLP(activations=activation_functions_0, dims=[input_dim] + hidden_dims_0, name='e0') mlp0vs = MLP(activations=[None], dims=[hidden_dims_0[-1], input_dim], name='de0') mlp1 = MLP(activations=activation_functions_1, dims=[hidden_dims_0[-1]] + hidden_dims_1 + [n_inter], name='inter_gen') mlp2 = MLP(activations=activation_functions_2 + [None], dims=[n_inter] + hidden_dims_2 + [output_dim], name='end_mlp') encod = mlp0.apply(r) rprime = mlp0vs.apply(encod) inter_weights = mlp1.apply(encod) ibias = Bias(n_inter) ibias.biases_init = Constant(0) ibias.initialize() inter = inter_act_fun.apply(ibias.apply(tensor.dot(x, inter_weights))) final = mlp2.apply(inter) cost = Softmax().categorical_cross_entropy(y, final) confidence = Softmax().apply(final) pred = final.argmax(axis=1) error_rate = tensor.neq(y, pred).mean() # Initialize parameters for brick in [mlp0, mlp0vs, mlp1, mlp2]: brick.weights_init = IsotropicGaussian(0.01) brick.biases_init = Constant(0.001) brick.initialize() # apply regularization cg = ComputationGraph([cost, error_rate]) if r_dropout != 0: # - dropout on input vector r : r_dropout cg = apply_dropout(cg, [r], r_dropout) if s_dropout != 0: # - dropout on intermediate layers of first mlp : s_dropout s_dropout_vars = list( set( VariableFilter(bricks=[Tanh], name='output') (ComputationGraph([inter_weights]))) - set([inter_weights])) cg = apply_dropout(cg, s_dropout_vars, s_dropout) if i_dropout != 0: # - dropout on input to second mlp : i_dropout cg = apply_dropout(cg, [inter], i_dropout) if a_dropout != 0: # - dropout on hidden layers of second mlp : a_dropout a_dropout_vars = list( set( VariableFilter(bricks=[Tanh], name='output') (ComputationGraph([final]))) - set([inter_weights]) - set(s_dropout_vars)) cg = apply_dropout(cg, a_dropout_vars, a_dropout) if w_noise_std != 0: # - apply noise on weight variables weight_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, weight_vars, w_noise_std) [cost_reg, error_rate_reg] = cg.outputs # add reconstruction penalty for AE part penalty_val = tensor.sqrt(((r - rprime)**2).sum(axis=1)).mean() cost_reg = cost_reg + reconstruction_penalty * penalty_val self.cost = cost self.cost_reg = cost_reg self.error_rate = error_rate self.error_rate_reg = error_rate_reg self.pred = pred self.confidence = confidence
train_stream = get_stream(hdf5_file, 'train', batch_size)
dev_stream = get_stream(hdf5_file, 'dev', batch_size)

# MODEL
x = tensor.matrix('features', dtype='uint8')
y = tensor.matrix('targets', dtype='uint8')
y_hat, cost, cells = nn_fprop(x, y, vocab_size, hidden_size, num_layers, model)

# COST
cg = ComputationGraph(cost)
if dropout > 0:
    # Apply dropout only to the non-recurrent inputs (Zaremba et al. 2015)
    inputs = VariableFilter(theano_name_regex=r'.*apply_input.*')(cg.variables)
    cg = apply_dropout(cg, inputs, dropout)
cost = cg.outputs[0]

# Learning algorithm
step_rules = [RMSProp(learning_rate=learning_rate, decay_rate=decay_rate),
              StepClipping(step_clipping)]
algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                            step_rule=CompositeRule(step_rules))

# Extensions
gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
step_norm = aggregation.mean(algorithm.total_step_norm)
monitored_vars = [cost, gradient_norm, step_norm]

dev_monitor = DataStreamMonitoring(variables=[cost], after_epoch=True,
                                   before_first_epoch=True,
                                   data_stream=dev_stream, prefix="dev")
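# A minimal sketch (hypothetical Linear + LSTM stack, hypothetical sizes) of
# why the regex above only touches non-recurrent connections: the variables
# feeding each brick's apply are annotated with names like
# 'to_lstm_apply_input_' and 'lstm_apply_inputs', while the recurrent states
# are named 'states'/'cells' and therefore never match '.*apply_input.*'.
from theano import tensor
from blocks.bricks import Linear, Tanh
from blocks.bricks.recurrent import LSTM
from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph, apply_dropout
from blocks.initialization import IsotropicGaussian, Constant

x = tensor.tensor3('features')  # (time, batch, features)
to_lstm = Linear(input_dim=50, output_dim=4 * 100, name='to_lstm',
                 weights_init=IsotropicGaussian(0.01),
                 biases_init=Constant(0))
lstm = LSTM(dim=100, activation=Tanh(), name='lstm',
            weights_init=IsotropicGaussian(0.01))
to_lstm.initialize()
lstm.initialize()

hidden, cells = lstm.apply(to_lstm.apply(x))
cg = ComputationGraph(hidden[-1].sum())

inputs = VariableFilter(theano_name_regex=r'.*apply_input.*')(cg.variables)
cg = apply_dropout(cg, inputs, 0.5)  # hidden-to-hidden paths stay intact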
def set_up(self, config=None, make_prunable=False): """Loads and initializes all the theano variables for the training model and the decoding model. Args: config (dict): NMT configuration """ if config: self.config = config else: config = self.config # Create Theano variables logging.debug('Creating theano variables') source_sentence_mask = tensor.matrix('source_mask') target_sentence_mask = tensor.matrix('target_mask') # Construct model (fs439: Add NoLookup options) if config['dec_layers'] != 1: logging.fatal("Only dec_layers=1 supported.") logging.debug('Building RNN encoder-decoder') if config['src_sparse_feat_map']: if config['enc_layers'] != 1: logging.fatal("Only enc_layers=1 supported for sparse " "source features.") source_sentence = tensor.tensor3('source') self.sampling_input = tensor.tensor3('input') encoder = NoLookupEncoder(config['enc_embed'], config['enc_nhids']) else: source_sentence = tensor.lmatrix('source') self.sampling_input = tensor.lmatrix('input') if config['enc_layers'] > 1 and not config['enc_share_weights']: encoder = DeepBidirectionalEncoder( config['src_vocab_size'], config['enc_embed'], config['enc_layers'], config['enc_skip_connections'], config['enc_nhids']) else: encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['enc_layers'], config['enc_skip_connections'], config['enc_nhids']) if config['trg_sparse_feat_map']: target_sentence = tensor.tensor3('target') decoder = NoLookupDecoder( config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['att_nhids'], config['maxout_nhids'], config['enc_nhids'] * 2, config['attention'], config['dec_attention_sources'], config['dec_readout_sources'], config['memory'], config['memory_size'], config['seq_len'], config['dec_init']) else: target_sentence = tensor.lmatrix('target') decoder = Decoder(config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['att_nhids'], config['maxout_nhids'], config['enc_nhids'] * 2, config['attention'], config['dec_attention_sources'], config['dec_readout_sources'], config['memory'], config['memory_size'], config['seq_len'], config['dec_init'], make_prunable=make_prunable) if config['annotations'] != 'direct': annotators = [] add_direct = False for name in config['annotations'].split(','): if name == 'direct': add_direct = True elif name == 'hierarchical': annotators.append(HierarchicalAnnotator(encoder)) else: logging.fatal("Annotation strategy %s unknown" % name) encoder = EncoderWithAnnotators(encoder, annotators, add_direct) annotations, annotations_mask = encoder.apply(source_sentence, source_sentence_mask) self.cost = decoder.cost(annotations, annotations_mask, target_sentence, target_sentence_mask) logging.info('Creating computational graph') self.cg = ComputationGraph(self.cost) # Initialize model logging.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() try: encoder.bidir.prototype.weights_init = Orthogonal() except AttributeError: pass # Its fine, no bidirectional encoder decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog logging.info('Applying dropout') dropout_inputs = [ x for x in self.cg.intermediary_variables if x.name == 'maxout_apply_output' ] self.cg = 
apply_dropout(self.cg, dropout_inputs, config['dropout']) # Apply weight noise for regularization if config['weight_noise_ff'] > 0.0: logging.info('Applying weight noise to ff layers') if encoder.lookup: enc_params = Selector(encoder.lookup).get_parameters().values() enc_params += Selector(encoder.fwd_fork).get_parameters().values() enc_params += Selector(encoder.back_fork).get_parameters().values() dec_params = Selector( decoder.sequence_generator.readout).get_parameters().values() dec_params += Selector( decoder.sequence_generator.fork).get_parameters().values() self.cg = apply_noise(self.cg, enc_params + dec_params, config['weight_noise_ff']) # Print shapes shapes = [param.get_value().shape for param in self.cg.parameters] logging.debug("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logging.debug(' {:15}: {}'.format(shape, count)) logging.debug("Total number of CG parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge( Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logging.debug("Parameter names: ") for name, value in enc_dec_param_dict.items(): logging.debug(' {:15}: {}'.format(value.get_value().shape, name)) logging.info("Total number of parameters: {}".format( len(enc_dec_param_dict))) # Set up training model logging.info("Building model") self.training_model = Model(self.cost) logging.info("Building sampling model") src_shape = (self.sampling_input.shape[-2], self.sampling_input.shape[-1]) # batch_size x sen_length sampling_representation, _ = encoder.apply(self.sampling_input, tensor.ones(src_shape)) generated = decoder.generate(src_shape, sampling_representation) self.search_model = Model(generated) generated_outputs = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs self.samples = generated_outputs[1] self.encoder = encoder self.decoder = decoder
def __init__(self, config, vocab_size): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='question_embed') embed.weights_init = IsotropicGaussian(0.01) # Calculate question encoding (concatenate layer1) qembed = embed.apply(question) qlstms, qhidden_list = make_bidir_lstm_stack( qembed, config.embed_size, question_mask.astype(theano.config.floatX), config.question_lstm_size, config.question_skip_connections, 'q') bricks = bricks + qlstms if config.question_skip_connections: qenc_dim = 2 * sum(config.question_lstm_size) qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list], axis=1) else: qenc_dim = 2 * config.question_lstm_size[-1] qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]], axis=1) qenc.name = 'qenc' # Calculate context encoding (concatenate layer1) cembed = embed.apply(context) cqembed = tensor.concatenate([ cembed, tensor.extra_ops.repeat(qenc[None, :, :], cembed.shape[0], axis=0) ], axis=2) clstms, chidden_list = make_bidir_lstm_stack( cqembed, config.embed_size + qenc_dim, context_mask.astype(theano.config.floatX), config.ctx_lstm_size, config.ctx_skip_connections, 'ctx') bricks = bricks + clstms if config.ctx_skip_connections: cenc_dim = 2 * sum(config.ctx_lstm_size) #2 : fw & bw cenc = tensor.concatenate(chidden_list, axis=2) else: cenc_dim = 2 * config.question_lstm_size[-1] cenc = tensor.concatenate(chidden_list[-2:], axis=2) cenc.name = 'cenc' # Attention mechanism Bilinear attention_clinear_1 = Linear(input_dim=cenc_dim, output_dim=qenc_dim, name='attc_1') bricks += [attention_clinear_1] att_start = qenc[None, :, :] * attention_clinear_1.apply( cenc.reshape( (cenc.shape[0] * cenc.shape[1], cenc.shape[2]))).reshape( (cenc.shape[0], cenc.shape[1], cenc.shape[2])) att_start = att_start.sum(axis=2) att_start = tensor.nnet.softmax(att_start.T).T attention_clinear_2 = Linear(input_dim=cenc_dim, output_dim=qenc_dim, name='attc_2') bricks += [attention_clinear_2] att_end = qenc[None, :, :] * attention_clinear_2.apply( cenc.reshape( (cenc.shape[0] * cenc.shape[1], cenc.shape[2]))).reshape( (cenc.shape[0], cenc.shape[1], cenc.shape[2])) att_end = att_end.sum(axis=2) att_end = tensor.nnet.softmax(att_end.T).T att_start = tensor.dot( tensor.le( tensor.tile( theano.tensor.arange(context.shape[0])[None, :], (context.shape[0], 1)), tensor.tile( theano.tensor.arange(context.shape[0])[:, None], (1, context.shape[0]))), att_start) att_end = tensor.dot( tensor.ge( tensor.tile( theano.tensor.arange(context.shape[0])[None, :], (context.shape[0], 1)), tensor.tile( theano.tensor.arange(context.shape[0])[:, None], (1, context.shape[0]))), att_end) # add attention from left and right att_weights = att_start * att_end att_target = tensor.eq( tensor.tile(answer[None, :, :], (context.shape[0], 1, 1)), tensor.tile(context[:, None, :], (1, answer.shape[0], 1))).sum(axis=1).clip(0, 1) self.predictions = tensor.gt(att_weights, 0.25) * context att_target = att_target / (att_target.sum(axis=0) + 0.00001) att_weights = att_weights / (att_weights.sum(axis=0) 
    + 0.00001)
#cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) * context_mask).sum() / context_mask.sum()
cost = (((att_weights - att_target)**2) * context_mask).sum() / context_mask.sum()

# Apply dropout
cg = ComputationGraph([cost])
if config.w_noise > 0:
    noise_vars = VariableFilter(roles=[WEIGHT])(cg)
    cg = apply_noise(cg, noise_vars, config.w_noise)
if config.dropout > 0:
    cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
[cost_reg] = cg.outputs

# Other stuff
cost.name = 'cost'
cost_reg.name = 'cost_reg'
att_start.name = 'att_start'
att_end.name = 'att_end'
att_weights.name = 'att_weights'
att_target.name = 'att_target'
self.predictions.name = 'pred'

self.sgd_cost = cost_reg
self.monitor_vars = [[cost_reg]]
self.monitor_vars_valid = [[cost_reg]]
self.analyse_vars = [cost, self.predictions, att_start, att_end,
                     att_weights, att_target]

# Initialize bricks
embed.initialize()
for brick in bricks:
    brick.weights_init = config.weights_init
    brick.biases_init = config.biases_init
    brick.initialize()
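# A minimal sketch (tiny stand-in MLP, hypothetical sizes) of the bookkeeping
# pattern used in these reader models: gradients follow the regularized cost
# taken from the transformed graph, while the original cost variable stays
# untouched, so it can still be monitored without dropout or noise effects.
from theano import tensor
from blocks.bricks import MLP, Tanh, Softmax
from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph, apply_dropout, apply_noise
from blocks.initialization import IsotropicGaussian, Constant
from blocks.roles import WEIGHT

x = tensor.matrix('features')
y = tensor.ivector('targets')
mlp = MLP([Tanh(), None], [50, 100, 7],
          weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
mlp.initialize()
out = mlp.apply(x)
cost = Softmax().categorical_cross_entropy(y, out)
error_rate = tensor.neq(y, out.argmax(axis=1)).mean()
cost.name, error_rate.name = 'cost', 'error_rate'

cg = ComputationGraph([cost, error_rate])
cg_reg = apply_noise(cg, VariableFilter(roles=[WEIGHT])(cg.variables), 0.01)
cg_reg = apply_dropout(
    cg_reg, VariableFilter(bricks=[Tanh], name='output')(cg_reg.variables), 0.2)
[cost_reg, error_rate_reg] = cg_reg.outputs
cost_reg.name, error_rate_reg.name = 'cost_reg', 'error_rate_reg'

sgd_cost = cost_reg                           # what GradientDescent optimizes
monitor_vars_train = [[cost_reg], [error_rate_reg]]
monitor_vars_valid = [[cost], [error_rate]]   # clean graph for the dev stream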
def main(feature_maps=None, mlp_hiddens=None, conv_sizes=None, pool_sizes=None, batch_size=None, num_batches=None): if feature_maps is None: feature_maps = [32, 48, 64, 80, 128, 128] if mlp_hiddens is None: mlp_hiddens = [1000] if conv_sizes is None: conv_sizes = [7, 5, 5, 5, 5, 4] if pool_sizes is None: pool_sizes = [3, 2, 2, 2, 2, 1] if batch_size is None: batch_size = 64 conv_steps = [2, 1, 1, 1, 1, 1] # same as stride image_size = (256, 256) output_size = 2 learningRate = 0.001 drop_prob = 0.5 weight_noise = 0.75 num_epochs = 250 num_batches = None host_plot = "http://*****:*****@ %s" % (graph_name, datetime.datetime.now(), socket.gethostname()), channels=[["train_error_rate", "valid_error_rate"], ["train_total_gradient_norm"]], after_epoch=True, server_url=host_plot, ) ) PLOT_AVAILABLE = True except ImportError: PLOT_AVAILABLE = False extensions.append(Checkpoint(save_to, after_epoch=True, after_training=True, save_separately=["log"])) logger.info("Building the model") model = Model(cost) ########### Loading images ##################### main_loop = MainLoop(algorithm, stream_data_train, model=model, extensions=extensions) main_loop.run()
def main(config, tr_stream, dev_stream, source_vocab, target_vocab, use_bokeh=False): # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') initial_context = tensor.matrix('initial_context') # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) # let user specify the target transition class name in config, # eval it and pass to decoder target_transition_name = config.get( 'target_transition', 'GRUInitialStateWithInitialStateSumContext') target_transition = eval(target_transition_name) logger.info('Using target transition: {}'.format(target_transition_name)) decoder = InitialContextDecoder(config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2, config['context_dim'], target_transition) cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask, initial_context) cost.name = 'decoder_cost' # Initialize model logger.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() logger.info('Creating computational graph') cg = ComputationGraph(cost) # GRAPH TRANSFORMATIONS FOR BETTER TRAINING # TODO: validate performance with/without regularization if config.get('l2_regularization', False) is True: l2_reg_alpha = config['l2_regularization_alpha'] logger.info( 'Applying l2 regularization with alpha={}'.format(l2_reg_alpha)) model_weights = VariableFilter(roles=[WEIGHT])(cg.variables) for W in model_weights: cost = cost + (l2_reg_alpha * (W**2).sum()) # why do we need to name the cost variable? Where did the original name come from? 
cost.name = 'decoder_cost_cost' cg = ComputationGraph(cost) # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog # this is the probability of dropping out, so you probably want to make it <=0.5 logger.info('Applying dropout') dropout_inputs = [ x for x in cg.intermediary_variables if x.name == 'maxout_apply_output' ] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge( Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.items(): logger.info(' {:15}: {}'.format(value.get_value().shape, name)) logger.info("Total number of parameters: {}".format( len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # create the training directory, and copy this config there if directory doesn't exist if not os.path.isdir(config['saveto']): os.makedirs(config['saveto']) shutil.copy(config['config_file'], config['saveto']) # Set extensions # TODO: add checking for existing model and loading logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] # Create the theano variables that we need for the sampling graph sampling_input = tensor.lmatrix('input') sampling_context = tensor.matrix('context_input') # Set up beam search and sampling computation graphs if necessary if config['hook_samples'] >= 1 or config.get('bleu_script', None) is not None: logger.info("Building sampling model") sampling_representation = encoder.apply( sampling_input, tensor.ones(sampling_input.shape)) generated = decoder.generate(sampling_input, sampling_representation, sampling_context) search_model = Model(generated) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs # Add sampling if config['hook_samples'] >= 1: logger.info("Building sampler") extensions.append( Sampler( model=search_model, data_stream=tr_stream, hook_samples=config['hook_samples'], every_n_batches=config['sampling_freq'], src_vocab=source_vocab, trg_vocab=target_vocab, src_vocab_size=config['src_vocab_size'], )) # Add early stopping based on bleu if config.get('bleu_script', None) is not None: logger.info("Building bleu validator") extensions.append( BleuValidator(sampling_input, sampling_context, samples=samples, config=config, model=search_model, data_stream=dev_stream, src_vocab=source_vocab, trg_vocab=target_vocab, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) # Add early stopping based on Meteor if config.get('meteor_directory', None) is not None: logger.info("Building meteor validator") extensions.append( MeteorValidator(sampling_input, sampling_context, samples=samples, config=config, model=search_model, data_stream=dev_stream, src_vocab=source_vocab, trg_vocab=target_vocab, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if config['reload']: 
extensions.append(LoadNMT(config['saveto'])) # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( Plot(config['model_save_directory'], channels=[[ 'decoder_cost', 'validation_set_bleu_score', 'validation_set_meteor_score' ]], every_n_batches=10)) # Set up training algorithm logger.info("Initializing training algorithm") # if there is dropout or random noise, we need to use the output of the modified graph if config['dropout'] < 1.0 or config['weight_noise_ff'] > 0.0: algorithm = GradientDescent(cost=cg.outputs[0], parameters=cg.parameters, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ])) else: algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ])) # enrich the logged information extensions.append(Timing(every_n_batches=100)) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions) # Train! main_loop.run()
def main(config, tr_stream, dev_stream): # Create Theano variables source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') sampling_input = tensor.lmatrix('input') # Construct model encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) decoder = Decoder(config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2) cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask) # Initialize model encoder.weights_init = decoder.weights_init = IsotropicGaussian(config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() cg = ComputationGraph(cost) # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog dropout_inputs = [x for x in cg.intermediary_variables if x.name == 'maxout_apply_output'] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # Apply weight noise for regularization if config['weight_noise_ff'] > 0.0: enc_params = Selector(encoder.lookup).get_params().values() enc_params += Selector(encoder.fwd_fork).get_params().values() enc_params += Selector(encoder.back_fork).get_params().values() dec_params = Selector(decoder.sequence_generator.readout).get_params().values() dec_params += Selector(decoder.sequence_generator.fork).get_params().values() dec_params += Selector(decoder.transition.initial_transformer).get_params().values() cg = apply_noise(cg, enc_params+dec_params, config['weight_noise_ff']) cost = cg.outputs[0] # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge(Selector(encoder).get_params(), Selector(decoder).get_params()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.iteritems(): logger.info(' {:15}: {}'.format(value.get_value().shape, name)) logger.info("Total number of parameters: {}".format(len(enc_dec_param_dict))) # Set up training algorithm if args.subtensor_fix: assert config['step_rule'] == 'AdaDelta' from subtensor_gradient import GradientDescent_SubtensorFix, AdaDelta_SubtensorFix, subtensor_params lookups = subtensor_params(cg, [encoder.lookup, decoder.sequence_generator.readout.feedback_brick.lookup]) algorithm = GradientDescent_SubtensorFix( subtensor_params=lookups, cost=cost, params=cg.parameters, step_rule=CompositeRule([StepClipping(config['step_clipping']), RemoveNotFinite(0.9), AdaDelta_SubtensorFix(subtensor_params=lookups)]) ) else: algorithm = GradientDescent( cost=cost, params=cg.parameters, step_rule=CompositeRule([StepClipping(config['step_clipping']), RemoveNotFinite(0.9), eval(config['step_rule'])()]) ) # Set up beam search and sampling computation graphs sampling_representation = encoder.apply( sampling_input, tensor.ones(sampling_input.shape)) generated = decoder.generate(sampling_input, sampling_representation) search_model = Model(generated) samples, = VariableFilter( 
bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is the next_outputs # Set up training model training_model = Model(cost) # Set extensions extensions = [ Sampler( model=search_model, config=config, data_stream=tr_stream, src_eos_idx=config['src_eos_idx'], trg_eos_idx=config['trg_eos_idx'], every_n_batches=config['sampling_freq']), BleuValidator( sampling_input, samples=samples, config=config, model=search_model, data_stream=dev_stream, src_eos_idx=config['src_eos_idx'], trg_eos_idx=config['trg_eos_idx'], every_n_batches=config['bleu_val_freq']), TrainingDataMonitoring([cost], after_batch=True), #Plot('En-Fr', channels=[['decoder_cost_cost']], # after_batch=True), Printing(after_batch=True), Dump(config['saveto'], every_n_batches=config['save_freq']) ] # Reload model if necessary if config['reload']: extensions += [LoadFromDumpWMT15(config['saveto'])] # Initialize main loop main_loop = MainLoop( model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions ) # Train! main_loop.run()
def __init__(self, config, vocab_size, id_to_vocab, logger): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.ivector('answer') candidates = tensor.imatrix('candidates') candidates_mask = tensor.imatrix('candidates_mask') # question_actual = tensor.imatrix('question_actual') # context_actual = tensor.imatrix('context_actual') # answer_actual = tensor.imatrix('answer_actual') bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) # Embed questions and cntext embed = LookupTable(vocab_size, config.embed_size, name='question_embed') bricks.append(embed) qembed = embed.apply(question) cembed = embed.apply(context) qlstms, qhidden_list = make_bidir_lstm_stack(qembed, config.embed_size, question_mask.astype(theano.config.floatX), config.question_lstm_size, config.question_skip_connections, 'q') clstms, chidden_list = make_bidir_lstm_stack(cembed, config.embed_size, context_mask.astype(theano.config.floatX), config.ctx_lstm_size, config.ctx_skip_connections, 'ctx') bricks = bricks + qlstms + clstms # Calculate question encoding (concatenate layer1) if config.question_skip_connections: qenc_dim = 2*sum(config.question_lstm_size) qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list], axis=1) else: qenc_dim = 2*config.question_lstm_size[-1] #u qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list[-2:]], axis=1) qenc.name = 'qenc' # Calculate context encoding (concatenate layer1) if config.ctx_skip_connections: cenc_dim = 2*sum(config.ctx_lstm_size) cenc = tensor.concatenate(chidden_list, axis=2) else: cenc_dim = 2*config.ctx_lstm_size[-1] cenc = tensor.concatenate(chidden_list[-2:], axis=2) cenc.name = 'cenc' # Attention mechanism MLP attention_mlp = MLP(dims=config.attention_mlp_hidden + [1], activations=config.attention_mlp_activations[1:] + [Identity()], name='attention_mlp') attention_qlinear = Linear(input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0], name='attq') attention_clinear = Linear(input_dim=cenc_dim, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attc') bricks += [attention_mlp, attention_qlinear, attention_clinear] layer1 = Tanh().apply(attention_clinear.apply(cenc.reshape((cenc.shape[0]*cenc.shape[1], cenc.shape[2]))) .reshape((cenc.shape[0],cenc.shape[1],config.attention_mlp_hidden[0])) + attention_qlinear.apply(qenc)[None, :, :]) layer1.name = 'layer1' att_weights = attention_mlp.apply(layer1.reshape((layer1.shape[0]*layer1.shape[1], layer1.shape[2]))) att_weights.name = 'att_weights_0' att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1])) att_weights.name = 'att_weights' #r attended = tensor.sum(cenc * tensor.nnet.softmax(att_weights.T).T[:, :, None], axis=0) attended.name = 'attended' # Now we can calculate our output out_mlp = MLP(dims=[cenc_dim + qenc_dim] + config.out_mlp_hidden + [config.n_entities], activations=config.out_mlp_activations + [Identity()], name='out_mlp') bricks += [out_mlp] # g^AR probs = out_mlp.apply(tensor.concatenate([attended, qenc], axis=1)) probs.name = 'probs' is_candidate = tensor.eq(tensor.arange(config.n_entities, dtype='int32')[None, None, :], tensor.switch(candidates_mask, candidates, -tensor.ones_like(candidates))[:, :, None]).sum(axis=1) probs = tensor.switch(is_candidate, probs, -1000 * tensor.ones_like(probs)) # Calculate prediction, 
# ... cost and error rate
pred = probs.argmax(axis=1)
cost = Softmax().categorical_cross_entropy(answer, probs).mean()
error_rate = tensor.neq(answer, pred).mean()

# Apply dropout
cg = ComputationGraph([cost, error_rate])
if config.w_noise > 0:
    noise_vars = VariableFilter(roles=[WEIGHT])(cg)
    cg = apply_noise(cg, noise_vars, config.w_noise)
if config.dropout > 0:
    cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
[cost_reg, error_rate_reg] = cg.outputs

# Other stuff
cost_reg.name = cost.name = 'cost'
error_rate_reg.name = error_rate.name = 'error_rate'
self.sgd_cost = cost_reg
self.monitor_vars = [[cost_reg], [error_rate_reg]]
self.monitor_vars_valid = [[cost], [error_rate]]

# Initialize bricks
for brick in bricks:
    brick.weights_init = config.weights_init
    brick.biases_init = config.biases_init
    brick.initialize()
def train(save_to, num_epochs, feature_maps=None, mlp_hiddens=None, conv_sizes=None, pool_sizes=None, batch_size=500): # Initialize the training set train = CIFAR10(("train",)) train_stream = DataStream.default_stream( train, iteration_scheme=ShuffledScheme( train.num_examples, batch_size)) test = CIFAR10(("test",)) test_stream = DataStream.default_stream( test, iteration_scheme=ShuffledScheme( test.num_examples, batch_size)) # ConvMLP Parameters image_size = (32, 32) num_channels = 3 num_conv = 3 # Number of Convolutional Layers if feature_maps is None: feature_maps = [20, 30, 30] if not len(feature_maps) == num_conv: raise ValueError('Must specify more feature maps') if conv_sizes is None: conv_sizes = [5] * num_conv if pool_sizes is None: pool_sizes = [2] * num_conv if mlp_hiddens is None: mlp_hiddens = [500] output_size = 10 # Use ReLUs everywhere and softmax for the final prediction conv_activations = [Rectifier() for _ in feature_maps] mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()] convnet = ConvMLP(conv_activations, num_channels, image_size, filter_sizes=zip(conv_sizes, conv_sizes), feature_maps=feature_maps, pooling_sizes=zip(pool_sizes, pool_sizes), top_mlp_activations=mlp_activations, top_mlp_dims=mlp_hiddens + [output_size], border_mode='full', weights_init=Uniform(width=.2), biases_init=Constant(0)) # We push initialization config to set different initialization schemes # for convolutional layers. convnet.push_initialization_config() for i in range(num_conv): convnet.layers[i].weights_init = Uniform(width=.2) convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08) convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11) convnet.initialize() logging.info("Input dim: {} {} {}".format( *convnet.children[0].get_dim('input_'))) for i, layer in enumerate(convnet.layers): logging.info("Layer {} dim: {} {} {}".format( i, *layer.get_dim('output'))) x = tensor.tensor4('features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet probs = convnet.apply(x) cost = named_copy(CategoricalCrossEntropy().apply(y.flatten(), probs), 'cost') error_rate = named_copy(MisclassificationRate().apply(y.flatten(), probs), 'error_rate') cg = ComputationGraph([cost, error_rate]) # Apply Dropout to outputs of rectifiers from blocks.roles import OUTPUT vs = VariableFilter(roles=[OUTPUT])(cg.variables) vs1 = [v for v in vs if v.name.startswith('rectifier')] vs1 = vs1[0: -2] # Only first two layers cg = apply_dropout(cg, vs1, 0.5) # Train with simple SGD algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=AdaDelta()) # `Timing` extension reports time for reading data, aggregating a batch # and monitoring; # `ProgressBar` displays a nice progress bar during training. extensions = [Timing(), FinishAfter(after_n_epochs=num_epochs), DataStreamMonitoring( [cost, error_rate], test_stream, prefix="test"), TrainingDataMonitoring( [cost, error_rate, aggregation.mean(algorithm.total_gradient_norm)], prefix="train", after_epoch=True), Checkpoint(save_to), ProgressBar(), Printing()] model = Model(cost) main_loop = MainLoop( algorithm, train_stream, model=model, extensions=extensions) main_loop.run() classifier_fn = 'convmlp_cifar10.zip' with open(classifier_fn, 'w') as f: dump(convnet, f)
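# A minimal sketch of the same selection done by brick class and role instead
# of the 'rectifier' name prefix used above: filter every Rectifier OUTPUT
# variable and drop those out (inspect v.name to confirm which layer each
# variable belongs to before slicing the list down to a subset of layers).
from theano import tensor
from blocks.bricks import MLP, Rectifier, Softmax
from blocks.bricks.cost import CategoricalCrossEntropy
from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph, apply_dropout
from blocks.initialization import Uniform, Constant
from blocks.roles import OUTPUT

x = tensor.matrix('features')
y = tensor.lmatrix('targets')
mlp = MLP([Rectifier(), Rectifier(), Softmax()], [784, 500, 500, 10],
          weights_init=Uniform(width=.2), biases_init=Constant(0))
mlp.initialize()
probs = mlp.apply(x)
cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
cg = ComputationGraph([cost])

relu_outputs = VariableFilter(bricks=[Rectifier], roles=[OUTPUT])(cg.variables)
cg = apply_dropout(cg, relu_outputs, 0.5)
cost = cg.outputs[0]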