def test_adagrad():
    """Check AdaGrad's steps over three successive gradient evaluations.

    The accumulated state lives in the compiled function's updates, so the
    three evaluations must happen in this exact order.
    """
    param = shared_floatx([3, 4])
    loss = (param ** 2).sum()
    gradients = OrderedDict([(param, tensor.grad(loss, param))])
    steps, updates = AdaGrad().compute_steps(gradients)
    step_fn = theano.function([], [steps[param]], updates=updates)
    tolerance = 1e-4

    def check(expected):
        # Each call advances the AdaGrad accumulator through `updates`.
        assert_allclose(step_fn()[0], expected, rtol=tolerance)

    # First evaluation uses the initial parameter value.
    check([0.002, 0.002])

    # Later evaluations change the parameter before stepping again.
    followups = (
        ([2, 3], [0.0011094, 0.0012]),
        ([1, 1.5], [0.00053452, 0.0005747]),
    )
    for new_value, expected in followups:
        param.set_value(new_value)
        check(expected)
def setup_algorithms(cost, cg, method, type="ff"):
    """Set up the training algorithm.

    Parameters
    ----------
    cost : expression
        Cost expression to minimise.
    cg : ComputationGraph
        Computation graph; its ``parameters`` are the ones trained.
    method : string
        Training method, one of "sgd", "momentum", "adagrad", "rmsprop".
    type : string
        Network type. For "RNN" a ``StepClipping(1.0)`` rule is placed in
        front of the chosen step rule.

    Returns
    -------
    algorithm : GradientDescent
        Gradient descent algorithm using the selected step rule.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method == "sgd":
        step_rule = Scale(learning_rate=0.01)
    elif method == "momentum":
        step_rule = Momentum(learning_rate=0.01, momentum=0.95)
    elif method == "adagrad":
        step_rule = AdaGrad()
    elif method == "rmsprop":
        step_rule = RMSProp()
    else:
        # Fail here with a clear message instead of hitting an unbound
        # ``step_rule`` further down.
        raise ValueError(
            "Unknown training method %r; expected one of "
            "'sgd', 'momentum', 'adagrad', 'rmsprop'" % (method,))

    if type == "RNN":
        step_rule = CompositeRule([StepClipping(1.0), step_rule])

    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=step_rule)
    return algorithm
def test_adagrad_broadcastable():
    """Run the shared broadcastable-handling check against AdaGrad."""
    rule = AdaGrad()
    verify_broadcastable_handling(rule)
extra_updates = []

# Learning optimizer: the adaptive rule comes first, step clipping second.
if training_optimizer == 'Adam':
    step_rules = [
        Adam(learning_rate=learning_rate),
        StepClipping(step_clipping)
    ]  # , VariableClipping(threshold=max_norm_threshold)
elif training_optimizer == 'RMSProp':
    step_rules = [
        RMSProp(learning_rate=learning_rate, decay_rate=decay_rate),
        StepClipping(step_clipping)
    ]
elif training_optimizer == 'Adagrad':
    step_rules = [
        AdaGrad(learning_rate=learning_rate),
        StepClipping(step_clipping)
    ]
elif training_optimizer == 'Adadelta':
    step_rules = [AdaDelta(decay_rate=decay_rate),
                  StepClipping(step_clipping)]
else:
    # Without this branch an unrecognised name left ``step_rules`` unbound
    # and surfaced as a NameError a few lines below.
    raise ValueError(
        "Unknown training_optimizer %r; expected 'Adam', 'RMSProp', "
        "'Adagrad' or 'Adadelta'" % (training_optimizer,))

parameters_to_update = cg.parameters
algorithm = GradientDescent(cost=cg.outputs[0],
                            parameters=parameters_to_update,
                            step_rule=CompositeRule(step_rules))
algorithm.add_updates(extra_updates)

# Extensions
gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
step_norm = aggregation.mean(algorithm.total_step_norm)
monitored_vars = [cost, gradient_norm, step_norm]
# AdaDelta has no ``learning_rate`` attribute, so only monitor the learning
# rate for rules that expose one (kept in its original second position).
if hasattr(step_rules[0], 'learning_rate'):
    monitored_vars.insert(1, step_rules[0].learning_rate)
# Accumulate the sum of squared entries of every tensor in weightList into
# reg2 (reg2 itself is initialised outside this excerpt).
for p in weightList:
    reg2 += T.sum(p ** 2)
# Add the accumulated penalty to the cost with a fixed 1e-5 coefficient.
cost += 0.00001 * reg2

# Number of epochs: 15 unless the config provides "n_epochs".
n_epochs = 15
if "n_epochs" in config:
    n_epochs = int(config["n_epochs"])

params = cg.parameters
model = Model([cost])
print "model parameters:"
print model.get_parameter_dict()

# Pick the step rule from whichever key is present in config; the checks run
# in this order, and plain Scale (SGD) is the fallback.
if "adagrad" in config:
    print "using adagrad"
    thisRule=AdaGrad(learning_rate=learning_rate)
elif "adadelta" in config:
    print "using adadelta"
    thisRule=AdaDelta()
elif "momentum" in config:
    print "using momentum"
    # The value stored under "momentum" is the momentum coefficient.
    mWeight = float(config["momentum"])
    thisRule=Momentum(learning_rate=learning_rate, momentum=mWeight)
else:
    print "using traditional SGD"
    thisRule=Scale(learning_rate=learning_rate)

# Optional clipping: StepClipping is listed ahead of the chosen rule inside
# the CompositeRule.
if "gradientClipping" in config:
    threshold = float(config["gradientClipping"])
    print "using gradient clipping with threshold ", threshold
    thisRule=CompositeRule([StepClipping(threshold), thisRule])
def main(job_id, params): config = ConfigParser.ConfigParser() config.readfp(open('./params')) max_epoch = int(config.get('hyperparams', 'max_iter', 100)) base_lr = float(config.get('hyperparams', 'base_lr', 0.01)) train_batch = int(config.get('hyperparams', 'train_batch', 256)) valid_batch = int(config.get('hyperparams', 'valid_batch', 512)) test_batch = int(config.get('hyperparams', 'valid_batch', 512)) W_sd = float(config.get('hyperparams', 'W_sd', 0.01)) W_mu = float(config.get('hyperparams', 'W_mu', 0.0)) b_sd = float(config.get('hyperparams', 'b_sd', 0.01)) b_mu = float(config.get('hyperparams', 'b_mu', 0.0)) hidden_units = int(config.get('hyperparams', 'hidden_units', 32)) input_dropout_ratio = float( config.get('hyperparams', 'input_dropout_ratio', 0.2)) dropout_ratio = float(config.get('hyperparams', 'dropout_ratio', 0.2)) weight_decay = float(config.get('hyperparams', 'weight_decay', 0.001)) max_norm = float(config.get('hyperparams', 'max_norm', 100.0)) solver = config.get('hyperparams', 'solver_type', 'rmsprop') data_file = config.get('hyperparams', 'data_file') side = config.get('hyperparams', 'side', 'b') # Spearmint optimization parameters: if params: base_lr = float(params['base_lr'][0]) dropout_ratio = float(params['dropout_ratio'][0]) hidden_units = params['hidden_units'][0] weight_decay = params['weight_decay'][0] if 'adagrad' in solver: solver_type = CompositeRule([ AdaGrad(learning_rate=base_lr), VariableClipping(threshold=max_norm) ]) else: solver_type = CompositeRule([ RMSProp(learning_rate=base_lr), VariableClipping(threshold=max_norm) ]) input_dim = {'l': 11427, 'r': 10519, 'b': 10519 + 11427} data_file = config.get('hyperparams', 'data_file') if 'b' in side: train = H5PYDataset(data_file, which_set='train') valid = H5PYDataset(data_file, which_set='valid') test = H5PYDataset(data_file, which_set='test') x_l = tensor.matrix('l_features') x_r = tensor.matrix('r_features') x = tensor.concatenate([x_l, x_r], axis=1) else: train = 
H5PYDataset(data_file, which_set='train', sources=['{}_features'.format(side), 'targets']) valid = H5PYDataset(data_file, which_set='valid', sources=['{}_features'.format(side), 'targets']) test = H5PYDataset(data_file, which_set='test', sources=['{}_features'.format(side), 'targets']) x = tensor.matrix('{}_features'.format(side)) y = tensor.lmatrix('targets') # Define a feed-forward net with an input, two hidden layers, and a softmax output: model = MLP(activations=[ Rectifier(name='h1'), Rectifier(name='h2'), Softmax(name='output'), ], dims=[input_dim[side], hidden_units, hidden_units, 2], weights_init=IsotropicGaussian(std=W_sd, mean=W_mu), biases_init=IsotropicGaussian(b_sd, b_mu)) # Don't forget to initialize params: model.initialize() # y_hat is the output of the neural net with x as its inputs y_hat = model.apply(x) # Define a cost function to optimize, and a classification error rate. # Also apply the outputs from the net and corresponding targets: cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat) error = MisclassificationRate().apply(y.flatten(), y_hat) error.name = 'error' # This is the model: before applying dropout model = Model(cost) # Need to define the computation graph for the cost func: cost_graph = ComputationGraph([cost]) # This returns a list of weight vectors for each layer W = VariableFilter(roles=[WEIGHT])(cost_graph.variables) # Add some regularization to this model: cost += weight_decay * l2_norm(W) cost.name = 'entropy' # computational graph with l2 reg cost_graph = ComputationGraph([cost]) # Apply dropout to inputs: inputs = VariableFilter([INPUT])(cost_graph.variables) dropout_inputs = [ input for input in inputs if input.name.startswith('linear_') ] dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]], input_dropout_ratio) dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:], dropout_ratio) dropout_cost = dropout_graph.outputs[0] dropout_cost.name = 'dropout_entropy' # Learning Algorithm (notice: we use the 
dropout cost for learning): algo = GradientDescent(step_rule=solver_type, params=dropout_graph.parameters, cost=dropout_cost) # algo.step_rule.learning_rate.name = 'learning_rate' # Data stream used for training model: training_stream = Flatten( DataStream.default_stream(dataset=train, iteration_scheme=ShuffledScheme( train.num_examples, batch_size=train_batch))) training_monitor = TrainingDataMonitoring([ dropout_cost, aggregation.mean(error), aggregation.mean(algo.total_gradient_norm) ], after_batch=True) # Use the 'valid' set for validation during training: validation_stream = Flatten( DataStream.default_stream(dataset=valid, iteration_scheme=ShuffledScheme( valid.num_examples, batch_size=valid_batch))) validation_monitor = DataStreamMonitoring(variables=[cost, error], data_stream=validation_stream, prefix='validation', after_epoch=True) test_stream = Flatten( DataStream.default_stream( dataset=test, iteration_scheme=ShuffledScheme(test.num_examples, batch_size=test_batch))) test_monitor = DataStreamMonitoring(variables=[error], data_stream=test_stream, prefix='test', after_training=True) plotting = Plot('AdniNet_{}'.format(side), channels=[ ['dropout_entropy', 'validation_entropy'], ['error', 'validation_error'], ], after_batch=False) # Checkpoint class used to save model and log: stamp = datetime.datetime.fromtimestamp( time.time()).strftime('%Y-%m-%d-%H:%M') checkpoint = Checkpoint('./models/{}net/{}'.format(side, stamp), save_separately=['model', 'log'], every_n_epochs=1) # Home-brewed class for early stopping when we detect we have started to overfit early_stopper = FinishIfOverfitting(error_name='error', validation_name='validation_error', threshold=0.1, epochs=5, burn_in=100) # The main loop will train the network and output reports, etc main_loop = MainLoop(data_stream=training_stream, model=model, algorithm=algo, extensions=[ validation_monitor, training_monitor, plotting, FinishAfter(after_n_epochs=max_epoch), early_stopper, Printing(), ProgressBar(), 
checkpoint, test_monitor, ]) main_loop.run() ve = float(main_loop.log.last_epoch_row['validation_error']) te = float(main_loop.log.last_epoch_row['error']) spearmint_loss = ve + abs(te - ve) print 'Spearmint Loss: {}'.format(spearmint_loss) return spearmint_loss
max_iter = int(config.get('hyperparams', 'max_iter', 100)) base_lr = float(config.get('hyperparams', 'base_lr', 0.01)) train_batch = int(config.get('hyperparams', 'train_batch', 256)) valid_batch = int(config.get('hyperparams', 'valid_batch', 256)) test_batch = int(config.get('hyperparams', 'valid_batch', 256)) W_sd = float(config.get('hyperparams', 'W_sd', 0.01)) W_mu = float(config.get('hyperparams', 'W_mu', 0.0)) W_b = float(config.get('hyperparams', 'W_b', 0.01)) dropout_ratio = float(config.get('hyperparams', 'dropout_ratio', 0.2)) weight_decay = float(config.get('hyperparams', 'weight_decay', 0.001)) solver = config.get('hyperparams', 'solver_type', 'rmsprop') data_file = config.get('hyperparams', 'data_file') if 'adagrad' in solver: solver_type = AdaGrad() else: solver_type = RMSProp(learning_rate=base_lr) pre_trained_folder = '../models/' input_dim = {'l': 11427, 'r': 10519, 'b': 10519 + 11427} train = H5PYDataset(data_file, which_set='train', sources=['l_features', 'r_features']) valid = H5PYDataset(data_file, which_set='valid', sources=['l_features', 'r_features']) test = H5PYDataset(data_file, which_set='test', sources=['l_features', 'r_features']) x_l = tensor.matrix('l_features') x_r = tensor.matrix('r_features') x = tensor.concatenate([x_l, x_r], axis=1) # Define a feed-forward net with an input, two hidden layers, and a softmax output: autoencoder = MLP(activations=[
############ BLOCKS ######################## # wrap everything in Blocks objects and run! ######### training ################## n_epochs = 15 if "n_epochs" in config: n_epochs = int(config["n_epochs"]) print "number of training epochs: ", n_epochs model = Model([cost]) print "model parameters:" print model.get_parameter_dict() if "adagrad" in config: print "using adagrad" algorithm = GradientDescent(cost=cost, parameters=params, step_rule=AdaGrad(learning_rate=learning_rate), on_unused_sources='warn') elif "adadelta" in config: print "using adadelta" algorithm = GradientDescent(cost=cost, parameters=params, step_rule=AdaDelta(), on_unused_sources='warn') elif "momentum" in config: print "using momentum" mWeight = float(config["momentum"]) algorithm = GradientDescent(cost=cost, parameters=params, step_rule=Momentum(learning_rate=learning_rate, momentum=mWeight), on_unused_sources='warn')
def set_adagrad(self, learning_rate):
    """Append an AdaGrad rule with ``learning_rate`` to ``self.step_rules``."""
    rule = AdaGrad(learning_rate)
    self.step_rules.append(rule)
def main(name, epochs, batch_size, learning_rate, dim, mix_dim,
         old_model_name, max_length, bokeh, GRU, dropout, depth, max_grad,
         step_method, epsilon, sample):
    """Build, train (or sample from) a recurrent sequence generator.

    Parameters
    ----------
    name : str
        Data source name; also the directory and file stem of the HDF5 file
        under ``fuel.config.data_path``.
    epochs : int
        Training stops after this many epochs.
    batch_size : int
        Batch size of the train and test streams.
    learning_rate : float
        Learning rate for Adam, RMSProp and AdaGrad. The 'scale' rule uses
        a fixed 0.1 and AdaDelta takes none.
    dim, mix_dim : int
        Dimension of the recurrent transition and of the emitter mixture.
    old_model_name : str
        'continue' resumes the dump named after this job; any other
        non-empty value loads parameters from that dump; an empty value
        initialises the bricks from scratch.
    max_length : int
        Sequences are truncated to this many steps.
    bokeh : bool
        Add a ``Plot`` extension when true.
    GRU : bool
        Use a gated recurrent transition (only valid with ``depth == 1``).
    dropout : float
        Dropout applied to the transition's 'states' outputs when > 0.
    depth : int
        Number of stacked LSTM layers; values above 1 use ``LSTMstack``.
    max_grad : float
        Threshold passed to ``StepClipping``.
    step_method : str
        One of 'adam', 'rmsprop', 'adagrad', 'adadelta', 'scale'.
    epsilon : float
        Passed to ``SketchEmitter``.
    sample : bool
        When true, draw a sample from a previously saved model and exit.

    Raises
    ------
    ValueError
        If ``step_method`` is not recognised.
    """
    #----------------------------------------------------------------------
    datasource = name

    def shnum(x):
        """ Convert a positive float into a short tag-usable string
             E.g.: 0 -> 0, 0.005 -> 53, 100 -> 1-2
        """
        return '0' if x <= 0 else '%s%d' % (
            ("%e" % x)[0], -np.floor(np.log10(x)))

    jobname = "%s-%dX%dm%dd%dr%sb%de%s" % (
        datasource, depth, dim, mix_dim, int(
            dropout * 10), shnum(learning_rate), batch_size, shnum(epsilon))
    # Only non-default settings are appended to the job name.
    if max_length != 600:
        jobname += '-L%d' % max_length

    if GRU:
        jobname += 'g'
    if max_grad != 5.:
        jobname += 'G%g' % max_grad
    if step_method != 'adam':
        jobname += step_method

    if sample:
        print("Sampling")
    else:
        print("\nRunning experiment %s" % jobname)

    #----------------------------------------------------------------------
    if depth > 1:
        transition = LSTMstack(dim=dim,
                               depth=depth,
                               name="transition",
                               lstm_name="transition")
        assert not GRU
    elif GRU:
        transition = GatedRecurrent(dim=dim, name="transition")
    else:
        transition = LSTM(dim=dim, name="transition")

    emitter = SketchEmitter(mix_dim=mix_dim, epsilon=epsilon, name="emitter")
    readout = Readout(readout_dim=emitter.get_dim('inputs'),
                      source_names=['states'],
                      emitter=emitter,
                      name="readout")
    # A dedicated loop variable keeps the ``name`` argument from being
    # overwritten (list comprehensions leak their variable on Python 2).
    normal_inputs = [
        seq_name for seq_name in transition.apply.sequences
        if 'mask' not in seq_name
    ]
    fork = Fork(normal_inputs, prototype=Linear(use_bias=True))
    generator = SequenceGenerator(readout=readout,
                                  transition=transition,
                                  fork=fork)

    # Initialization settings
    generator.weights_init = OrthogonalGlorot()
    generator.biases_init = Constant(0)

    # Build the cost computation graph [steps,batch_size, 3]
    x = T.tensor3('features', dtype=floatX)[:max_length, :, :]
    x.tag.test_value = np.ones((max_length, batch_size, 3)).astype(np.float32)
    cost = generator.cost(x)
    cost.name = "sequence_log_likelihood"

    # Give an idea of what's going on
    model = Model(cost)
    params = model.get_params()
    logger.info("Parameters:\n" +
                pprint.pformat([(key, value.get_value().shape)
                                for key, value in params.items()],
                               width=120))
    # Count every element of every parameter; np.prod handles 0-d and
    # higher-rank shapes as well as vectors and matrices.
    model_size = 0
    for v in params.itervalues():
        model_size += int(np.prod(v.get_value().shape))
    logger.info("Total number of parameters %d" % model_size)

    #------------------------------------------------------------
    extensions = []
    if old_model_name == 'continue':
        extensions.append(LoadFromDump(jobname))
    elif old_model_name:
        # or you can just load the weights without state using:
        old_params = LoadFromDump(old_model_name).manager.load_parameters()
        model.set_param_values(old_params)
    else:
        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

    if sample:
        assert old_model_name and old_model_name != 'continue'
        Sample(generator, steps=max_length, path='.').do(None)
        exit(0)

    #------------------------------------------------------------
    # Define the training algorithm.
    cg = ComputationGraph(cost)
    if dropout > 0.:
        from blocks.roles import OUTPUT
        dropout_target = VariableFilter(roles=[OUTPUT],
                                        bricks=[transition],
                                        name_regex='states')(cg.variables)
        cg = apply_dropout(cg, dropout_target, dropout)
        cost = cg.outputs[0]

    if step_method == 'adam':
        step_rule = Adam(learning_rate)
    elif step_method == 'rmsprop':
        step_rule = RMSProp(learning_rate, decay_rate=0.95)
    elif step_method == 'adagrad':
        step_rule = AdaGrad(learning_rate)
    elif step_method == 'adadelta':
        step_rule = AdaDelta()
    elif step_method == 'scale':
        # NOTE(review): uses a fixed 0.1 rather than ``learning_rate`` -
        # confirm this is intended.
        step_rule = Scale(learning_rate=0.1)
    else:
        raise ValueError('Unknown step method %s' % step_method)

    step_rule = CompositeRule([StepClipping(max_grad), step_rule])
    algorithm = GradientDescent(cost=cost,
                                params=cg.parameters,
                                step_rule=step_rule)

    #------------------------------------------------------------
    observables = [cost]

    # Fetch variables useful for debugging
    (energies, ) = VariableFilter(applications=[generator.readout.readout],
                                  name_regex="output")(cg.variables)
    (activations, ) = VariableFilter(
        applications=[generator.transition.apply],
        name=generator.transition.apply.states[0])(cg.variables)
    min_energy = named_copy(energies.min(), "min_energy")
    max_energy = named_copy(energies.max(), "max_energy")
    mean_activation = named_copy(abs(activations).mean(), "mean_activation")
    observables += [min_energy, max_energy, mean_activation]

    observables += [algorithm.total_step_norm, algorithm.total_gradient_norm]
    # Monitor the L2 norm of every parameter and of its gradient.
    for param_name, param in params.items():
        observables.append(named_copy(param.norm(2), param_name + "_norm"))
        observables.append(
            named_copy(algorithm.gradients[param].norm(2),
                       param_name + "_grad_norm"))

    #------------------------------------------------------------
    datasource_fname = os.path.join(fuel.config.data_path, datasource,
                                    datasource + '.hdf5')

    train_ds = H5PYDataset(
        datasource_fname,  #max_length=max_length,
        which_set='train',
        sources=('features', ),
        load_in_memory=True)
    train_stream = DataStream(train_ds,
                              iteration_scheme=ShuffledScheme(
                                  train_ds.num_examples, batch_size))

    test_ds = H5PYDataset(
        datasource_fname,  #max_length=max_length,
        which_set='test',
        sources=('features', ),
        load_in_memory=True)
    test_stream = DataStream(test_ds,
                             iteration_scheme=SequentialScheme(
                                 test_ds.num_examples, batch_size))

    train_stream = Mapping(train_stream, _transpose)
    test_stream = Mapping(test_stream, _transpose)

    def stream_stats(ds, label):
        """Print the number of batches and examples in one epoch of ``ds``."""
        itr = ds.get_epoch_iterator(as_dict=True)
        batch_count = 0
        examples_count = 0
        for batch in itr:
            batch_count += 1
            # Axis 1 is the batch axis after the _transpose mapping.
            examples_count += batch['features'].shape[1]
        print('%s #batch %d #examples %d' %
              (label, batch_count, examples_count))

    stream_stats(train_stream, 'train')
    stream_stats(test_stream, 'test')

    extensions += [
        Timing(every_n_batches=10),
        TrainingDataMonitoring(observables, prefix="train",
                               every_n_batches=10),
        DataStreamMonitoring(
            [cost],
            test_stream,
            prefix="test",
            on_resumption=True,
            after_epoch=False,  # by default this is True
            every_n_batches=100),
        # all monitored data is ready so print it...
        # (next steps may take more time and we want to see the
        # results as soon as possible so print as soon as you can)
        Printing(every_n_batches=10),
        # perform multiple dumps at different intervals
        # so if one of them breaks (has nan) we can hopefully
        # find a model from few batches ago in the other
        Dump(jobname, every_n_batches=11),
        Dump(jobname + '.test', every_n_batches=100),
        Sample(generator,
               steps=max_length,
               path=jobname + '.test',
               every_n_batches=100),
        ProgressBar(),
        FinishAfter(after_n_epochs=epochs)
        # This shows a way to handle NaN emerging during
        # training: simply finish it.
        .add_condition("after_batch", _is_nan),
    ]

    if bokeh:
        extensions.append(Plot('sketch', channels=[
            ['cost'],
        ]))

    # Construct the main loop and start training!
    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)

    main_loop.run()
def training(self,
             fea2obj,
             batch_size,
             learning_rate=0.005,
             steprule='adagrad',
             wait_epochs=5,
             kl_weight_init=None,
             klw_ep=50,
             klw_inc_rate=0,
             num_epochs=None):
    """Build the model and run the training main loop.

    Parameters
    ----------
    fea2obj : dict
        Feature objects; ``fea2obj['targets'].t2idx`` gives the target
        index and the whole mapping is passed to the stream/model builders.
    batch_size : int
        Batch size of the training stream.
    learning_rate : float
        Learning rate for every step rule except AdaDelta.
    steprule : str
        Substring-matched against 'adagrad', 'adadelta', 'decay',
        'momentum' and 'adam', in that order.
    wait_epochs : int
        Epochs without improvement of ``dev_cost`` before stopping.
    kl_weight_init : float or None
        Initial KL weight. When ``None`` the config's ``kld_weight`` is
        used, or 1 if the config has none.
    klw_ep, klw_inc_rate
        Only referenced by the commented-out KL-annealing extensions.
    num_epochs : int or None
        Overrides the config's ``nepochs`` when truthy.

    Raises
    ------
    ValueError
        If ``steprule`` matches none of the supported step rules.
    """
    networkfile = self._config['net']
    n_epochs = num_epochs or int(self._config['nepochs'])
    reg_weight = float(self._config['loss_weight'])
    reg_type = self._config['loss_reg']
    numtrain = int(
        self._config['num_train']) if 'num_train' in self._config else None
    train_stream, num_samples_train = get_comb_stream(
        fea2obj, 'train', batch_size, shuffle=True, num_examples=numtrain)
    dev_stream, num_samples_dev = get_comb_stream(fea2obj,
                                                  'dev',
                                                  batch_size=None,
                                                  shuffle=False)
    logger.info('sources: %s -- number of train/dev samples: %d/%d',
                train_stream.sources, num_samples_train, num_samples_dev)

    t2idx = fea2obj['targets'].t2idx
    # Explicit argument first, then the config, then 1. The previous
    # one-line conditional bound as ``(a or b) if cond else 1`` and dropped
    # ``kl_weight_init`` whenever the config had no 'kld_weight'.
    if kl_weight_init is not None:
        klw_init = kl_weight_init
    elif 'kld_weight' in self._config:
        klw_init = float(self._config['kld_weight'])
    else:
        klw_init = 1

    # %s keeps fractional weights visible (%d truncated them).
    logger.info('kl_weight_init: %s', klw_init)
    kl_weight = shared_floatx(klw_init, 'kl_weight')
    entropy_weight = shared_floatx(1., 'entropy_weight')

    cost, p_at_1, _, KLD, logpy_xz, pat1_recog, misclassify_rate = build_model_new(
        fea2obj, len(t2idx), self._config, kl_weight, entropy_weight)

    cg = ComputationGraph(cost)

    weights = VariableFilter(roles=[WEIGHT])(cg.parameters)
    logger.info('Model weights are: %s', weights)
    if 'L2' in reg_type:
        cost += reg_weight * l2_norm(weights)
        logger.info('applying %s with weight: %f ', reg_type, reg_weight)

    # Dropout is disabled: with a negative rate the branch never runs.
    dropout = -0.1
    if dropout > 0:
        cg = apply_dropout(cg, weights, dropout)
        cost = cg.outputs[0]

    cost.name = 'cost'
    logger.info('Our Algorithm is : %s, and learning_rate: %f', steprule,
                learning_rate)
    if 'adagrad' in steprule:
        cnf_step_rule = AdaGrad(learning_rate)
    elif 'adadelta' in steprule:
        cnf_step_rule = AdaDelta(decay_rate=0.95)
    elif 'decay' in steprule:
        cnf_step_rule = RMSProp(learning_rate=learning_rate, decay_rate=0.90)
        cnf_step_rule = CompositeRule([cnf_step_rule, StepClipping(1)])
    elif 'momentum' in steprule:
        cnf_step_rule = Momentum(learning_rate=learning_rate, momentum=0.9)
    elif 'adam' in steprule:
        cnf_step_rule = Adam(learning_rate=learning_rate)
    else:
        # Stop here: continuing would hit an unbound ``cnf_step_rule``.
        raise ValueError(
            'The steprule param is wrong! which is: %s' % (steprule,))

    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=cnf_step_rule,
                                on_unused_sources='warn')
    #algorithm.add_updates(updates)
    gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
    step_norm = aggregation.mean(algorithm.total_step_norm)
    monitored_vars = [
        cost, gradient_norm, step_norm, p_at_1, KLD, logpy_xz, kl_weight,
        pat1_recog
    ]
    train_monitor = TrainingDataMonitoring(variables=monitored_vars,
                                           after_batch=True,
                                           before_first_epoch=True,
                                           prefix='tra')

    dev_monitor = DataStreamMonitoring(variables=[
        cost, p_at_1, KLD, logpy_xz, pat1_recog, misclassify_rate
    ],
                                       after_epoch=True,
                                       before_first_epoch=True,
                                       data_stream=dev_stream,
                                       prefix="dev")

    extensions = [
        dev_monitor,
        train_monitor,
        Timing(),
        TrackTheBest('dev_cost'),
        FinishIfNoImprovementAfter('dev_cost_best_so_far',
                                   epochs=wait_epochs),
        Printing(after_batch=False),  #, ProgressBar()
        FinishAfter(after_n_epochs=n_epochs),
        saveload.Load(networkfile + '.toload.pkl'),
    ] + track_best('dev_cost', networkfile + '.best.pkl')

    #extensions.append(SharedVariableModifier(kl_weight,
    #                                      lambda n, klw: numpy.cast[theano.config.floatX] (klw_inc_rate + klw), after_epoch=False, every_n_epochs=klw_ep, after_batch=False))
    #         extensions.append(SharedVariableModifier(entropy_weight,
    #                                                   lambda n, crw: numpy.cast[theano.config.floatX](crw - klw_inc_rate), after_epoch=False, every_n_epochs=klw_ep, after_batch=False))

    logger.info('number of parameters in the model: %d',
                tensor.sum([p.size for p in cg.parameters]).eval())
    logger.info('Lookup table sizes: %s',
                [p.size.eval() for p in cg.parameters if 'lt' in p.name])

    main_loop = MainLoop(data_stream=train_stream,
                         algorithm=algorithm,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()
# Cross-entropy between the targets y and the predictions y_hat.
cost = CategoricalCrossEntropy().apply(y, y_hat)
cg = ComputationGraph(cost)
# The unpacking requires the graph to contain exactly two WEIGHT variables.
W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
# Add a squared-L2 penalty on both weight matrices (coefficient 0.005 each).
cost = cost + 0.005 * (W1**2).sum() + 0.005 * (W2**2).sum()
cost.name = "loss"

#
# the actual training of the model
#
# Sequential batches of 512 over the dataset, AdaGrad with default settings
# on the parameters of the unregularised graph built above.
main = MainLoop(
    data_stream=DataStream.default_stream(dataset,
                                          iteration_scheme=SequentialScheme(
                                              dataset.num_instances,
                                              batch_size=512)),
    algorithm=GradientDescent(cost=cost,
                              parameters=cg.parameters,
                              step_rule=AdaGrad()),
    extensions=[
        ProgressBar(),
        #FinishAfter(after_n_epochs=10),
        #Printing(),
        TrainingDataMonitoring(variables=[cost], after_batch=True),
        SaveWeights(layers=[W1, W2], prefixes=["./data/w1", "./data/w2"]),
        #VisualizeWordVectors(layers=[W1, W2], labels=dataset.word_dict),
    ])
# NOTE(review): FinishAfter is commented out and no other stopping extension
# is visible here - confirm how this loop is meant to terminate.
main.run()
# Report the time elapsed since time1 (set outside this excerpt).
time2 = time.time()
print "time for building the model: " + str(time2 - time1)

######### training ##################
# Number of epochs: 15 unless the config provides "n_epochs".
n_epochs = 15
if "n_epochs" in config:
    n_epochs = int(config["n_epochs"])

model = Model([cost])
print model.get_parameter_dict()

# Plain SGD (Scale) is the default step rule; it is replaced below when the
# config contains one of the optimizer keys, checked in this order.
curStepRule = Scale(learning_rate=learning_rate)
if "adagrad" in config:
    print "using adagrad"
    curStepRule = AdaGrad(learning_rate=learning_rate)
elif "adadelta" in config:
    print "using adadelta"
    curStepRule = AdaDelta()
elif "momentum" in config:
    print "using momentum"
    # The value stored under "momentum" is the momentum coefficient.
    mWeight = float(config["momentum"])
    curStepRule = Momentum(learning_rate=learning_rate, momentum=mWeight)
else:
    print "using traditional SGD"

algorithm = GradientDescent(cost=cost, parameters=params,
                            step_rule=curStepRule,
                            on_unused_sources='warn')
extensions = []
def main(job_id, params): config = ConfigParser.ConfigParser() config.readfp(open('./params')) max_epoch = int(config.get('hyperparams', 'max_iter', 100)) base_lr = float(config.get('hyperparams', 'base_lr', 0.01)) train_batch = int(config.get('hyperparams', 'train_batch', 256)) valid_batch = int(config.get('hyperparams', 'valid_batch', 512)) test_batch = int(config.get('hyperparams', 'valid_batch', 512)) hidden_units = int(config.get('hyperparams', 'hidden_units', 16)) W_sd = float(config.get('hyperparams', 'W_sd', 0.01)) W_mu = float(config.get('hyperparams', 'W_mu', 0.0)) b_sd = float(config.get('hyperparams', 'b_sd', 0.01)) b_mu = float(config.get('hyperparams', 'b_mu', 0.0)) dropout_ratio = float(config.get('hyperparams', 'dropout_ratio', 0.2)) weight_decay = float(config.get('hyperparams', 'weight_decay', 0.001)) max_norm = float(config.get('hyperparams', 'max_norm', 100.0)) solver = config.get('hyperparams', 'solver_type', 'rmsprop') data_file = config.get('hyperparams', 'data_file') fine_tune = config.getboolean('hyperparams', 'fine_tune') # Spearmint optimization parameters: if params: base_lr = float(params['base_lr'][0]) dropout_ratio = float(params['dropout_ratio'][0]) hidden_units = params['hidden_units'][0] weight_decay = params['weight_decay'][0] if 'adagrad' in solver: solver_type = CompositeRule([ AdaGrad(learning_rate=base_lr), VariableClipping(threshold=max_norm) ]) else: solver_type = CompositeRule([ RMSProp(learning_rate=base_lr), VariableClipping(threshold=max_norm) ]) rn_file = '/projects/francisco/repositories/NI-ML/models/deepnets/blocks/ff/models/rnet/2015-06-25-18:13' ln_file = '/projects/francisco/repositories/NI-ML/models/deepnets/blocks/ff/models/lnet/2015-06-29-11:45' right_dim = 10519 left_dim = 11427 train = H5PYDataset(data_file, which_set='train') valid = H5PYDataset(data_file, which_set='valid') test = H5PYDataset(data_file, which_set='test') l_x = tensor.matrix('l_features') r_x = tensor.matrix('r_features') y = 
tensor.lmatrix('targets') lnet = load(ln_file).model.get_top_bricks()[0] rnet = load(rn_file).model.get_top_bricks()[0] # Pre-trained layers: # Inputs -> hidden_1 -> hidden 2 for side, net in zip(['l', 'r'], [lnet, rnet]): for child in net.children: child.name = side + '_' + child.name ll1 = lnet.children[0] lr1 = lnet.children[1] ll2 = lnet.children[2] lr2 = lnet.children[3] rl1 = rnet.children[0] rr1 = rnet.children[1] rl2 = rnet.children[2] rr2 = rnet.children[3] l_h = lr2.apply(ll2.apply(lr1.apply(ll1.apply(l_x)))) r_h = rr2.apply(rl2.apply(rr1.apply(rl1.apply(r_x)))) input_dim = ll2.output_dim + rl2.output_dim # hidden_2 -> hidden_3 -> hidden_4 -> Logistic output output_mlp = MLP(activations=[ Rectifier(name='h3'), Rectifier(name='h4'), Softmax(name='output'), ], dims=[ input_dim, hidden_units, hidden_units, 2, ], weights_init=IsotropicGaussian(std=W_sd, mean=W_mu), biases_init=IsotropicGaussian(std=W_sd, mean=W_mu)) output_mlp.initialize() # # Concatenate the inputs from the two hidden subnets into a single variable # # for input into the next layer. merge = tensor.concatenate([l_h, r_h], axis=1) # y_hat = output_mlp.apply(merge) # Define a cost function to optimize, and a classification error rate. 
# Also apply the outputs from the net and corresponding targets: cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat) error = MisclassificationRate().apply(y.flatten(), y_hat) error.name = 'error' # This is the model: before applying dropout model = Model(cost) # Need to define the computation graph for the cost func: cost_graph = ComputationGraph([cost]) # This returns a list of weight vectors for each layer W = VariableFilter(roles=[WEIGHT])(cost_graph.variables) # Add some regularization to this model: cost += weight_decay * l2_norm(W) cost.name = 'entropy' # computational graph with l2 reg cost_graph = ComputationGraph([cost]) # Apply dropout to inputs: inputs = VariableFilter([INPUT])(cost_graph.variables) dropout_inputs = [ input for input in inputs if input.name.startswith('linear_') ] dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]], 0.2) dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:], dropout_ratio) dropout_cost = dropout_graph.outputs[0] dropout_cost.name = 'dropout_entropy' # If no fine-tuning of l-r models is wanted, find the params for only # the joint layers: if fine_tune: params_to_update = dropout_graph.parameters else: params_to_update = VariableFilter( [PARAMETER], bricks=output_mlp.children)(cost_graph) # Learning Algorithm: algo = GradientDescent(step_rule=solver_type, params=params_to_update, cost=dropout_cost) # algo.step_rule.learning_rate.name = 'learning_rate' # Data stream used for training model: training_stream = Flatten( DataStream.default_stream(dataset=train, iteration_scheme=ShuffledScheme( train.num_examples, batch_size=train_batch))) training_monitor = TrainingDataMonitoring([ dropout_cost, aggregation.mean(error), aggregation.mean(algo.total_gradient_norm) ], after_batch=True) # Use the 'valid' set for validation during training: validation_stream = Flatten( DataStream.default_stream(dataset=valid, iteration_scheme=ShuffledScheme( valid.num_examples, batch_size=valid_batch))) validation_monitor 
= DataStreamMonitoring(variables=[cost, error], data_stream=validation_stream, prefix='validation', after_epoch=True) test_stream = Flatten( DataStream.default_stream( dataset=test, iteration_scheme=ShuffledScheme(test.num_examples, batch_size=test_batch))) test_monitor = DataStreamMonitoring(variables=[error], data_stream=test_stream, prefix='test', after_training=True) plotting = Plot( 'AdniNet_LeftRight', channels=[ ['dropout_entropy'], ['error', 'validation_error'], ], ) # Checkpoint class used to save model and log: stamp = datetime.datetime.fromtimestamp( time.time()).strftime('%Y-%m-%d-%H:%M') checkpoint = Checkpoint('./models/{}'.format(stamp), save_separately=['model', 'log'], every_n_epochs=1) # The main loop will train the network and output reports, etc main_loop = MainLoop(data_stream=training_stream, model=model, algorithm=algo, extensions=[ validation_monitor, training_monitor, plotting, FinishAfter(after_n_epochs=max_epoch), FinishIfNoImprovementAfter( notification_name='validation_error', epochs=1), Printing(), ProgressBar(), checkpoint, test_monitor, ]) main_loop.run() ve = float(main_loop.log.last_epoch_row['validation_error']) te = float(main_loop.log.last_epoch_row['error']) spearmint_loss = ve + abs(te - ve) print 'Spearmint Loss: {}'.format(spearmint_loss) return spearmint_loss
def build_theano_functions(self) : # shape of theano inpu is time+1 X features x = T.fmatrix('frequency_sequence') x = x.reshape((self.batch_dim, self.time_dim+1, self.input_dim)) y = x[:,1:self.time_dim+1,:] x = x[:,:self.time_dim,:] layers_input = [x] dims =np.array([self.input_dim]) for dim in self.lstm_layers_dim : dims = np.append(dims, dim) print "Dimensions =", dims # layer is just an index of the layer for layer in range(len(self.lstm_layers_dim)) : # before the cell, input, forget and output gates, x needs to # be transformed linear = Linear(dims[layer], dims[layer+1]*4, weights_init=Orthogonal(self.orth_scale), #weights_init=IsotropicGaussian(mean=1.,std=1), biases_init=Constant(0), name="linear"+str(layer)) linear.initialize() lstm_input = linear.apply(layers_input[layer]) # the lstm wants batch X time X value lstm = LSTM( dim=dims[layer+1], weights_init=IsotropicGaussian(mean=0.,std=0.5), biases_init=Constant(1), name="lstm"+str(layer)) lstm.initialize() # hack to use Orthogonal on lstm w_state lstm.W_state.set_value( self.orth_scale*Orthogonal().generate(np.random, lstm.W_state.get_value().shape)) h, _dummy = lstm.apply(lstm_input) layers_input.append(h) # the idea is to have one gaussian parametrize every frequency bin print "Last linear transform dim :", dims[1:].sum() output_transform = Linear(dims[1:].sum(), self.output_dim, #weights_init=IsotropicGaussian(mean=0., std=1), weights_init=Orthogonal(self.orth_scale), biases_init=Constant(0), #use_bias=False, name="output_transform") output_transform.initialize() if len(self.lstm_layers_dim) == 1 : print "hallo there, only one layer speaking" y_hat = output_transform.apply(layers_input[-1]) else : y_hat = output_transform.apply(T.concatenate(layers_input[1:], axis=2)) sig = T.nnet.relu(y_hat[:,:,:self.output_dim/2])+0.05 mus = y_hat[:,:,self.output_dim/2:] # sum likelihood with targets # sum inside log accross mixtures, sum outside log accross time inside_expo = -0.5*((y-mus)**2)/sig**2 expo = 
T.exp(inside_expo) coeff = 1./(T.sqrt(2.*np.pi)*sig) inside_log = T.log(coeff*expo) inside_log_max = T.max(inside_log, axis=2, keepdims=True) LL = -(inside_log_max + T.log(T.sum(T.exp(inside_log - inside_log_max), axis=2, keepdims=True))).sum() LL.name = "summed_likelihood" model = Model(LL) self.model = model algorithm = GradientDescent( cost=LL, parameters=model.parameters, step_rule=AdaGrad()) f = theano.function([x],[sig, mus]) return algorithm, f