def test_remove_not_finite():
    rule1 = RemoveNotFinite()
    rule2 = RemoveNotFinite(1.)

    gradients = {1: shared_floatx(numpy.nan), 2: shared_floatx(numpy.inf)}
    rval1, _ = rule1.compute_steps(gradients)
    assert_allclose(rval1[1].eval(), 0.1)
    assert_allclose(rval1[2].eval(), 0.2)
    rval2, _ = rule2.compute_steps(gradients)
    assert_allclose(rval2[1].eval(), 1.0)
    assert_allclose(rval2[2].eval(), 2.0)
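# NOTE (added illustration, not library source): the values asserted above follow
# from RemoveNotFinite replacing a non-finite step with scaler * parameter
# (default scaler 0.1), so "parameters" 1 and 2 map to 0.1/0.2 and, with
# scaler=1., to 1.0/2.0. A minimal, paraphrased sketch of that behaviour:
import theano.tensor as tensor

def remove_not_finite_step(parameter, previous_step, scaler=0.1):
    # If the proposed step contains NaN/Inf anywhere, fall back to a small
    # multiple of the parameter itself; otherwise keep the step unchanged.
    not_finite = (tensor.isnan(previous_step).any() |
                  tensor.isinf(previous_step).any())
    return tensor.switch(not_finite, scaler * parameter, previous_step)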
def learning_algorithm(args):
    name = args.algorithm
    learning_rate = float(args.learning_rate)
    momentum = args.momentum
    clipping_threshold = args.clipping
    if name == 'adam':
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        adam = Adam(learning_rate=learning_rate)
        # [adam, clipping] means 'step clipping'
        # [clipping, adam] means 'gradient clipping'
        step_rule = CompositeRule([adam, clipping])
    elif name == 'rms_prop':
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        rms_prop = RMSProp(learning_rate=learning_rate)
        rm_non_finite = RemoveNotFinite()
        step_rule = CompositeRule([clipping, rms_prop, rm_non_finite])
    else:
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum)
        rm_non_finite = RemoveNotFinite()
        step_rule = CompositeRule([clipping, sgd_momentum, rm_non_finite])
    return step_rule
def learning_algorithm(learning_rate, momentum=0.0,
                       clipping_threshold=100, algorithm='sgd'):
    if algorithm == 'adam':
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        adam = Adam(learning_rate=learning_rate)
        # [adam, clipping] means 'step clipping'
        # [clipping, adam] means 'gradient clipping'
        step_rule = CompositeRule([adam, clipping])
    elif algorithm == 'rms_prop':
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        rms_prop = RMSProp(learning_rate=learning_rate)
        rm_non_finite = RemoveNotFinite()
        step_rule = CompositeRule([clipping, rms_prop, rm_non_finite])
    elif algorithm == 'sgd':
        clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))
        sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum)
        rm_non_finite = RemoveNotFinite()
        step_rule = CompositeRule([clipping, sgd_momentum, rm_non_finite])
    else:
        raise NotImplementedError
    return step_rule
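# Hypothetical call site for the helper above; `cost` is assumed to be defined
# elsewhere and is not part of the original snippet.
step_rule = learning_algorithm(learning_rate=1e-3, momentum=0.9,
                               clipping_threshold=10.0, algorithm='rms_prop')
algorithm = GradientDescent(cost=cost,
                            parameters=ComputationGraph(cost).parameters,
                            step_rule=step_rule)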
def get_optimizer(self):
    optimizer_dict = self.config['optimizer'].copy()
    lr = optimizer_dict['learning_rate']
    optimizer_param_dict = {k: optimizer_dict[k] for k
                            in set(optimizer_dict.keys()) - set(['type', 'learning_rate'])}
    if len(optimizer_param_dict.keys()) > 0:
        optimizer = self.Optimizer(learning_rate=lr, **optimizer_param_dict)
    else:
        optimizer = self.Optimizer(lr)
    if dmconfig.floatX == 'float16':
        print "WARNING: float16 makes the training unstable, inserting RemoveNotFinite in optimizer"
        optimizer = CompositeRule([optimizer, RemoveNotFinite()])
    return optimizer
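# Illustrative shape of the config entry get_optimizer consumes: 'type' and
# 'learning_rate' are stripped and every remaining key (the ones below are
# made-up examples) is forwarded to the optimizer constructor as **kwargs.
config = {
    'optimizer': {
        'type': 'Adam',          # resolved elsewhere to self.Optimizer
        'learning_rate': 1e-3,
        'decay_factor': 0.95,    # example extra keyword argument
    }
}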
print f(data['y'], data['x'], data['is_for_test'], data['drops'])

if not os.path.exists(save_path):
    os.makedirs(save_path)
log_path = save_path + '/log.txt'
fh = logging.FileHandler(filename=log_path)
fh.setLevel(logging.DEBUG)
logger.addHandler(fh)

print 'Building training process...'
model = Model(cost)
params = ComputationGraph(cost).parameters
print_num_params(params)
clipping = StepClipping(threshold=np.cast[floatX](1.0))
# Momentum(learning_rate=args.learning_rate, momentum=0.9)
rm_non_finite = RemoveNotFinite()
rms_prop = RMSProp(learning_rate=1e-3, decay_rate=0.5)
step_rule = CompositeRule([clipping, rms_prop, rm_non_finite])
algorithm = GradientDescent(
    cost=cost, parameters=params, step_rule=step_rule)

# train_stream, valid_stream = get_seq_mnist_streams(
#     h_dim, batch_size, update_prob)
train_stream = get_stream('train', batch_size, h_dim, False)
train_stream_evaluation = get_stream('train', batch_size, h_dim, True)
valid_stream = get_stream('valid', batch_size, h_dim, True)

if load_path:
    with open(load_path + '/trained_params_best.npz') as f:
def initialize_all(config, save_path, bokeh_name, params, bokeh_server, bokeh, test_tag, use_load_ext, load_log, fast_start): root_path, extension = os.path.splitext(save_path) data = Data(**config['data']) train_conf = config['training'] recognizer = create_model(config, data, test_tag) # Separate attention_params to be handled differently # when regularization is applied attention = recognizer.generator.transition.attention attention_params = Selector(attention).get_parameters().values() logger.info( "Initialization schemes for all bricks.\n" "Works well only in my branch with __repr__ added to all them,\n" "there is an issue #463 in Blocks to do that properly.") def show_init_scheme(cur): result = dict() for attr in dir(cur): if attr.endswith('_init'): result[attr] = getattr(cur, attr) for child in cur.children: result[child.name] = show_init_scheme(child) return result logger.info(pprint.pformat(show_init_scheme(recognizer))) prediction, prediction_mask = add_exploration(recognizer, data, train_conf) # # Observables: # primary_observables = [] # monitored each batch secondary_observables = [] # monitored every 10 batches validation_observables = [] # monitored on the validation set cg = recognizer.get_cost_graph(batch=True, prediction=prediction, prediction_mask=prediction_mask) labels, = VariableFilter(applications=[recognizer.cost], name='labels')(cg) labels_mask, = VariableFilter(applications=[recognizer.cost], name='labels_mask')(cg) gain_matrix = VariableFilter( theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg) if len(gain_matrix): gain_matrix, = gain_matrix primary_observables.append(rename(gain_matrix.min(), 'min_gain')) primary_observables.append(rename(gain_matrix.max(), 'max_gain')) batch_cost = cg.outputs[0].sum() batch_size = rename(recognizer.labels.shape[1], "batch_size") # Assumes constant batch size. `aggregation.mean` is not used because # of Blocks #514. cost = batch_cost / batch_size cost.name = "sequence_total_cost" logger.info("Cost graph is built") # Fetch variables useful for debugging. # It is important not to use any aggregation schemes here, # as it's currently impossible to spread the effect of # regularization on their variables, see Blocks #514. 
cost_cg = ComputationGraph(cost) r = recognizer energies, = VariableFilter(applications=[r.generator.readout.readout], name="output_0")(cost_cg) bottom_output = VariableFilter( # We need name_regex instead of name because LookupTable calls itsoutput output_0 applications=[r.bottom.apply], name_regex="output")(cost_cg)[-1] attended, = VariableFilter(applications=[r.generator.transition.apply], name="attended")(cost_cg) attended_mask, = VariableFilter(applications=[ r.generator.transition.apply ], name="attended_mask")(cost_cg) weights, = VariableFilter(applications=[r.generator.evaluate], name="weights")(cost_cg) from blocks.roles import AUXILIARY l2_cost, = VariableFilter(roles=[AUXILIARY], theano_name='l2_cost_aux')(cost_cg) cost_forward, = VariableFilter(roles=[AUXILIARY], theano_name='costs_forward_aux')(cost_cg) max_recording_length = rename(bottom_output.shape[0], "max_recording_length") # To exclude subsampling related bugs max_attended_mask_length = rename(attended_mask.shape[0], "max_attended_mask_length") max_attended_length = rename(attended.shape[0], "max_attended_length") max_num_phonemes = rename(labels.shape[0], "max_num_phonemes") min_energy = rename(energies.min(), "min_energy") max_energy = rename(energies.max(), "max_energy") mean_attended = rename(abs(attended).mean(), "mean_attended") mean_bottom_output = rename( abs(bottom_output).mean(), "mean_bottom_output") weights_penalty = rename(monotonicity_penalty(weights, labels_mask), "weights_penalty") weights_entropy = rename(entropy(weights, labels_mask), "weights_entropy") mask_density = rename(labels_mask.mean(), "mask_density") cg = ComputationGraph([ cost, weights_penalty, weights_entropy, min_energy, max_energy, mean_attended, mean_bottom_output, batch_size, max_num_phonemes, mask_density ]) # Regularization. It is applied explicitly to all variables # of interest, it could not be applied to the cost only as it # would not have effect on auxiliary variables, see Blocks #514. reg_config = config.get('regularization', dict()) regularized_cg = cg if reg_config.get('dropout'): logger.info('apply dropout') regularized_cg = apply_dropout(cg, [bottom_output], 0.5) if reg_config.get('noise'): logger.info('apply noise') noise_subjects = [ p for p in cg.parameters if p not in attention_params ] regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise']) train_cost = regularized_cg.outputs[0] if reg_config.get("penalty_coof", .0) > 0: # big warning!!! 
# here we assume that: # regularized_weights_penalty = regularized_cg.outputs[1] train_cost = (train_cost + reg_config.get("penalty_coof", .0) * regularized_cg.outputs[1] / batch_size) if reg_config.get("decay", .0) > 0: train_cost = ( train_cost + reg_config.get("decay", .0) * l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters))**2) train_cost = rename(train_cost, 'train_cost') gradients = None if reg_config.get('adaptive_noise'): logger.info('apply adaptive noise') if ((reg_config.get("penalty_coof", .0) > 0) or (reg_config.get("decay", .0) > 0)): logger.error('using adaptive noise with alignment weight panalty ' 'or weight decay is probably stupid') train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise( cg, cg.outputs[0], variables=cg.parameters, num_examples=data.get_dataset('train').num_examples, parameters=Model( regularized_cg.outputs[0]).get_parameter_dict().values(), **reg_config.get('adaptive_noise')) train_cost.name = 'train_cost' adapt_noise_cg = ComputationGraph(train_cost) model_prior_mean = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_mean')(adapt_noise_cg)[0], 'model_prior_mean') model_cost = rename( VariableFilter(applications=[noise_brick.apply], name='model_cost')(adapt_noise_cg)[0], 'model_cost') model_prior_variance = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_variance')(adapt_noise_cg)[0], 'model_prior_variance') regularized_cg = ComputationGraph( [train_cost, model_cost] + regularized_cg.outputs + [model_prior_mean, model_prior_variance]) primary_observables += [ regularized_cg.outputs[1], # model cost regularized_cg.outputs[2], # task cost regularized_cg.outputs[-2], # model prior mean regularized_cg.outputs[-1] ] # model prior variance model = Model(train_cost) if params: logger.info("Load parameters from " + params) # please note: we cannot use recognizer.load_params # as it builds a new computation graph that dies not have # shapred variables added by adaptive weight noise with open(params, 'r') as src: param_values = load_parameters(src) model.set_parameter_values(param_values) parameters = model.get_parameter_dict() logger.info("Parameters:\n" + pprint.pformat([(key, parameters[key].get_value().shape) for key in sorted(parameters.keys())], width=120)) # Define the training algorithm. 
clipping = StepClipping(train_conf['gradient_threshold']) clipping.threshold.name = "gradient_norm_threshold" rule_names = train_conf.get('rules', ['momentum']) core_rules = [] if 'momentum' in rule_names: logger.info("Using scaling and momentum for training") core_rules.append(Momentum(train_conf['scale'], train_conf['momentum'])) if 'adadelta' in rule_names: logger.info("Using AdaDelta for training") core_rules.append( AdaDelta(train_conf['decay_rate'], train_conf['epsilon'])) max_norm_rules = [] if reg_config.get('max_norm', False) > 0: logger.info("Apply MaxNorm") maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters) if reg_config.get('max_norm_exclude_lookup', False): maxnorm_subjects = [ v for v in maxnorm_subjects if not isinstance(get_brick(v), LookupTable) ] logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat( [name for name, p in parameters.items() if p in maxnorm_subjects])) logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([ name for name, p in parameters.items() if not p in maxnorm_subjects ])) max_norm_rules = [ Restrict(VariableClipping(reg_config['max_norm'], axis=0), maxnorm_subjects) ] burn_in = [] if train_conf.get('burn_in_steps', 0): burn_in.append(BurnIn(num_steps=train_conf['burn_in_steps'])) algorithm = GradientDescent( cost=train_cost, parameters=parameters.values(), gradients=gradients, step_rule=CompositeRule( [clipping] + core_rules + max_norm_rules + # Parameters are not changed at all # when nans are encountered. [RemoveNotFinite(0.0)] + burn_in), on_unused_sources='warn') logger.debug("Scan Ops in the gradients") gradient_cg = ComputationGraph(algorithm.gradients.values()) for op in ComputationGraph(gradient_cg).scans: logger.debug(op) # More variables for debugging: some of them can be added only # after the `algorithm` object is created. secondary_observables += list(regularized_cg.outputs) if not 'train_cost' in [v.name for v in secondary_observables]: secondary_observables += [train_cost] secondary_observables += [ algorithm.total_step_norm, algorithm.total_gradient_norm, clipping.threshold ] for name, param in parameters.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements**0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements**0.5 step_norm = algorithm.steps[param].norm(2) / num_elements**0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' secondary_observables.append(stats) primary_observables += [ train_cost, algorithm.total_gradient_norm, algorithm.total_step_norm, clipping.threshold, max_recording_length, max_attended_length, max_attended_mask_length ] validation_observables += [ rename(aggregation.mean(batch_cost, batch_size), cost.name), rename(aggregation.sum_(batch_size), 'num_utterances'), weights_entropy, weights_penalty ] def attach_aggregation_schemes(variables): # Aggregation specification has to be factored out as a separate # function as it has to be applied at the very last stage # separately to training and validation observables. result = [] for var in variables: if var.name == 'weights_penalty': result.append( rename(aggregation.mean(var, batch_size), 'weights_penalty_per_recording')) elif var.name == 'weights_entropy': result.append( rename(aggregation.mean(var, labels_mask.sum()), 'weights_entropy_per_label')) else: result.append(var) return result mon_conf = config['monitoring'] # Build main loop. 
logger.info("Initialize extensions") extensions = [] if use_load_ext and params: extensions.append( Load(params, load_iteration_state=True, load_log=True)) if load_log and params: extensions.append(LoadLog(params)) extensions += [ Timing(after_batch=True), CGStatistics(), #CodeVersion(['lvsr']), ] extensions.append( TrainingDataMonitoring(primary_observables + [l2_cost, cost_forward], after_batch=True)) average_monitoring = TrainingDataMonitoring( attach_aggregation_schemes(secondary_observables), prefix="average", every_n_batches=10) extensions.append(average_monitoring) validation = DataStreamMonitoring( attach_aggregation_schemes(validation_observables + [l2_cost, cost_forward]), data.get_stream("valid", shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['validate_every_epochs'], every_n_batches=mon_conf['validate_every_batches'], after_training=False) extensions.append(validation) per = PhonemeErrorRate(recognizer, data, **config['monitoring']['search']) per_monitoring = DataStreamMonitoring( [per], data.get_stream("valid", batches=False, shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['search_every_epochs'], every_n_batches=mon_conf['search_every_batches'], after_training=False) extensions.append(per_monitoring) track_the_best_per = TrackTheBest( per_monitoring.record_name(per)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_cost = TrackTheBest( validation.record_name(cost)).set_conditions(before_first_epoch=True, after_epoch=True) extensions += [track_the_best_cost, track_the_best_per] extensions.append( AdaptiveClipping(algorithm.total_gradient_norm.name, clipping, train_conf['gradient_threshold'], decay_rate=0.998, burnin_period=500)) extensions += [ SwitchOffLengthFilter( data.length_filter, after_n_batches=train_conf.get('stop_filtering')), FinishAfter(after_n_batches=train_conf.get('num_batches'), after_n_epochs=train_conf.get('num_epochs')).add_condition( ["after_batch"], _gradient_norm_is_none), ] channels = [ # Plot 1: training and validation costs [ average_monitoring.record_name(train_cost), validation.record_name(cost) ], # Plot 2: gradient norm, [ average_monitoring.record_name(algorithm.total_gradient_norm), average_monitoring.record_name(clipping.threshold) ], # Plot 3: phoneme error rate [per_monitoring.record_name(per)], # Plot 4: training and validation mean weight entropy [ average_monitoring._record_name('weights_entropy_per_label'), validation._record_name('weights_entropy_per_label') ], # Plot 5: training and validation monotonicity penalty [ average_monitoring._record_name('weights_penalty_per_recording'), validation._record_name('weights_penalty_per_recording') ] ] if bokeh: extensions += [ Plot(bokeh_name if bokeh_name else os.path.basename(save_path), channels, every_n_batches=10, server_url=bokeh_server), ] extensions += [ Checkpoint(save_path, before_first_epoch=not fast_start, after_epoch=True, every_n_batches=train_conf.get('save_every_n_batches'), save_separately=["model", "log"], use_cpickle=True).add_condition( ['after_epoch'], OnLogRecord(track_the_best_per.notification_name), (root_path + "_best" + extension, )).add_condition( ['after_epoch'], OnLogRecord(track_the_best_cost.notification_name), (root_path + "_best_ll" + extension, )), ProgressBar() ] extensions.append(EmbedIPython(use_main_loop_run_caller_env=True)) if config['net']['criterion']['name'].startswith('mse'): extensions.append( 
            LogInputsGains(labels, cg, recognizer.generator.readout.emitter,
                           data))

    if train_conf.get('patience'):
        patience_conf = train_conf['patience']
        if not patience_conf.get('notification_names'):
            # setdefault will not work for empty list
            patience_conf['notification_names'] = [
                track_the_best_per.notification_name,
                track_the_best_cost.notification_name]
        extensions.append(Patience(**patience_conf))

    extensions.append(
        Printing(every_n_batches=1, attribute_filter=PrintingFilterList()))

    return model, algorithm, data, extensions
def initialaze_algorithm(config, save_path, bokeh_name, params, bokeh_server, bokeh, use_load_ext, load_log, fast_start, recognizer, data, model, cg, regularized_cg, cost, train_cost, parameters, max_norm_rules, observables, batch_size, batch_cost, weights_entropy, labels_mask, labels, gradients=None): primary_observables = observables secondary_observables = [] validation_observables = [] root_path, extension = os.path.splitext(save_path) train_conf = config['training'] # Define the training algorithm. clipping = StepClipping(train_conf['gradient_threshold']) clipping.threshold.name = "gradient_norm_threshold" rule_names = train_conf.get('rules', ['momentum']) core_rules = [] if 'momentum' in rule_names: logger.info("Using scaling and momentum for training") core_rules.append(Momentum(train_conf['scale'], train_conf['momentum'])) if 'adadelta' in rule_names: logger.info("Using AdaDelta for training") core_rules.append(AdaDelta(train_conf['decay_rate'], train_conf['epsilon'])) if 'adam' in rule_names: assert len(rule_names) == 1 logger.info("Using Adam for training") core_rules.append( Adam(learning_rate=train_conf.get('scale', 0.002), beta1=train_conf.get('beta1', 0.1), beta2=train_conf.get('beta2', 0.001), epsilon=train_conf.get('epsilon', 1e-8), decay_factor=train_conf.get('decay_rate', (1 - 1e-8)))) burn_in = [] if train_conf.get('burn_in_steps', 0): burn_in.append( BurnIn(num_steps=train_conf['burn_in_steps'])) algorithm = GradientDescent( cost=train_cost, parameters=parameters.values(), gradients=gradients, step_rule=CompositeRule( [clipping] + core_rules + max_norm_rules + # Parameters are not changed at all # when nans are encountered. [RemoveNotFinite(0.0)] + burn_in), on_unused_sources='warn') #theano_func_kwargs={'mode':NanGuardMode(nan_is_error=True)}) logger.debug("Scan Ops in the gradients") gradient_cg = ComputationGraph(algorithm.gradients.values()) for op in ComputationGraph(gradient_cg).scans: logger.debug(op) # More variables for debugging: some of them can be added only # after the `algorithm` object is created. secondary_observables += list(regularized_cg.outputs) if not 'train_cost' in [v.name for v in secondary_observables]: secondary_observables += [train_cost] secondary_observables += [ algorithm.total_step_norm, algorithm.total_gradient_norm, clipping.threshold] for name, param in parameters.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements ** 0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5 step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' secondary_observables.append(stats) primary_observables += [ train_cost, algorithm.total_gradient_norm, algorithm.total_step_norm, clipping.threshold] validation_observables += [ rename(aggregation.mean(batch_cost, batch_size), cost.name), rename(aggregation.sum_(batch_size), 'num_utterances')] + weights_entropy def attach_aggregation_schemes(variables): # Aggregation specification has to be factored out as a separate # function as it has to be applied at the very last stage # separately to training and validation observables. 
result = [] for var in variables: if var.name.startswith('weights_entropy'): chld_id = recognizer.child_id_from_postfix(var.name) result.append(rename(aggregation.mean(var, labels_mask[chld_id].sum()), 'weights_entropy_per_label'+ recognizer.children[chld_id].names_postfix)) elif var.name.endswith('_nll'): chld_id = recognizer.child_id_from_postfix(var.name) result.append(rename(aggregation.mean(var.sum(), labels_mask[chld_id].sum()), var.name+'_per_label')) else: result.append(var) return result mon_conf = config['monitoring'] # Build main loop. logger.info("Initialize extensions") extensions = [] if use_load_ext and params: extensions.append(Load(params, load_iteration_state=True, load_log=True)) if load_log and params: extensions.append(LoadLog(params)) extensions += [ Timing(after_batch=True), CGStatistics(), #CodeVersion(['lvsr']), ] extensions.append(TrainingDataMonitoring( primary_observables, after_batch=True)) average_monitoring = TrainingDataMonitoring( attach_aggregation_schemes(secondary_observables), prefix="average", every_n_batches=10) extensions.append(average_monitoring) validation = DataStreamMonitoring( attach_aggregation_schemes(validation_observables), data.get_stream("valid", shuffle=False, **data_params_valid), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['validate_every_epochs'], every_n_batches=mon_conf['validate_every_batches'], after_training=False) extensions.append(validation) additional_patience_notifiers = [] uas = DependencyErrorRate(recognizer.children[0], data, **config['monitoring']['search']) las = AuxiliaryErrorRates(uas, name='LAS') lab = AuxiliaryErrorRates(uas, name='LAB') per_monitoring = DataStreamMonitoring( [uas, las, lab], data.get_one_stream("valid", data.langs[0], batches=False, shuffle=False, **data_params_valid)[0], prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['search_every_epochs'], every_n_batches=mon_conf['search_every_batches'], after_training=False) extensions.append(per_monitoring) track_the_best_uas = TrackTheBest( per_monitoring.record_name(uas)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_las = TrackTheBest( per_monitoring.record_name(las)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_lab = TrackTheBest( per_monitoring.record_name(lab)).set_conditions( before_first_epoch=True, after_epoch=True) extensions += [track_the_best_uas, track_the_best_las, track_the_best_lab, ] per = uas track_the_best_per = track_the_best_uas additional_patience_notifiers = [track_the_best_lab, track_the_best_las] track_the_best_cost = TrackTheBest( validation.record_name(cost)).set_conditions( before_first_epoch=True, after_epoch=True) extensions += [track_the_best_cost] extensions.append(AdaptiveClipping( algorithm.total_gradient_norm.name, clipping, train_conf['gradient_threshold'], decay_rate=0.998, burnin_period=500, num_stds=train_conf.get('clip_stds', 1.0))) extensions += [ SwitchOffLengthFilter( data.length_filter, after_n_batches=train_conf.get('stop_filtering')), FinishAfter(after_n_batches=train_conf['num_batches'], after_n_epochs=train_conf['num_epochs']), # .add_condition(["after_batch"], _gradient_norm_is_none), ] main_postfix = recognizer.children[0].names_postfix channels = [ # Plot 1: training and validation costs [average_monitoring.record_name(train_cost), validation.record_name(cost)], # Plot 2: gradient norm, [average_monitoring.record_name(algorithm.total_gradient_norm), 
average_monitoring.record_name(clipping.threshold)], # Plot 3: phoneme error rate [per_monitoring.record_name(per)], # Plot 4: training and validation mean weight entropy [average_monitoring._record_name('weights_entropy_per_label'+main_postfix), validation._record_name('weights_entropy_per_label'+main_postfix)], # Plot 5: training and validation monotonicity penalty [average_monitoring._record_name('weights_penalty_per_recording'+main_postfix), validation._record_name('weights_penalty_per_recording'+main_postfix)]] if bokeh: extensions += [ Plot(bokeh_name if bokeh_name else os.path.basename(save_path), channels, every_n_batches=10, server_url=bokeh_server),] extensions += [ Checkpoint(save_path, before_first_epoch=not fast_start, after_epoch=True, every_n_batches=train_conf.get('save_every_n_batches'), save_separately=["model", "log"], use_cpickle=True) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_per.notification_name), (root_path + "_best" + extension,)) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_cost.notification_name), (root_path + "_best_ll" + extension,)), ProgressBar()] extensions.append(EmbedIPython(use_main_loop_run_caller_env=True)) if train_conf.get('patience'): patience_conf = train_conf['patience'] if not patience_conf.get('notification_names'): # setdefault will not work for empty list patience_conf['notification_names'] = [ track_the_best_per.notification_name, track_the_best_cost.notification_name] + additional_patience_notifiers extensions.append(Patience(**patience_conf)) if train_conf.get('min_performance_stops'): extensions.append(EarlyTermination( param_name=track_the_best_per.best_name, min_performance_by_epoch=train_conf['min_performance_stops'])) extensions.append(Printing(every_n_batches=1, attribute_filter=PrintingFilterList())) return model, algorithm, data, extensions
def main(config, tr_stream, dev_stream): # Create Theano variables source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') sampling_input = tensor.lmatrix('input') # Construct model encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) decoder = Decoder(config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2) cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask) # Initialize model encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() cg = ComputationGraph(cost) # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog dropout_inputs = [ x for x in cg.intermediary_variables if x.name == 'maxout_apply_output' ] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # Apply weight noise for regularization if config['weight_noise_ff'] > 0.0: enc_params = Selector(encoder.lookup).get_params().values() enc_params += Selector(encoder.fwd_fork).get_params().values() enc_params += Selector(encoder.back_fork).get_params().values() dec_params = Selector( decoder.sequence_generator.readout).get_params().values() dec_params += Selector( decoder.sequence_generator.fork).get_params().values() dec_params += Selector( decoder.transition.initial_transformer).get_params().values() cg = apply_noise(cg, enc_params + dec_params, config['weight_noise_ff']) cost = cg.outputs[0] # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge( Selector(encoder).get_params(), Selector(decoder).get_params()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.iteritems(): logger.info(' {:15}: {}'.format(value.get_value().shape, name)) logger.info("Total number of parameters: {}".format( len(enc_dec_param_dict))) # Set up training algorithm if args.subtensor_fix: assert config['step_rule'] == 'AdaDelta' from subtensor_gradient import GradientDescent_SubtensorFix, AdaDelta_SubtensorFix, subtensor_params lookups = subtensor_params(cg, [ encoder.lookup, decoder.sequence_generator.readout.feedback_brick.lookup ]) algorithm = GradientDescent_SubtensorFix( subtensor_params=lookups, cost=cost, params=cg.parameters, step_rule=CompositeRule([ StepClipping(config['step_clipping']), RemoveNotFinite(0.9), AdaDelta_SubtensorFix(subtensor_params=lookups) ])) else: algorithm = GradientDescent(cost=cost, params=cg.parameters, step_rule=CompositeRule([ StepClipping(config['step_clipping']), RemoveNotFinite(0.9), eval(config['step_rule'])() ])) # Set up beam search and sampling computation graphs sampling_representation = encoder.apply(sampling_input, tensor.ones(sampling_input.shape)) generated = decoder.generate(sampling_input, sampling_representation) search_model = Model(generated) samples, = 
VariableFilter( bricks=[decoder.sequence_generator], name="outputs")(ComputationGraph( generated[1])) # generated[1] is the next_outputs # Set up training model training_model = Model(cost) # Set extensions extensions = [ Sampler(model=search_model, config=config, data_stream=tr_stream, src_eos_idx=config['src_eos_idx'], trg_eos_idx=config['trg_eos_idx'], every_n_batches=config['sampling_freq']), BleuValidator(sampling_input, samples=samples, config=config, model=search_model, data_stream=dev_stream, src_eos_idx=config['src_eos_idx'], trg_eos_idx=config['trg_eos_idx'], every_n_batches=config['bleu_val_freq']), TrainingDataMonitoring([cost], after_batch=True), #Plot('En-Fr', channels=[['decoder_cost_cost']], # after_batch=True), Printing(after_batch=True), Dump(config['saveto'], every_n_batches=config['save_freq']) ] # Reload model if necessary if config['reload']: extensions += [LoadFromDumpWMT15(config['saveto'])] # Initialize main loop main_loop = MainLoop(model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions) # Train! main_loop.run()
# How often (number of batches) to print / plot
monitor_freq = 20
batch_size = 200

# regularization : noise on the weights
weight_noise = 0.01
dropout = 0.2

# number of classes, a constant of the dataset
num_output_classes = 5

# the step rule (uncomment your favorite choice)
step_rule = CompositeRule([AdaDelta(), RemoveNotFinite()])
#step_rule = CompositeRule([Momentum(learning_rate=0.00001, momentum=0.99), RemoveNotFinite()])
#step_rule = CompositeRule([Momentum(learning_rate=0.1, momentum=0.9), RemoveNotFinite()])
#step_rule = CompositeRule([AdaDelta(), Scale(0.01), RemoveNotFinite()])
#step_rule = CompositeRule([RMSProp(learning_rate=0.1, decay_rate=0.95),
#                           RemoveNotFinite()])
#step_rule = CompositeRule([RMSProp(learning_rate=0.0001, decay_rate=0.95),
#                           BasicMomentum(momentum=0.9),
#                           RemoveNotFinite()])

# How the weights are initialized
weights_init = IsotropicGaussian(0.01)
biases_init = Constant(0.001)

# ==========================================================================================
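# For context (assumed surrounding code, not part of the config file above):
# the selected step_rule is typically handed to Blocks' GradientDescent.
cg = ComputationGraph(cost)          # `cost` assumed to be defined elsewhere
algorithm = GradientDescent(cost=cost,
                            parameters=cg.parameters,
                            step_rule=step_rule)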
def main(name, epochs, batch_size, learning_rate): if name is None: name = "att-rw" print("\nRunning experiment %s" % name) print(" learning rate: %5.3f" % learning_rate) print() #------------------------------------------------------------------------ img_height, img_width = 28, 28 read_N = 12 write_N = 14 inits = { #'weights_init': Orthogonal(), 'weights_init': IsotropicGaussian(0.001), 'biases_init': Constant(0.), } x_dim = img_height * img_width reader = ZoomableAttentionWindow(img_height, img_width, read_N) writer = ZoomableAttentionWindow(img_height, img_width, write_N) # Parameterize the attention reader and writer mlpr = MLP(activations=[Tanh(), Identity()], dims=[x_dim, 50, 5], name="RMLP", **inits) mlpw = MLP(activations=[Tanh(), Identity()], dims=[x_dim, 50, 5], name="WMLP", **inits) # MLP between the reader and writer mlp = MLP(activations=[Tanh(), Identity()], dims=[read_N**2, 300, write_N**2], name="MLP", **inits) for brick in [mlpr, mlpw, mlp]: brick.allocate() brick.initialize() #------------------------------------------------------------------------ x = tensor.matrix('features') hr = mlpr.apply(x) hw = mlpw.apply(x) center_y, center_x, delta, sigma, gamma = reader.nn2att(hr) r = reader.read(x, center_y, center_x, delta, sigma) h = mlp.apply(r) center_y, center_x, delta, sigma, gamma = writer.nn2att(hw) c = writer.write(h, center_y, center_x, delta, sigma) / gamma x_recons = T.nnet.sigmoid(c) cost = BinaryCrossEntropy().apply(x, x_recons) cost.name = "cost" #------------------------------------------------------------ cg = ComputationGraph([cost]) params = VariableFilter(roles=[PARAMETER])(cg.variables) algorithm = GradientDescent( cost=cost, params=params, step_rule=CompositeRule([ RemoveNotFinite(), Adam(learning_rate), StepClipping(3.), ]) #step_rule=RMSProp(learning_rate), #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95) ) #------------------------------------------------------------------------ # Setup monitors monitors = [cost] #for v in [center_y, center_x, log_delta, log_sigma, log_gamma]: # v_mean = v.mean() # v_mean.name = v.name # monitors += [v_mean] # monitors += [aggregation.mean(v)] train_monitors = monitors[:] train_monitors += [aggregation.mean(algorithm.total_gradient_norm)] train_monitors += [aggregation.mean(algorithm.total_step_norm)] # Live plotting... plot_channels = [ ["cost"], ] #------------------------------------------------------------ mnist_train = BinarizedMNIST("train", sources=['features']) mnist_test = BinarizedMNIST("test", sources=['features']) #mnist_train = MNIST("train", binary=True, sources=['features']) #mnist_test = MNIST("test", binary=True, sources=['features']) main_loop = MainLoop( model=Model(cost), data_stream=ForceFloatX( DataStream(mnist_train, iteration_scheme=SequentialScheme( mnist_train.num_examples, batch_size))), algorithm=algorithm, extensions=[ Timing(), FinishAfter(after_n_epochs=epochs), DataStreamMonitoring( monitors, ForceFloatX( DataStream(mnist_test, iteration_scheme=SequentialScheme( mnist_test.num_examples, batch_size))), prefix="test"), TrainingDataMonitoring(train_monitors, prefix="train", after_every_epoch=True), SerializeMainLoop(name + ".pkl"), #Plot(name, channels=plot_channels), ProgressBar(), Printing() ]) main_loop.run()
extra_updates = []
if tbptt_flag:
    for name, var in states.items():
        update = T.switch(
            start_flag, 0. * var,
            VariableFilter(theano_name_regex=regex_final_value(name))(
                cg.auxiliary_variables)[0])
        extra_updates.append((var, update))

algorithm = GradientDescent(
    cost=reg_cost, parameters=parameters,
    step_rule=CompositeRule([StepClipping(10.), Adam(lr),
                             RemoveNotFinite()]))
algorithm.add_updates(extra_updates)

mean_data = x.mean(axis=(0, 1)).copy(name="data_mean")
sigma_data = x.std(axis=(0, 1)).copy(name="data_std")
max_data = x.max(axis=(0, 1)).copy(name="data_max")
min_data = x.min(axis=(0, 1)).copy(name="data_min")

variables = [lr, reg_cost, cost,
             mean_data, sigma_data, max_data, min_data] + monitoring_vars

train_monitor = TrainingDataMonitoring(
    variables=variables + [algorithm.total_step_norm,
                           algorithm.total_gradient_norm],
    every_n_batches=n_batches,
    prefix="train")
def main(config, tr_stream, dev_stream, use_bokeh=False): logger.info('Building RNN encoder-decoder') cost, samples, search_model = create_model(config) #cost, samples, search_model = create_multitask_model(config) logger.info("Building model") cg = ComputationGraph(cost) training_model = Model(cost) # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog logger.info('Applying dropout') dropout_inputs = [ x for x in cg.intermediary_variables if x.name == 'maxout_apply_output' ] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] # Add sampling if config['hook_samples'] >= 1: logger.info("Building sampler") extensions.append( Sampler(model=search_model, data_stream=tr_stream, src_vocab=config['src_vocab'], trg_vocab=config['trg_vocab'], phones_vocab=config['phones'], hook_samples=config['hook_samples'], every_n_batches=config['sampling_freq'], src_vocab_size=config['src_vocab_size'])) # Add early stopping based on f1 if config['f1_validation'] is not None: logger.info("Building f1 validator") extensions.append( F1Validator(samples=samples, config=config, model=search_model, data_stream=dev_stream, normalize=config['normalized_f1'], every_n_batches=config['f1_val_freq'])) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # Set up training algorithm logger.info("Initializing training algorithm") algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])(), RemoveNotFinite() ]), on_unused_sources='warn') # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions) # Train! main_loop.run()
features = T.matrix('features', dtype=theano.config.floatX)
cost = dpm.cost(features)
blocks_model = Model(cost)
cg_nodropout = ComputationGraph(cost)
if args.dropout_rate > 0:
    # DEBUG this triggers an error on my machine
    # apply dropout to all the input variables
    inputs = VariableFilter(roles=[INPUT])(cg_nodropout.variables)
    # dropconnect
    # inputs = VariableFilter(roles=[PARAMETER])(cg_nodropout.variables)
    cg = apply_dropout(cg_nodropout, inputs, args.dropout_rate)
else:
    cg = cg_nodropout
step_compute = RMSProp(learning_rate=args.lr, max_scaling=1e10)
algorithm = GradientDescent(step_rule=CompositeRule([RemoveNotFinite(),
                                                     step_compute]),
                            params=cg.parameters, cost=cost)
extension_list = []
extension_list.append(
    SharedVariableModifier(
        step_compute.learning_rate,
        extensions.decay_learning_rate,
        after_batch=False,
        every_n_epochs=1,
    ))
extension_list.append(FinishAfter(after_n_epochs=100001))

## set up logging
extension_list.extend([Timing(), Printing()])
model_dir = util.create_log_dir(args, dpm.name + '_' + args.dataset)
def train(self, cost, y_hat, train_stream, accuracy=None, prediction_cost=None, regularization_cost=None, params_to_optimize=None, valid_stream=None, extra_extensions=None, model=None, vars_to_monitor_on_train=None, vars_to_monitor_on_valid=None, step_rule=None, additional_streams=None, save_on_best=None, use_own_validation=False, objects_to_dump=None): """ Generic method for training models. It extends functionality already provided by Blocks. :param cost: Theano var with cost function :param y_hat: Theano var with predictions from the model :param train_stream: Fuel stream with training data :param accuracy: Theano var with accuracy :param prediction_cost: :param regularization_cost: :param params_to_optimize: :param valid_stream: Fuel stream with validation data :param extra_extensions: :param model: :param vars_to_monitor_on_train: :param vars_to_monitor_on_valid: :param step_rule: :param additional_streams: :param save_on_best: :param use_own_validation: :param objects_to_dump: :return: """ if not vars_to_monitor_on_valid: vars_to_monitor_on_valid = [(cost, min)] if accuracy: vars_to_monitor_on_valid.append((accuracy, max)) if not save_on_best: # use default metrics for saving the best model save_on_best = [(cost, min)] if accuracy: save_on_best.append((accuracy, max)) # setup the training algorithm ####################################### # step_rule = Scale(learning_rate=0.01) # step_rule = Adam() model_save_suffix = "" if self.args.append_metaparams: model_save_suffix = "." + get_current_metaparams_str( self.parser, self.args) # get a list of variables that will be monitored during training vars_to_monitor = [cost] if accuracy: vars_to_monitor.append(accuracy) if prediction_cost: vars_to_monitor.append(prediction_cost) if regularization_cost: vars_to_monitor.append(regularization_cost) theano_vars_to_monitor = [ var for var, comparator in vars_to_monitor_on_valid ] if not params_to_optimize: # use all parameters of the model for optimization cg = ComputationGraph(cost) params_to_optimize = cg.parameters self.print_parameters_info(params_to_optimize) if not model: if accuracy: model = MultiOutputModel([cost, accuracy, y_hat] + theano_vars_to_monitor) else: model = MultiOutputModel([cost, y_hat] + theano_vars_to_monitor) if not step_rule: step_rule = AdaDelta() # learning_rate=0.02, momentum=0.9) step_rules = [ StepClipping(self.args.gradient_clip), step_rule, RemoveNotFinite() ] # optionally add gradient noise if self.args.gradient_noise: step_rules = [ GradientNoise(self.args.gradient_noise, self.args.gn_decay) ] + step_rules algorithm = GradientDescent(cost=cost, parameters=params_to_optimize, step_rule=CompositeRule(step_rules), on_unused_sources="warn") # this variable aggregates all extensions executed periodically during training extensions = [] if self.args.epochs_max: # finis training after fixed number of epochs extensions.append(FinishAfter(after_n_epochs=self.args.epochs_max)) # training data monitoring def create_training_data_monitoring(): if "every_n_epochs" in self.args.evaluate_every_n: return TrainingDataMonitoring(vars_to_monitor, prefix='train', after_epoch=True) else: return TrainingDataMonitoring(vars_to_monitor, prefix='train', after_epoch=True, **self.args.evaluate_every_n) # add extensions that monitors progress of training on train set extensions.extend([create_training_data_monitoring()]) if not self.args.disable_progress_bar: extensions.append(ProgressBar()) def add_data_stream_monitor(data_stream, prefix): if not use_own_validation: extensions.append( 
DataStreamMonitoring(variables=theano_vars_to_monitor, data_stream=data_stream, prefix=prefix, before_epoch=False, **self.args.evaluate_every_n)) # additional streams that should be monitored if additional_streams: for stream_name, stream in additional_streams: add_data_stream_monitor(stream, stream_name) # extra extensions need to be called before Printing extension if extra_extensions: extensions.extend(extra_extensions) if valid_stream: # add validation set monitoring add_data_stream_monitor(valid_stream, 'valid') # add best val monitoring for var, comparator in vars_to_monitor_on_valid: extensions.append( TrackTheBest("valid_" + var.name, choose_best=comparator, **self.args.evaluate_every_n)) if self.args.patience_metric == 'cost': patience_metric_name = cost.name elif self.args.patience_metric == 'accuracy': patience_metric_name = accuracy.name else: print "WARNING: Falling back to COST function for patience." patience_metric_name = cost.name extensions.append( # "valid_cost_best_so_far" message will be entered to the main loop log by TrackTheBest extension FinishIfNoImprovementAfter( "valid_" + patience_metric_name + "_best_so_far", epochs=self.args.epochs_patience_valid)) if not self.args.do_not_save: # use user provided metrics for saving valid_save_extensions = map( lambda metric_comparator: SaveTheBest( "valid_" + metric_comparator[0].name, self.args.save_path + ".best." + metric_comparator[ 0].name + model_save_suffix, choose_best=metric_comparator[1], **self.args.evaluate_every_n), save_on_best) extensions.extend(valid_save_extensions) extensions.extend([ Timing(**self.args.evaluate_every_n), Printing(after_epoch=False, **self.args.evaluate_every_n), ]) if not self.args.do_not_save or self.args.save_only_best: extensions.append( Checkpoint(self.args.save_path + model_save_suffix, **self.args.save_every_n)) extensions.append(FlushStreams(**self.args.evaluate_every_n)) # main loop ########################################################## main_loop = MainLoop(data_stream=train_stream, model=model, algorithm=algorithm, extensions=extensions) sys.setrecursionlimit(1000000) main_loop.run()
def train_rnnrbm(train, rnnrbm, epochs=1000, test=None, bokeh=True,
                 load_path=None):
    cdk = theano.shared(10)
    lr = theano.shared(float32(0.004))

    cost, v_sample = rnnrbm.cost(examples=x, mask=x_mask, k=cdk)
    error_rate = MismulitclassificationRate().apply(x, v_sample[-1], x_mask)
    error_rate.name = "error on note as a whole"
    mistake_rate = MismulitmistakeRate().apply(x, v_sample[-1], x_mask)
    mistake_rate.name = "single error within note"
    cost.name = 'rbm_cost'

    model = Model(cost)
    cg = ComputationGraph([cost])
    step_rule = CompositeRule(
        [RemoveNotFinite(), StepClipping(30.0), Adam(learning_rate=lr),
         StepClipping(6.0), RemoveNotFinite()])  # Scale(0.01)
    gradients = dict(equizip(cg.parameters,
                             T.grad(cost, cg.parameters,
                                    consider_constant=[v_sample])))
    algorithm = GradientDescent(step_rule=step_rule, gradients=gradients,
                                cost=cost, params=cg.parameters)
    algorithm.add_updates(cg.updates)
    extensions = [
        SharedVariableModifier(parameter=cdk,
                               function=lambda n, v: rnnrbm_cdk[n]
                               if rnnrbm_cdk.get(n) else v),
        SharedVariableModifier(parameter=lr,
                               function=lambda n, v: float32(0.78 * v)
                               if n % (200 * 5) == 0 else v),
        FinishAfter(after_n_epochs=epochs),
        TrainingDataMonitoring(
            [cost, error_rate, mistake_rate],
            # hidden_states, debug_val, param_nans,
            # aggregation.mean(algorithm.total_gradient_norm)], #+ params,
            prefix="train",
            after_epoch=False,
            every_n_batches=40),
        Timing(),
        Printing(),
        ProgressBar()]
    if test is not None:
        extensions.append(
            DataStreamMonitoring([cost, error_rate, mistake_rate],
                                 data_stream=test,
                                 updates=cg.updates,
                                 prefix="test",
                                 after_epoch=False,
                                 every_n_batches=40))
    if bokeh:
        extensions.append(
            Plot('Training RNN-RBM',
                 channels=[['train_error on note as a whole',
                            'train_single error within note',
                            'test_error on note as a whole',
                            'test_single error within note'],
                           ['train_final_cost'],
                           # ['train_total_gradient_norm'],
                           ]))
    main_loop = MainLoop(algorithm=algorithm,
                         data_stream=train,
                         model=model,
                         extensions=extensions)
    return main_loop
parameters_size += reduce(operator.mul, value.get_value().shape, 1)
logger.info('Total number of parameters: %d in %d matrices' %
            (parameters_size, len(cg.parameters)))

if hasattr(config, 'step_rule'):
    step_rule = config.step_rule
else:
    step_rule = AdaDelta()

logger.info("Fuel seed: %d" % fuel.config.default_seed)
logger.info("Blocks seed: %d" % blocks.config.default_seed)

params = cg.parameters
algorithm = GradientDescent(
    cost=cost,
    step_rule=CompositeRule([RemoveNotFinite(), step_rule]),
    parameters=params)

plot_vars = [['valid_' + x.name for x in valid_monitored] +
             ['train_' + x.name for x in valid_monitored]]
logger.info('Plotted variables: %s' % str(plot_vars))

dump_path = os.path.join('model_data', model_name) + '.pkl'
logger.info('Dump path: %s' % dump_path)

if hasattr(config, 'monitor_freq'):
    monitor_freq = config.monitor_freq
else:
    monitor_freq = 10000
def pretrain_rnn(train, rnnrbm, test=None, epochs=1000, bokeh=True):
    lr = theano.shared(float32(0.1))

    probs, _, _, _ = rnnrbm.rnn_pretrain_pred(x, x_mask)
    cost = NegativeLogLikelihood().apply(y, probs, y_mask)

    error_rate = MismulitclassificationRate().apply(y, probs, y_mask)
    error_rate.name = "error on note as a whole"
    mistake_rate = MismulitmistakeRate().apply(y, probs, y_mask)
    mistake_rate.name = "single error within note"
    cost.name = 'final_cost'

    model = Model(cost)
    cg = ComputationGraph([cost])
    step_rule = CompositeRule(
        [RemoveNotFinite(), StepClipping(30.0), Adam(learning_rate=lr),
         StepClipping(6.0), RemoveNotFinite()])
    algorithm = GradientDescent(step_rule=step_rule, cost=cost,
                                params=cg.parameters)
    extensions = [
        SharedVariableModifier(parameter=lr,
                               function=lambda n, v: float32(0.7 * v)
                               if n % 700 == 0 else v),
        FinishAfter(after_n_epochs=epochs),
        TrainingDataMonitoring(
            [cost, error_rate, mistake_rate],
            # hidden_states, debug_val, param_nans,
            # aggregation.mean(algorithm.total_gradient_norm)], #+ params,
            prefix="train",
            after_epoch=False,
            every_n_batches=40),
        Timing(),
        Printing(),
        ProgressBar()]
    if test is not None:
        extensions.append(
            DataStreamMonitoring([cost, error_rate, mistake_rate],
                                 data_stream=test,
                                 updates=cg.updates,
                                 prefix="test",
                                 after_epoch=False,
                                 every_n_batches=40))
    if bokeh:
        extensions.append(
            Plot('Pretrain RNN',
                 channels=[['train_error on note as a whole',
                            'train_single error within note',
                            'test_error on note as a whole',
                            'test_single error within note'],
                           ['train_rbm_cost'],
                           # ['train_total_gradient_norm'],
                           ]))
    main_loop = MainLoop(algorithm=algorithm,
                         data_stream=train,
                         model=model,
                         extensions=extensions)
    return main_loop
parameters_size += reduce(operator.mul, value.get_value().shape, 1)
logger.info('Total number of parameters: %d in %d matrices' %
            (parameters_size, len(cg.parameters)))

if hasattr(config, 'step_rule'):
    step_rule = config.step_rule
else:
    step_rule = AdaDelta()

logger.info("Fuel seed: %d" % fuel.config.default_seed)
logger.info("Blocks seed: %d" % blocks.config.default_seed)

params = cg.parameters
algorithm = GradientDescent(
    cost=cost,
    step_rule=CompositeRule([RemoveNotFinite(), step_rule]),
    parameters=params)

plot_vars = [['valid_' + x.name for x in valid_monitored] +
             ['train_' + x.name for x in valid_monitored]]
logger.info('Plotted variables: %s' % str(plot_vars))

dump_path = os.path.join('model_data', model_name) + '.pkl'
logger.info('Dump path: %s' % dump_path)

extensions = [
    TrainingDataMonitoring(monitored, prefix='train', every_n_batches=1000),
    DataStreamMonitoring(valid_monitored, valid_stream, prefix='valid',
## set up optimization
features = T.matrix('features', dtype=theano.config.floatX)
cost = dpm.cost(features)
blocks_model = blocks.model.Model(cost)
cg_nodropout = ComputationGraph(cost)
if args.dropout_rate > 0:
    # DEBUG this triggers an error on my machine
    # apply dropout to all the input variables
    inputs = VariableFilter(roles=[INPUT])(cg_nodropout.variables)
    # dropconnect
    # inputs = VariableFilter(roles=[PARAMETER])(cg_nodropout.variables)
    cg = apply_dropout(cg_nodropout, inputs, args.dropout_rate)
else:
    cg = cg_nodropout
step_compute = RMSProp(learning_rate=args.lr, max_scaling=1e10)
algorithm = GradientDescent(step_rule=CompositeRule([RemoveNotFinite(),
                                                     step_compute]),
                            parameters=cg.parameters, cost=cost)
extension_list = []
extension_list.append(
    SharedVariableModifier(step_compute.learning_rate,
                           extensions.decay_learning_rate,
                           after_batch=False,
                           every_n_batches=batches_per_epoch,
                           ))
extension_list.append(FinishAfter(after_n_epochs=100001))

## logging of test set performance
extension_list.append(
    extensions.LogLikelihood(dpm, test_stream, scl,
                             every_n_batches=args.ext_every_n * batches_per_epoch,
                             before_training=False))

## set up logging
def test_remove_not_finite_broadcastable():
    verify_broadcastable_handling(RemoveNotFinite())
    verify_broadcastable_handling(RemoveNotFinite(0.1))
def train_net(net, train_stream, test_stream, L1=None, L2=None,
              early_stopping=False, finish=None, dropout=False, jobid=None,
              update=None, duration=None, **ignored):
    x = tensor.tensor4('image_features')
    y = tensor.lmatrix('targets')
    y_hat = net.apply(x)

    # Cost
    cost_before = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    cost_before.name = "cost_without_regularization"

    # Error
    # Taken from brodesf
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = "Misclassification rate"

    # Regularization
    cg = ComputationGraph(cost_before)
    WS = VariableFilter(roles=[WEIGHT])(cg.variables)

    if dropout:
        print("Dropout")
        cg = apply_dropout(cg, WS, 0.5)

    if L1:
        print("L1 with lambda ", L1)
        L1_reg = L1 * sum([abs(W).sum() for W in WS])
        L1_reg.name = "L1 regularization"
        cost_before += L1_reg

    if L2:
        print("L2 with lambda ", L2)
        L2_reg = L2 * sum([(W ** 2).sum() for W in WS])
        L2_reg.name = "L2 regularization"
        cost_before += L2_reg

    cost = cost_before
    cost.name = 'cost_with_regularization'

    # Initialization
    print("Initialization")
    net.initialize()

    # Algorithm
    step_rule = Scale(learning_rate=0.1)
    if update is not None:
        if update == "rmsprop":
            print("Using RMSProp")
            step_rule = RMSProp()
    remove_not_finite = RemoveNotFinite(0.9)
    step_rule = CompositeRule([step_rule, remove_not_finite])

    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=step_rule)

    print("Extensions")
    extensions = []

    # Monitoring
    monitor = DataStreamMonitoring(variables=[cost, error],
                                   data_stream=test_stream,
                                   prefix="test")
    extensions.append(monitor)

    def filename(suffix=""):
        prefix = jobid if jobid else str(os.getpid())
        ctime = str(time.time())
        return "checkpoints/" + prefix + "_" + ctime + "_" + suffix + ".zip"

    # Serialization
    # serialization = Checkpoint(filename())
    # extensions.append(serialization)

    notification = "test_" + error.name
    track = TrackTheBest(notification)
    best_notification = track.notification_name
    checkpointbest = SaveBest(best_notification, filename("best"))
    extensions.extend([track, checkpointbest])

    if early_stopping:
        print("Early stopping")
        stopper = FinishIfNoImprovementAfterPlus(best_notification)
        extensions.append(stopper)

    # Other extensions
    if finish is not None:
        print("Force finish ", finish)
        extensions.append(FinishAfter(after_n_epochs=finish))
    if duration is not None:
        print("Stop after ", duration, " seconds")
        extensions.append(FinishAfterTime(duration))

    extensions.extend([Timing(), Printing()])

    # Main loop
    main_loop = MainLoop(data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)

    print("Main loop start")
    main_loop.run()