def initialize_graph(recognizer, data, config, params):
    """Build the cost graphs, regularization and monitoring variables.

    Parameters
    ----------
    recognizer
        Top-level brick. This function uses its ``all_children()``
        selector, its ``children`` and ``get_cost_graph(batch=True)``.
        Each child is expected to expose ``cost``, ``generator``,
        ``names_postfix``, ``train_tags``, ``addTagCost`` and
        ``post_merge_activation``.
    data
        Data object; only ``data.get_dataset('train').num_examples`` is
        read here (for adaptive noise). It is also passed through in the
        result.
    config
        Mapping with a ``'regularization'`` entry. The keys read from that
        entry are ``dropout`` (with ``bottom``, ``encoder``,
        ``post_merge``), ``noise``, ``penalty_coof``, ``decay``,
        ``adaptive_noise``, ``max_norm`` and ``max_norm_exclude_lookup``.
    params
        If truthy, passed to ``load_parameter_values`` and the loaded
        values are set on the model.

    Returns
    -------
    dict
        Keys: ``observables``, ``max_norm_rules``, ``cg``,
        ``regularized_cg``, ``train_cost``, ``cost``, ``batch_size``,
        ``batch_cost``, ``parameters``, ``gradients``, ``model``,
        ``data``, ``recognizer``, ``weights_entropy``, ``labels_mask``
        and ``labels``.
    """
    # Separate attention_params to be handled differently
    # when regularization is applied.
    attentions = recognizer.all_children().generator.transition.attention.get()
    # Flatten into a single list of parameters: a list of per-attention
    # collections would never match in the `p not in attention_params`
    # test below, so attention parameters would not be excluded from noise.
    attention_params = [
        param
        for attention in attentions
        for param in Selector(attention).get_parameters().values()]

    logger.info(
        "Initialization schemes for all bricks.\n"
        "Works well only in my branch with __repr__ added to all them,\n"
        "there is an issue #463 in Blocks to do that properly.")

    def show_init_scheme(cur):
        """Recursively collect the ``*_init`` attributes of a brick tree."""
        result = {}
        for attr in dir(cur):
            if attr.endswith('_init'):
                result[attr] = getattr(cur, attr)
        for child in cur.children:
            result[child.name] = show_init_scheme(child)
        return result
    logger.info(pprint.pformat(show_init_scheme(recognizer)))

    observables = []  # monitored each batch
    cg = recognizer.get_cost_graph(batch=True)
    labels = []
    labels_mask = []
    for chld in recognizer.children:
        lbls = VariableFilter(
            applications=[chld.cost],
            name='labels' + chld.names_postfix)(cg)
        lbls_mask = VariableFilter(
            applications=[chld.cost],
            name='labels_mask' + chld.names_postfix)(cg)
        # Only children with exactly one labels variable contribute.
        if len(lbls) == 1:
            labels += lbls
            labels_mask += lbls_mask

    batch_cost = cg.outputs[0].sum()
    batch_size = rename(labels[0].shape[1], "batch_size")
    # Assumes constant batch size. `aggregation.mean` is not used because
    # of Blocks #514.
    cost = batch_cost / batch_size
    cost.name = "sequence_total_cost"
    logger.info("Cost graph is built")

    # Fetch variables useful for debugging.
    # It is important not to use any aggregation schemes here,
    # as it's currently impossible to spread the effect of
    # regularization on their variables, see Blocks #514.
    cost_cg = ComputationGraph(cost)
    bottom_output = VariableFilter(
        # We need name_regex instead of name because LookupTable calls
        # its output output_0.
        applications=recognizer.all_children().bottom.apply.get(),
        name_regex="output")(cost_cg)
    attended = VariableFilter(
        applications=recognizer.all_children().generator.transition.apply.get(),
        name="attended")(cost_cg)
    attended_mask = VariableFilter(
        applications=recognizer.all_children().generator.transition.apply.get(),
        name="attended_mask")(cost_cg)
    weights = VariableFilter(
        applications=recognizer.all_children().generator.evaluate.get(),
        name="weights")(cost_cg)

    def get_renamed_list(rlist, elem_func, elem_name):
        """Apply ``elem_func`` per element and name it after its child."""
        return [rename(elem_func(elem), elem_name + chld.names_postfix)
                for elem, chld in zip(rlist, recognizer.children)]

    max_sentence_lengths = get_renamed_list(
        bottom_output, lambda e: e.shape[0], "max_sentence_length")
    max_attended_mask_lengths = get_renamed_list(
        attended_mask, lambda e: e.shape[0], "max_attended_mask_length")
    max_attended_lengths = get_renamed_list(
        attended, lambda e: e.shape[0], "max_attended_length")
    max_num_characters = get_renamed_list(
        labels, lambda e: e.shape[0], "max_num_characters")

    mean_attended = get_renamed_list(
        attended, lambda e: abs(e).mean(), "mean_attended")
    mean_bottom_output = get_renamed_list(
        bottom_output, lambda e: abs(e).mean(), "mean_bottom_output")

    mask_density = get_renamed_list(
        labels_mask, lambda e: e.mean(), "mask_density")
    weights_entropy = [
        rename(entropy(w, lm), "weights_entropy" + chld.names_postfix)
        for w, lm, chld in zip(weights, labels_mask, recognizer.children)]

    observables += (max_attended_lengths + max_attended_mask_lengths +
                    max_sentence_lengths)
    #
    # Monitoring of cost terms is tricky because of Blocks #514 - since the
    # costs are annotations that are not part of the original output graph,
    # they are unaffected by replacements such as dropout!!
    #
    cost_terms = []
    for chld in recognizer.children:
        chld_cost_terms = VariableFilter(
            applications=[chld.generator.evaluate],
            name_regex='.*_nll')(cost_cg)
        # Insert the child's postfix before the trailing '_nll'.
        chld_cost_terms = [
            rename(var, var.name[:-4] + chld.names_postfix + '_nll')
            for var in chld_cost_terms]
        cost_terms += chld_cost_terms

    cg = ComputationGraph(
        [cost, batch_size] +
        weights_entropy + mean_attended +
        mean_bottom_output + max_num_characters +
        mask_density + cost_terms)

    # Regularization. It is applied explicitly to all variables
    # of interest, it could not be applied to the cost only as it
    # would not have effect on auxiliary variables, see Blocks #514.
    reg_config = config['regularization']
    regularized_cg = cg

    if reg_config.get('dropout'):
        drop_conf = reg_config['dropout']

        bot_drop = drop_conf.get('bottom', 0.0)
        if bot_drop:
            logger.info('apply bottom dropout')
            regularized_cg = apply_dropout(
                regularized_cg, bottom_output, bot_drop)

        enc_drop = drop_conf.get('encoder', 0.0)
        if enc_drop:
            logger.info('apply encoder dropout')
            # Flatten the children of every encoder into one list.
            enc_bricks = [
                brick
                for children in
                recognizer.all_children().encoder.children.get()
                for brick in children]
            enc_states = VariableFilter(
                bricks=enc_bricks, name_regex='states')(regularized_cg)
            regularized_cg = apply_dropout(
                regularized_cg, enc_states, enc_drop)

        post_merge_drop = drop_conf.get('post_merge', 0.0)
        if post_merge_drop:
            logger.info('apply post_merge dropout')
            pm_bricks = []
            for chld in recognizer.children:
                cpm_bricks = list(chld.generator.readout.post_merge.children)
                cpm_bricks += cpm_bricks[-1].children
                # Keep only bricks of the post-merge activation type.
                cpm_bricks = [
                    b for b in cpm_bricks
                    if isinstance(b, type(chld.post_merge_activation))]
                pm_bricks += cpm_bricks
            regularized_cg = apply_dropout(
                regularized_cg,
                VariableFilter(bricks=pm_bricks,
                               name='output')(regularized_cg),
                post_merge_drop)

    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [p for p in cg.parameters
                          if p not in attention_params]
        # Apply noise on top of `regularized_cg` (not `cg`) so that any
        # dropout configured above is kept instead of silently discarded.
        regularized_cg = apply_noise(
            regularized_cg, noise_subjects, reg_config['noise'])

    train_cost = regularized_cg.outputs[0]
    if reg_config.get("penalty_coof", .0) > 0:
        # big warning!!!
        # here we assume that:
        # regularized_weights_penalty = regularized_cg.outputs[1]
        # NOTE(review): `cg` is built above as [cost, batch_size] + ...,
        # so outputs[1] is `batch_size`, not a weights penalty. This looks
        # like a leftover from an earlier output layout - confirm before
        # relying on `penalty_coof`.
        train_cost = (train_cost +
                      reg_config.get("penalty_coof", .0) *
                      regularized_cg.outputs[1] / batch_size)
    if reg_config.get("decay", .0) > 0:
        train_cost = (
            train_cost + reg_config.get("decay", .0) *
            l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2)

    train_cost = train_cost.copy(name='train_cost')

    gradients = None
    if reg_config.get('adaptive_noise'):
        logger.info('apply adaptive noise')
        if ((reg_config.get("penalty_coof", .0) > 0) or
                (reg_config.get("decay", .0) > 0)):
            logger.error('using adaptive noise with alignment weight panalty '
                         'or weight decay is probably stupid')
        # Adaptive noise starts from the unregularized `cg` and replaces
        # `train_cost` and `regularized_cg` computed above.
        train_cost, regularized_cg, gradients, noise_brick = \
            apply_adaptive_noise(
                cg, cg.outputs[0],
                variables=cg.parameters,
                num_examples=data.get_dataset('train').num_examples,
                parameters=SpeechModel(
                    regularized_cg.outputs[0]
                ).get_parameter_dict().values(),
                **reg_config.get('adaptive_noise')
            )
        train_cost.name = 'train_cost'
        adapt_noise_cg = ComputationGraph(train_cost)
        model_prior_mean = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_mean')(adapt_noise_cg)[0],
            'model_prior_mean')
        model_cost = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_cost')(adapt_noise_cg)[0],
            'model_cost')
        model_prior_variance = rename(
            VariableFilter(applications=[noise_brick.apply],
                           name='model_prior_variance')(adapt_noise_cg)[0],
            'model_prior_variance')
        regularized_cg = ComputationGraph(
            [train_cost, model_cost] +
            regularized_cg.outputs +
            [model_prior_mean, model_prior_variance])
        observables += [
            regularized_cg.outputs[1],   # model cost
            regularized_cg.outputs[2],   # task cost
            regularized_cg.outputs[-2],  # model prior mean
            regularized_cg.outputs[-1]]  # model prior variance

    if cost_terms:
        # Please note - the aggregation (mean) is done in
        # "attach_aggregation_schemes"
        ct_names = {v.name for v in cost_terms}
        for v in regularized_cg.outputs:
            if v.name in ct_names:
                observables.append(rename(v.sum() / batch_size, v.name))
    for chld in recognizer.children:
        if chld.train_tags:
            tags_cost = VariableFilter(
                applications=[chld.addTagCost],
                name='output')(regularized_cg)[0]
            observables += [rename(tags_cost.sum() / batch_size,
                                   'tags_nll' + chld.names_postfix)]

    # Model is a weird class, we spent lots of time arguing with Bart
    # what it should be. However it can already do nice things, e.g.
    # one can extract all the parameters from the computation graphs
    # and give them hierarchical names. This helps to notice when,
    # because of some bug, a parameter is not in the computation
    # graph.
    model = SpeechModel(train_cost)
    if params:
        logger.info("Load parameters from %s", params)
        # please note: we cannot use recognizer.load_params
        # as it builds a new computation graph that does not have
        # shared variables added by adaptive weight noise
        param_values = load_parameter_values(params)
        model.set_parameter_values(param_values)

    parameters = model.get_parameter_dict()

    logger.info(
        "Parameters:\n%s",
        pprint.pformat(
            [(key, parameters[key].get_value().shape)
             for key in sorted(parameters)],
            width=120))

    max_norm_rules = []
    if reg_config.get('max_norm', 0) > 0:
        logger.info("Apply MaxNorm")
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters)
        if reg_config.get('max_norm_exclude_lookup', False):
            maxnorm_subjects = [
                v for v in maxnorm_subjects
                if not isinstance(get_brick(v), LookupTable)]
        logger.info(
            "Parameters covered by MaxNorm:\n%s",
            pprint.pformat([name for name, p in parameters.items()
                            if p in maxnorm_subjects]))
        logger.info(
            "Parameters NOT covered by MaxNorm:\n%s",
            pprint.pformat([name for name, p in parameters.items()
                            if p not in maxnorm_subjects]))
        max_norm_rules = [
            Restrict(VariableClipping(reg_config['max_norm'], axis=0),
                     maxnorm_subjects)]

    return {
        'observables': observables,
        'max_norm_rules': max_norm_rules,
        'cg': cg,
        'regularized_cg': regularized_cg,
        'train_cost': train_cost,
        'cost': cost,
        'batch_size': batch_size,
        'batch_cost': batch_cost,
        'parameters': parameters,
        'gradients': gradients,
        'model': model,
        'data': data,
        'recognizer': recognizer,
        'weights_entropy': weights_entropy,
        'labels_mask': labels_mask,
        'labels': labels,
    }
print lord zaza = prob_function([lord], numpy.ones((1, len(lord)), dtype="int8"))[:, 0, :] print zaza print zaza.shape for (ey, row) in enumerate(zaza): print "PREDICTION PROBABILITIES FOR POSITION", ey, "LETTER", repr(lord_original[ey]) sorted_thing = [(prob, ind) for (ind, prob) in enumerate(row)] sorted_thing.sort(reverse=True) for (prob, ind) in sorted_thing: print repr(map_ind_2_chr[ind]), ":", prob print "\n" """ # define a function that gets the overall "sum of scores" at a given time step readouts = VariableFilter(theano_name="readout_readout_output_0")(lstm_net.cost_model.variables)[0] score_function = function([lstm_net.x, lstm_net.mask], readouts.sum(axis=2)) # this section of the playground has some fun rides that revolve around various correlation stuff. uncomment to access # =) sc = StateComputer(lstm_net.cost_model, map_chr_2_ind) # storage for the correlations at the very end correlation_dict = dict() for name in sc.state_var_names: correlation_dict[name] = numpy.zeros(lstm_net.hidden_dims[0], dtype=float) # get validation data to run over valid_data = H5PYDataset("bible.hdf5", which_sets=("valid",), load_in_memory=True) data_stream = PadAndAddMasks( DataStream.default_stream(dataset=valid_data, iteration_scheme=SequentialScheme(valid_data.num_examples, batch_size=128)), produces_examples=False)