def __init__(self, outputs): super(Model, self).__init__(outputs) if len(self.outputs) > 1: logger.warning("model with multiple output " + multiple_message) bricks = [ get_brick(var) for var in self.variables + self.scan_variables if get_brick(var) ] children = set(chain(*(brick.children for brick in bricks))) # Quadratic complexity: we should not have thousands of # top-level bricks. self.top_bricks = [] for brick in bricks: if brick not in children and brick not in self.top_bricks: self.top_bricks.append(brick) if len(set(b.name for b in self.top_bricks)) < len(self.top_bricks): raise ValueError("top bricks with the same name") brick_param_names = { v: k for k, v in Selector(self.top_bricks).get_params().items() } self.params = [] for param in VariableFilter(roles=[PARAMETER])(self.shared_variables): if param in brick_param_names: self.params.append((brick_param_names[param], param)) else: self.params.append((param.name, param)) self.params = OrderedDict(self.params)
def __init__(self, *args, **kwargs): super(Model, self).__init__(*args, **kwargs) bricks = [get_brick(var) for var in self.variables + self.scan_variables if get_brick(var)] children = set(chain(*(brick.children for brick in bricks))) # Quadratic complexity: we should not have thousands of # top-level bricks. self.top_bricks = [] for brick in bricks: if brick not in children and brick not in self.top_bricks: self.top_bricks.append(brick) names = Counter([brick.name for brick in self.top_bricks]) repeated_names = [name for name, count in names.items() if count > 1] if repeated_names: raise ValueError("top bricks with the same name:" " {}".format(', '.join(repeated_names))) brick_parameter_names = { v: k for k, v in Selector( self.top_bricks).get_parameters().items()} parameter_list = [] for parameter in self.parameters: if parameter in brick_parameter_names: parameter_list.append((brick_parameter_names[parameter], parameter)) else: parameter_list.append((parameter.name, parameter)) self._parameter_dict = OrderedDict(parameter_list)
def __init__(self, *args, **kwargs): super(Model, self).__init__(*args, **kwargs) bricks = [get_brick(var) for var in self.variables + self.scan_variables if get_brick(var)] children = set(chain(*(brick.children for brick in bricks))) # Quadratic complexity: we should not have thousands of # top-level bricks. self.top_bricks = [] for brick in bricks: if brick not in children and brick not in self.top_bricks: self.top_bricks.append(brick) names = Counter([brick.name for brick in self.top_bricks]) repeated_names = [name for name, count in names.items() if count > 1] if repeated_names: raise ValueError("top bricks with the same name:" " {}".format(', '.join(repeated_names))) parameter_list = [] for parameter in self.parameters: if get_brick(parameter): parameter_list.append( (get_brick(parameter).get_hierarchical_name(parameter), parameter)) else: parameter_list.append((parameter.name, parameter)) self._parameter_dict = OrderedDict(parameter_list)
def get_bricks_children(self, cg):
    bricks = [get_brick(var) for var in cg.variables + cg.scan_variables
              if get_brick(var)]
    children = set(chain(*(brick.children for brick in bricks)))
    return bricks, children
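# A toy sketch (pure Python, not Blocks code; FakeBrick is an illustrative
# stand-in) of the selection rule used throughout this file: a brick is
# "top-level" when it does not appear among the children of any other
# annotated brick, and duplicates are dropped while preserving order.
from itertools import chain

class FakeBrick(object):
    def __init__(self, name, children=()):
        self.name, self.children = name, list(children)

linear = FakeBrick('linear')
mlp = FakeBrick('mlp', children=[linear])
bricks = [mlp, linear, mlp]  # one entry per annotated variable, with repeats
children = set(chain(*(brick.children for brick in bricks)))
top_bricks = []
for brick in bricks:
    if brick not in children and brick not in top_bricks:
        top_bricks.append(brick)
assert top_bricks == [mlp]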
def __init__(self, outputs): super(Model, self).__init__(outputs) if len(self.outputs) > 1: logger.warning("model with multiple output " + multiple_message) bricks = [ get_brick(var) for var in self.variables + self.scan_variables if get_brick(var) ] children = set(chain(*(brick.children for brick in bricks))) # Quadratic complexity: we should not have thousands of # top-level bricks. self.top_bricks = [] for brick in bricks: if brick not in children and brick not in self.top_bricks: self.top_bricks.append(brick) names = Counter([brick.name for brick in self.top_bricks]) repeated_names = [name for name, count in names.items() if count > 1] if repeated_names: raise ValueError("top bricks with the same name:" " {}".format(', '.join(repeated_names))) brick_parameter_names = { v: k for k, v in Selector(self.top_bricks).get_parameters().items() } parameter_list = [] for parameter in self.parameters: if parameter in brick_parameter_names: parameter_list.append( (brick_parameter_names[parameter], parameter)) else: parameter_list.append((parameter.name, parameter)) self._parameter_dict = OrderedDict(parameter_list)
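# Pure-Python sketch of the duplicate-name check used in the constructor
# above: Counter makes the offending top-brick names easy to report.
from collections import Counter

names = Counter(['mlp', 'encoder', 'mlp'])
repeated_names = [name for name, count in names.items() if count > 1]
assert repeated_names == ['mlp']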
def __call__(self, parameter):
    # Standard Blocks parameter
    if get_brick(parameter) is not None:
        name = '{}.{}'.format(
            BRICK_DELIMITER.join(
                [""] + [brick.name for brick in
                        get_brick(parameter).get_unique_path()]),
            parameter.name)
    # Shared variables with tag.name
    elif hasattr(parameter.tag, 'name'):
        name = parameter.tag.name
    # Standard shared variable
    elif parameter.name is not None:
        name = parameter.name
    # Variables without names
    else:
        name = self.default_name
    # Handle naming collisions
    if name in self.used_names:
        i = 2
        new_name = '_'.join([name, str(i)])
        while new_name in self.used_names:
            i += 1
            new_name = '_'.join([name, str(i)])
        name = new_name
    self.used_names.add(name)
    return name
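# Pure-Python sketch of the collision handling above, factored into a helper.
# The helper name `make_unique` is illustrative and not part of the original
# code.
def make_unique(name, used_names):
    if name in used_names:
        i = 2
        while '_'.join([name, str(i)]) in used_names:
            i += 1
        name = '_'.join([name, str(i)])
    used_names.add(name)
    return name

used = set()
assert make_unique('W', used) == 'W'
assert make_unique('W', used) == 'W_2'
assert make_unique('W', used) == 'W_3'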
def create_act_table(self, save_to, act_table): batch_size = 500 image_size = (28, 28) output_size = 10 convnet = create_lenet_5() layers = convnet.layers x = tensor.tensor4('features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet probs = convnet.apply(x) cg = ComputationGraph([probs]) def full_brick_name(brick): return '/'.join([''] + [b.name for b in brick.get_unique_path()]) # Find layer outputs to probe outmap = OrderedDict((full_brick_name(get_brick(out)), out) for out in VariableFilter( roles=[OUTPUT], bricks=[Convolutional, Linear])( cg.variables)) # Generate pics for biases biases = VariableFilter(roles=[BIAS])(cg.parameters) # Generate parallel array, in the same order, for outputs outs = [outmap[full_brick_name(get_brick(b))] for b in biases] # Figure work count error_rate = (MisclassificationRate().apply(y.flatten(), probs) .copy(name='error_rate')) max_activation_table = (MaxActivationTable().apply( outs).copy(name='max_activation_table')) max_activation_table.tag.aggregation_scheme = ( Concatenate(max_activation_table)) model = Model([ error_rate, max_activation_table]) # Load it with trained parameters params = load_parameters(open(save_to, 'rb')) model.set_parameter_values(params) mnist_test_stream = DataStream.default_stream( self.mnist_test, iteration_scheme=SequentialScheme( self.mnist_test.num_examples, batch_size)) evaluator = DatasetEvaluator([ error_rate, max_activation_table ]) results = evaluator.evaluate(mnist_test_stream) table = results['max_activation_table'] pickle.dump(table, open(act_table, 'wb')) return table
def simple_assertions(self, updates, num_bricks=2, num_updates=4):
    """Shared assertions for simple tests."""
    assert len(updates) == num_updates
    assert all(is_shared_variable(u[0]) for u in updates)
    # This order is somewhat arbitrary and implementation-dependent
    means = set(u[0] for u in updates
                if has_roles(u[0], [BATCH_NORM_POPULATION_MEAN]))
    stdevs = set(u[0] for u in updates
                 if has_roles(u[0], [BATCH_NORM_POPULATION_STDEV]))
    assert means.isdisjoint(stdevs)
    assert len(set(get_brick(v) for v in means)) == num_bricks
    assert len(set(get_brick(v) for v in stdevs)) == num_bricks
def initialize(self, **kwargs):
    logger.info("BatchNormAccumulate initializing")
    # get list of bricks
    bricks_seen = set()
    for p in self.parameters:
        brick = get_brick(p)
        if brick not in bricks_seen:
            bricks_seen.add(brick)
    # ensure all updates account for all bricks
    update_parameters = set()
    for b in bricks_seen:
        for var, update in b.updates.items():
            update_parameters.add(var)
        assert b.n.get_value() == 0
    if set(update_parameters) != set(self.parameters):
        raise ValueError("The updates and the parameters passed in do not "
                         "match. This could be due to no applications or "
                         "multiple applications; found %d updates and "
                         "%d parameters" % (len(update_parameters),
                                            len(self.parameters)))
    updates = dict_union(*[b.updates for b in bricks_seen])
    logger.info("Compiling BatchNorm accumulate")
    self._func = theano.function(self.inputs, [], updates=updates,
                                 on_unused_input="warn")
    super(BatchNormAccumulate, self).initialize(**kwargs)
def __init__(self, samples): # Extracting information from the sampling computation graph self.cg = ComputationGraph(samples) self.inputs = self.cg.inputs self.generator = get_brick(samples) if not isinstance(self.generator, BaseSequenceGenerator): raise ValueError self.generate_call = get_application_call(samples) if (not self.generate_call.application == self.generator.generate): raise ValueError self.inner_cg = ComputationGraph(self.generate_call.inner_outputs) # Fetching names from the sequence generator self.context_names = self.generator.generate.contexts self.state_names = self.generator.generate.states # Parsing the inner computation graph of sampling scan self.contexts = [ VariableFilter(bricks=[self.generator], name=name, roles=[INPUT])(self.inner_cg)[0] for name in self.context_names ] self.input_states = [] # Includes only those state names that were actually used # in 'generate' self.input_state_names = [] for name in self.generator.generate.states: var = VariableFilter(bricks=[self.generator], name=name, roles=[INPUT])(self.inner_cg) if var: self.input_state_names.append(name) self.input_states.append(var[0]) self.compiled = False
def __init__(self, beam_size, samples): self.beam_size = beam_size # Extracting information from the sampling computation graph cg = ComputationGraph(samples) self.inputs = cg.inputs self.generator = get_brick(samples) if not isinstance(self.generator, BaseSequenceGenerator): raise ValueError self.generate_call = get_application_call(samples) if not self.generate_call.application == self.generator.generate: raise ValueError self.inner_cg = ComputationGraph(self.generate_call.inner_outputs) # Fetching names from the sequence generator self.context_names = self.generator.generate.contexts self.state_names = self.generator.generate.states # Parsing the inner computation graph of sampling scan self.contexts = [ VariableFilter(bricks=[self.generator], name=name, roles=[INPUT])(self.inner_cg)[0] for name in self.context_names ] self.input_states = [] # Includes only those state names that were actually used # in 'generate' self.input_state_names = [] for name in self.generator.generate.states: var = VariableFilter(bricks=[self.generator], name=name, roles=[INPUT])(self.inner_cg) if var: self.input_state_names.append(name) self.input_states.append(var[0]) self.compiled = False
def test_application_call():
    X = tensor.matrix('X')
    brick = TestBrick(0)
    Y = brick.access_application_call(X)
    (auxiliary_variable,) = get_application_call(Y).auxiliary_variables
    assert auxiliary_variable.name == 'test_val'
    assert get_brick(auxiliary_variable) == brick
    assert get_application_call(Y).auxiliary_variables[0].name == 'test_val'
def test_application_call():
    X = tensor.matrix('X')
    brick = TestBrick()
    Y = brick.access_application_call(X)
    (auxiliary_variable,) = get_application_call(Y).auxiliary_variables
    assert auxiliary_variable.name == 'test_val'
    assert get_brick(auxiliary_variable) == brick
    assert get_application_call(Y).auxiliary_variables[0].name == 'test_val'
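# Hedged sketch of what a brick like TestBrick typically does to produce the
# auxiliary variable checked above; the body is an assumption based on the
# blocks @application / application_call convention, not the original
# TestBrick source.
# @application
# def access_application_call(self, x, application_call):
#     application_call.add_auxiliary_variable(x.mean(), name='test_val')
#     return x + 1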
def __call__(self, obj):
    if isinstance(obj, SharedVariable):
        super(PersistentParameterID, self).__call__(obj)
        if hasattr(obj.tag, 'annotations'):
            name = '{}.{}'.format(
                BRICK_DELIMITER.join(
                    [brick.name for brick
                     in get_brick(obj).get_unique_path()]),
                obj.name)
        else:
            name = obj.name
        self.ndarray_names[id(obj.container.storage[0])] = name
    if id(obj) in self.ndarray_names:
        PersistentCudaNdarrayID.__call__(self, obj)
def infer_population(data_stream, model, n_batches):
    """Sets the population parameters for a given model"""
    # construct a main loop with algorithm
    algorithm = BatchNormAccumulate(model)
    main_loop = MainLoop(
        algorithm=algorithm,
        data_stream=data_stream,
        model=model,
        extensions=[FinishAfter(after_n_batches=n_batches), ProgressBar()])
    main_loop.run()
    parameters = get_batchnorm_parameters(model)
    batchnorm_bricks = set([get_brick(p) for p in parameters])
    for b in batchnorm_bricks:
        b.use_population = True
def __call__(self, parameter):
    # Standard Blocks parameter
    if get_brick(parameter) is not None:
        name = get_brick(parameter).get_hierarchical_name(
            parameter, SERIALIZATION_BRICK_DELIMITER)
    # Shared variables with tag.name
    elif hasattr(parameter.tag, 'name'):
        name = parameter.tag.name
    # Standard shared variable
    elif parameter.name is not None:
        name = parameter.name
    # Variables without names
    else:
        name = self.default_name
    # Handle naming collisions
    if name in self.used_names:
        i = 2
        new_name = '_'.join([name, str(i)])
        while new_name in self.used_names:
            i += 1
            new_name = '_'.join([name, str(i)])
        name = new_name
    self.used_names.add(name)
    return name
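# Illustrative only: with a '/' delimiter like the hierarchical paths used
# elsewhere in this file (for example '/lenet/mlp/linear_1.W'), a parameter
# named 'W' owned by a brick on the unique path lenet -> mlp -> linear_1
# would get the name built below.
path_names = ['lenet', 'mlp', 'linear_1']
hierarchical = '{}.{}'.format('/'.join([''] + path_names), 'W')
assert hierarchical == '/lenet/mlp/linear_1.W'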
def get_batch_norm_bricks(graph):
    """Returns the batch norm bricks (BatchNorm and BatchNorm3D) in a
    computation graph.

    Parameters
    ----------
    graph : instance of :class:`ComputationGraph`
        The training computation graph.
    """
    bricks = []
    for variable in graph.variables:
        brick = get_brick(variable)
        if isinstance(brick, BatchNorm):
            if brick not in bricks:
                bricks.append(brick)
    return bricks
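# Hedged usage sketch: assuming `cost` is a Theano variable built from a
# network containing BatchNorm bricks, the helper above can drive the same
# switch to population statistics that infer_population performs earlier in
# this file.
# cg = ComputationGraph([cost])
# for brick in get_batch_norm_bricks(cg):
#     brick.use_population = True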
def _create_maximum_activation_for(output, topn, dims=None):
    # Automatically compute the number of units
    if dims is None:
        dims = get_brick(output).get_dims(['output'])[0]
    if isinstance(dims, numbers.Integral):
        dims = (dims,)
        index = theano.shared(numpy.zeros((topn, dims[0]), dtype=numpy.int))
        snapshot = None
    else:
        index = theano.shared(
            numpy.zeros((topn, dims[0], 3), dtype=numpy.int))
        snapshot = theano.shared(numpy.zeros((topn,) + dims))
    quantity = shared_floatx_zeros((topn, dims[0]))
    index.tag.for_output = output
    add_role(index, MAXIMUM_ACTIVATION_INDEX)
    quantity.tag.for_output = output
    add_role(quantity, MAXIMUM_ACTIVATION_QUANTITY)
    return (dims, quantity, index, snapshot)
def evaluate(c, tar_path, *args, **kwargs): """ Performs rudimentary evaluation of SNLI/MNLI run * Runs on valid and test given network * Saves all predictions * Saves embedding matrix * Saves results.json and predictions.csv """ # Load and configure model = kwargs['model'] assert c.endswith("json") c = json.load(open(c)) # Very ugly absolute path fix ABS_PATHS = [ "data/", "/mnt/users/jastrzebski/local/dict_based_learning/data/", "/data/cf9ffb48-61bd-40dc-a011-b2e7e5acfd72/" ] from six import string_types for abs_path in ABS_PATHS: for k in c: if isinstance(c[k], string_types): if c[k].startswith(abs_path): c[k] = c[k][len(abs_path):] # Make data paths nice for path in [ 'dict_path', 'embedding_def_path', 'embedding_path', 'vocab', 'vocab_def', 'vocab_text' ]: if c.get(path, ''): if not os.path.isabs(c[path]): c[path] = os.path.join(fuel.config.data_path[0], c[path]) logging.info("Updating config with " + str(kwargs)) c.update(**kwargs) # NOTE: This assures we don't miss crucial definition for some def heavy words # usually it is a good idea c['max_def_per_word'] = c['max_def_per_word'] * 2 assert tar_path.endswith("tar") dest_path = os.path.dirname(tar_path) prefix = os.path.splitext(os.path.basename(tar_path))[0] s1_decoded, s2_decoded = T.lmatrix('sentence1'), T.lmatrix('sentence2') if c['dict_path']: s1_def_map, s2_def_map = T.lmatrix('sentence1_def_map'), T.lmatrix( 'sentence2_def_map') def_mask = T.fmatrix("def_mask") defs = T.lmatrix("defs") else: s1_def_map, s2_def_map = None, None def_mask = None defs = None s1_mask, s2_mask = T.fmatrix('sentence1_mask'), T.fmatrix('sentence2_mask') if model == 'simple': model, data, used_dict, used_retrieval, used_vocab = _initialize_simple_model_and_data( c) elif model == 'esim': model, data, used_dict, used_retrieval, used_vocab = _initialize_esim_model_and_data( c) else: raise NotImplementedError() pred = model.apply(s1_decoded, s1_mask, s2_decoded, s2_mask, def_mask=def_mask, defs=defs, s1_def_map=s1_def_map, s2_def_map=s2_def_map, train_phase=False) cg = ComputationGraph([pred]) if c.get("bn", True): bn_params = [ p for p in VariableFilter(bricks=[BatchNormalization])(cg) if hasattr(p, "set_value") ] else: bn_params = [] # Load model model = Model(cg.outputs) parameters = model.get_parameter_dict() # Blocks version mismatch logging.info( "Trainable parameters" + "\n" + pprint.pformat([(key, parameters[key].get_value().shape) for key in sorted([ get_brick(param).get_hierarchical_name(param) for param in cg.parameters ])], width=120)) logging.info("# of parameters {}".format( sum([ np.prod(parameters[key].get_value().shape) for key in sorted([ get_brick(param).get_hierarchical_name(param) for param in cg.parameters ]) ]))) with open(tar_path) as src: params = load_parameters(src) loaded_params_set = set(params.keys()) model_params_set = set([ get_brick(param).get_hierarchical_name(param) for param in cg.parameters ]) logging.info("Loaded extra parameters") logging.info(loaded_params_set - model_params_set) logging.info("Missing parameters") logging.info(model_params_set - loaded_params_set) model.set_parameter_values(params) if c.get("bn", True): logging.info("Loading " + str([ get_brick(param).get_hierarchical_name(param) for param in bn_params ])) for param in bn_params: param.set_value( params[get_brick(param).get_hierarchical_name(param)]) for p in bn_params: model._parameter_dict[get_brick(p).get_hierarchical_name(p)] = p # Read logs logs = pd.read_csv(os.path.join(dest_path, "logs.csv")) best_val_acc = 
logs['valid_misclassificationrate_apply_error_rate'].min() logging.info("Best measured valid acc: " + str(best_val_acc)) # NOTE(kudkudak): We need this to have comparable mean rank and embedding scores reference_vocab = Vocabulary( os.path.join(fuel.config.data_path[0], c['data_path'], 'vocab.txt')) vocab_all = Vocabulary( os.path.join( fuel.config.data_path[0], c['data_path'], 'vocab_all.txt')) # Can include OOV words, which is interesting retrieval_all = Retrieval(vocab_text=used_vocab, dictionary=used_dict, max_def_length=c['max_def_length'], exclude_top_k=0, max_def_per_word=c['max_def_per_word']) # logging.info("Calculating dict and word embeddings for vocab.txt and vocab_all.txt") # for name in ['s1_word_embeddings', 's1_dict_word_embeddings']: # variables = VariableFilter(name=name)(cg) # if len(variables): # s1_emb = variables[0] # # A bit sloppy about downcast # # if "dict" in name: # embedder = construct_dict_embedder( # theano.function([s1_decoded, defs, def_mask, s1_def_map], s1_emb, allow_input_downcast=True), # vocab=data.vocab, retrieval=retrieval_all) # else: # embedder = construct_embedder(theano.function([s1_decoded], s1_emb, allow_input_downcast=True), # vocab=data.vocab) # # for v_name, v in [("vocab_all", vocab_all), ("vocab", reference_vocab)]: # logging.info("Calculating {} embeddings for {}".format(name, v_name)) # Predict predict_fnc = theano.function(cg.inputs, pred) results = {} batch_size = 14 for subset in ['valid', 'test']: logging.info("Predicting on " + subset) stream = data.get_stream(subset, batch_size=batch_size, seed=778) it = stream.get_epoch_iterator() rows = [] for ex in tqdm.tqdm(it, total=10000 / batch_size): ex = dict(zip(stream.sources, ex)) inp = [ex[v.name] for v in cg.inputs] prob = predict_fnc(*inp) label_pred = np.argmax(prob, axis=1) for id in range(len(prob)): s1_decoded = used_vocab.decode(ex['sentence1'][id]).split() s2_decoded = used_vocab.decode(ex['sentence2'][id]).split() assert used_vocab == data.vocab s1_decoded = [ '*' + w + '*' if used_vocab.word_to_id(w) > c['num_input_words'] else w for w in s1_decoded ] s2_decoded = [ '*' + w + '*' if used_vocab.word_to_id(w) > c['num_input_words'] else w for w in s2_decoded ] # Different difficulty metrics # text_unk_percentage s1_no_pad = [w for w in ex['sentence1'][id] if w != 0] s2_no_pad = [w for w in ex['sentence2'][id] if w != 0] s1_unk_percentage = sum([ 1. for w in s1_no_pad if w == used_vocab.unk ]) / len(s1_no_pad) s2_unk_percentage = sum([ 1. 
for w in s1_no_pad if w == used_vocab.unk ]) / len(s2_no_pad) # mean freq word s1_mean_freq = np.mean([ 0 if w == data.vocab.unk else used_vocab._id_to_freq[w] for w in s1_no_pad ]) s2_mean_freq = np.mean([ 0 if w == data.vocab.unk else used_vocab._id_to_freq[w] for w in s2_no_pad ]) # mean rank word (UNK is max rank) # NOTE(kudkudak): Will break if we reindex unk between vocabs :P s1_mean_rank = np.mean([ reference_vocab.size() if reference_vocab.word_to_id( used_vocab.id_to_word(w)) == reference_vocab.unk else reference_vocab.word_to_id(used_vocab.id_to_word(w)) for w in s1_no_pad ]) s2_mean_rank = np.mean([ reference_vocab.size() if reference_vocab.word_to_id( used_vocab.id_to_word(w)) == reference_vocab.unk else reference_vocab.word_to_id(used_vocab.id_to_word(w)) for w in s2_no_pad ]) rows.append({ "pred": label_pred[id], "true_label": ex['label'][id], "s1": ' '.join(s1_decoded), "s2": ' '.join(s2_decoded), "s1_unk_percentage": s1_unk_percentage, "s2_unk_percentage": s2_unk_percentage, "s1_mean_freq": s1_mean_freq, "s2_mean_freq": s2_mean_freq, "s1_mean_rank": s1_mean_rank, "s2_mean_rank": s2_mean_rank, "p_0": prob[id, 0], "p_1": prob[id, 1], "p_2": prob[id, 2] }) preds = pd.DataFrame(rows, columns=rows[0].keys()) preds.to_csv( os.path.join(dest_path, prefix + '_predictions_{}.csv'.format(subset))) results[subset] = {} results[subset]['misclassification'] = 1 - np.mean( preds.pred == preds.true_label) if subset == "valid" and np.abs( (1 - np.mean(preds.pred == preds.true_label)) - best_val_acc) > 0.001: logging.error("!!!") logging.error( "Found different best_val_acc. Probably due to changed specification of the model class." ) logging.error("Discrepancy {}".format( (1 - np.mean(preds.pred == preds.true_label)) - best_val_acc)) logging.error("!!!") logging.info(results) json.dump(results, open(os.path.join(dest_path, prefix + '_results.json'), "w"))
def __init__(self): srng = MRG_RandomStreams(seed=123) X = T.matrix('features') self.X = X #drop = Dropout(p_drop=0.5) #o = drop.apply(X) o = (X - 128) / 128.0 self.scaled = o #n_hidden = 64 n_hidden = 2048 * 2 n_zs = 1024 self.n_zs = n_zs self.n_hidden = n_hidden l = Linear(input_dim=32 * 32 * 3, output_dim=n_hidden, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) l.initialize() o = l.apply(o) o = Rectifier().apply(o) l = Linear(input_dim=n_hidden, output_dim=n_hidden, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) l.initialize() o = l.apply(o) o = Rectifier().apply(o) l = Linear(input_dim=n_hidden, output_dim=n_zs, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) l.initialize() mu_encoder = l.apply(o) l = Linear(input_dim=n_hidden, output_dim=n_zs, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) l.initialize() log_sigma_encoder = l.apply(o) eps = srng.normal(log_sigma_encoder.shape) z = eps * T.exp(log_sigma_encoder) + mu_encoder z_to_h1_decode = Linear(input_dim=n_zs, output_dim=n_hidden, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) z_to_h1_decode.initialize() h1_decode_to_h_decode = Linear(input_dim=n_hidden, output_dim=n_hidden, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) h1_decode_to_h_decode.initialize() h_decode_produce = Linear(input_dim=n_hidden, output_dim=32 * 32 * 3, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), name="linear4") h_decode_produce.initialize() #o = h_decode_produce.apply(h_decoder) h_decode_produce = Linear(input_dim=n_hidden, output_dim=32 * 32 * 3, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), name="linear4") h_decode_produce.initialize() #self.produced = Sigmoid().apply(o) seq = Sequence([ z_to_h1_decode.apply, Rectifier().apply, h1_decode_to_h_decode.apply, Rectifier().apply, h_decode_produce.apply, Sigmoid().apply ]) seq.initialize() self.produced = seq.apply(z) self.cost = T.mean(T.sqr(self.produced - self.scaled)) #self.cost = T.sum(T.nnet.binary_crossentropy(self.produced, self.scaled)) #T.sum(T.sqr(self.produced - self.scaled)) self.cost.name = "cost" self.variational_cost = - 0.5 * T.mean(1 + 2*log_sigma_encoder - mu_encoder * mu_encoder\ - T.exp(2 * log_sigma_encoder)) + self.cost self.variational_cost.name = "variational_cost" self.Z = T.matrix('z') self.sampled = seq.apply(self.Z) cg = ComputationGraph([self.variational_cost]) bricks = [ get_brick(var) for var in cg.variables + cg.scan_variables if get_brick(var) ] for i, b in enumerate(bricks): b.name = b.name + "_" + str(i)
def get_top_brick(self, param):
    brick = get_brick(param)
    while len(brick.parents) > 0 and not isinstance(
            brick, DependencyRecognizer):
        brick = brick.parents[0]
    return brick
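# Toy sketch (pure Python; Node is an illustrative stand-in for a brick) of
# the parent walk above: starting from the brick that owns a parameter,
# follow parents[0] until the root of the hierarchy (or a recognizer brick)
# is reached.
class Node(object):
    def __init__(self, name, parent=None):
        self.name = name
        self.parents = [parent] if parent is not None else []

root = Node('recognizer')
leaf = Node('linear', parent=root)
brick = leaf
while len(brick.parents) > 0:
    brick = brick.parents[0]
assert brick is root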
def main(save_to, hist_file): batch_size = 365 feature_maps = [6, 16] mlp_hiddens = [120, 84] conv_sizes = [5, 5] pool_sizes = [2, 2] image_size = (28, 28) output_size = 10 # The above are from LeCun's paper. The blocks example had: # feature_maps = [20, 50] # mlp_hiddens = [500] # Use ReLUs everywhere and softmax for the final prediction conv_activations = [Rectifier() for _ in feature_maps] mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()] convnet = LeNet(conv_activations, 1, image_size, filter_sizes=zip(conv_sizes, conv_sizes), feature_maps=feature_maps, pooling_sizes=zip(pool_sizes, pool_sizes), top_mlp_activations=mlp_activations, top_mlp_dims=mlp_hiddens + [output_size], border_mode='valid', weights_init=Uniform(width=.2), biases_init=Constant(0)) # We push initialization config to set different initialization schemes # for convolutional layers. convnet.push_initialization_config() convnet.layers[0].weights_init = Uniform(width=.2) convnet.layers[1].weights_init = Uniform(width=.09) convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08) convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11) convnet.initialize() logging.info( "Input dim: {} {} {}".format(*convnet.children[0].get_dim('input_'))) for i, layer in enumerate(convnet.layers): if isinstance(layer, Activation): logging.info("Layer {} ({})".format(i, layer.__class__.__name__)) else: logging.info("Layer {} ({}) dim: {} {} {}".format( i, layer.__class__.__name__, *layer.get_dim('output'))) mnist_test = MNIST(("test", ), sources=['features', 'targets']) x = tensor.tensor4('features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet probs = convnet.apply(x) error_rate = (MisclassificationRate().apply(y.flatten(), probs).copy(name='error_rate')) confusion = (ConfusionMatrix().apply(y.flatten(), probs).copy(name='confusion')) confusion.tag.aggregation_scheme = Sum(confusion) model = Model([error_rate, confusion]) # Load it with trained parameters params = load_parameters(open(save_to, 'rb')) model.set_parameter_values(params) def full_brick_name(brick): return '/'.join([''] + [b.name for b in brick.get_unique_path()]) # Find layer outputs to probe outs = OrderedDict( (full_brick_name(get_brick(out)), out) for out in VariableFilter( roles=[OUTPUT], bricks=[Convolutional, Linear])(model.variables)) # Load histogram information with open(hist_file, 'rb') as handle: histograms = pickle.load(handle) # Corpora mnist_train = MNIST(("train", )) mnist_train_stream = DataStream.default_stream( mnist_train, iteration_scheme=ShuffledScheme(mnist_train.num_examples, batch_size)) mnist_test = MNIST(("test", )) mnist_test_stream = DataStream.default_stream( mnist_test, iteration_scheme=ShuffledScheme(mnist_test.num_examples, batch_size)) # Probe the given layer target_layer = '/lenet/mlp/linear_0' next_layer_param = '/lenet/mlp/linear_1.W' sample = extract_sample(outs[target_layer], mnist_test_stream) print('sample shape', sample.shape) # Figure neurons to ablate hist = histograms[('linear_1', 'b')] targets = [i for i in range(hist.shape[1]) if hist[2, i] * hist[7, i] < 0] print('ablating', len(targets), ':', targets) # Now adjust the next layer weights based on the probe param = model.get_parameter_dict()[next_layer_param] print('param shape', param.get_value().shape) new_weights = ablate_inputs(targets, sample, param.get_value(), compensate=False) param.set_value(new_weights) # Evaluation pass evaluator = DatasetEvaluator([error_rate, confusion]) 
print(evaluator.evaluate(mnist_test_stream))
def get_parameter_name(parameter):
    return "%s%s" % (_get_name(get_brick(parameter)),
                     Path.ParameterName(parameter).part())
o = l.apply(o) o = Softmax().apply(o) Y = T.imatrix(name="targets") cost = CategoricalCrossEntropy().apply(Y.flatten(), o) cost.name = "cost" miss_class = 1.0 - MisclassificationRate().apply(Y.flatten(), o) miss_class.name = "accuracy" cg = ComputationGraph(cost) print cg.shared_variables bricks = [get_brick(var) for var in cg.variables if get_brick(var)] for i, b in enumerate(bricks): b.name += str(i) step_rule = AdaM() algorithm = GradientDescent(cost=cost, step_rule=step_rule) print "Loading data" mnist_train = MNIST("train") train_stream = DataStream(dataset=mnist_train, iteration_scheme=SequentialScheme( num_examples=mnist_train.num_examples, batch_size=128)) #iteration_scheme= SequentialScheme(num_examples= 1000, batch_size= 128)) mnist_test = MNIST("test")
def create_main_loop(save_to, num_epochs, unit_order=None, batch_size=500, num_batches=None): image_size = (28, 28) output_size = 10 convnet = create_lenet_5() x = tensor.tensor4('features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet probs = convnet.apply(x) case_costs = CasewiseCrossEntropy().apply(y.flatten(), probs) cost = case_costs.mean().copy(name='cost') # cost = (CategoricalCrossEntropy().apply(y.flatten(), probs) # .copy(name='cost')) error_rate = (MisclassificationRate().apply(y.flatten(), probs).copy(name='error_rate')) cg = ComputationGraph([cost, error_rate]) # Apply regularization to the cost weights = VariableFilter(roles=[WEIGHT])(cg.variables) cost = cost + sum([0.0003 * (W**2).sum() for W in weights]) cost.name = 'cost_with_regularization' mnist_train = MNIST(("train", )) mnist_train_stream = DataStream.default_stream( mnist_train, iteration_scheme=ShuffledScheme(mnist_train.num_examples, batch_size)) mnist_test = MNIST(("test", )) mnist_test_stream = DataStream.default_stream( mnist_test, iteration_scheme=ShuffledScheme(mnist_test.num_examples, batch_size)) # Generate pics for biases biases = VariableFilter(roles=[BIAS])(cg.parameters) # Train with simple SGD algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=AdaDelta()) # Find layer outputs to probe outs = OrderedDict( reversed( list((get_brick(out).name, out) for out in VariableFilter(roles=[OUTPUT], bricks=[Convolutional, Linear])( cg.variables)))) actpic_extension = ActpicExtension(actpic_variables=outs, case_labels=y, pics=x, label_count=output_size, rectify=-1, data_stream=mnist_test_stream, after_batch=True) synpic_extension = SynpicExtension(synpic_parameters=biases, case_costs=case_costs, case_labels=y, pics=x, batch_size=batch_size, pic_size=image_size, label_count=output_size, after_batch=True) # Impose an orderint for the SaveImages extension if unit_order is not None: with open(unit_order, 'rb') as handle: histograms = pickle.load(handle) unit_order = compute_unit_order(histograms) # `Timing` extension reports time for reading data, aggregating a batch # and monitoring; # `ProgressBar` displays a nice progress bar during training. extensions = [ Timing(), FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches), actpic_extension, synpic_extension, SaveImages(picsources=[synpic_extension, actpic_extension], title="LeNet-5: batch {i}, " + "cost {cost_with_regularization:.2f}, " + "trainerr {error_rate:.3f}", data=[cost, error_rate], graph='error_rate', graph_len=500, unit_order=unit_order, after_batch=True), DataStreamMonitoring([cost, error_rate], mnist_test_stream, prefix="test"), TrainingDataMonitoring([ cost, error_rate, aggregation.mean(algorithm.total_gradient_norm) ], prefix="train", after_epoch=True), Checkpoint(save_to), ProgressBar(), Printing() ] model = Model(cost) main_loop = MainLoop(algorithm, mnist_train_stream, model=model, extensions=extensions) return main_loop
def initialize_graph(recognizer, data, config, params): # Separate attention_params to be handled differently # when regularization is applied attentions = recognizer.all_children().generator.transition.attention.get() attention_params = [Selector(attention).get_parameters().values() for attention in attentions] logger.info( "Initialization schemes for all bricks.\n" "Works well only in my branch with __repr__ added to all them,\n" "there is an issue #463 in Blocks to do that properly.") def show_init_scheme(cur): result = dict() for attr in dir(cur): if attr.endswith('_init'): result[attr] = getattr(cur, attr) for child in cur.children: result[child.name] = show_init_scheme(child) return result logger.info(pprint.pformat(show_init_scheme(recognizer))) observables = [] # monitored each batch cg = recognizer.get_cost_graph(batch=True) labels = [] labels_mask = [] for chld in recognizer.children: lbls = VariableFilter(applications=[chld.cost], name='labels'+chld.names_postfix)(cg) lbls_mask = VariableFilter(applications=[chld.cost], name='labels_mask'+chld.names_postfix)(cg) if len(lbls) == 1: labels += lbls labels_mask += lbls_mask batch_cost = cg.outputs[0].sum() batch_size = rename(labels[0].shape[1], "batch_size") # Assumes constant batch size. `aggregation.mean` is not used because # of Blocks #514. cost = batch_cost / batch_size cost.name = "sequence_total_cost" logger.info("Cost graph is built") # Fetch variables useful for debugging. # It is important not to use any aggregation schemes here, # as it's currently impossible to spread the effect of # regularization on their variables, see Blocks #514. cost_cg = ComputationGraph(cost) bottom_output = VariableFilter( # We need name_regex instead of name because LookupTable calls itsoutput output_0 applications=recognizer.all_children().bottom.apply.get(), name_regex="output")( cost_cg) attended = VariableFilter( applications=recognizer.all_children().generator.transition.apply.get(), name="attended")( cost_cg) attended_mask = VariableFilter( applications=recognizer.all_children().generator.transition.apply.get(), name="attended_mask")( cost_cg) weights = VariableFilter( applications=recognizer.all_children().generator.evaluate.get(), name="weights")( cost_cg) def get_renamed_list(rlist, elem_func, elem_name): return [rename(elem_func(elem), elem_name+chld.names_postfix) for elem,chld in zip(rlist, recognizer.children)] max_sentence_lengths = get_renamed_list(bottom_output, lambda e: e.shape[0], "max_sentence_length") max_attended_mask_lengths = get_renamed_list(attended_mask, lambda e: e.shape[0], "max_attended_mask_length") max_attended_lengths = get_renamed_list(attended, lambda e: e.shape[0], "max_attended_length") max_num_characters = get_renamed_list(labels, lambda e: e.shape[0], "max_num_characters") mean_attended = get_renamed_list(attended, lambda e: abs(e).mean(), "mean_attended") mean_bottom_output = get_renamed_list(bottom_output, lambda e: abs(e).mean(), "mean_bottom_output") mask_density = get_renamed_list(labels_mask, lambda e: e.mean(), "mask_density") weights_entropy = [rename(entropy(w, lm), "weights_entropy"+chld.names_postfix) for w, lm, chld in zip(weights, labels_mask, recognizer.children)] observables += max_attended_lengths + max_attended_mask_lengths + max_sentence_lengths # # Monitoring of cost terms is tricky because of Blocks #514 - since the # costs are annotations that are not part of the original output graph, # they are unaffected by replacements such as dropout!! 
# cost_terms = [] for chld in recognizer.children: chld_cost_terms = VariableFilter(applications=[chld.generator.evaluate], name_regex='.*_nll')(cost_cg) chld_cost_terms = [rename(var, var.name[:-4] + chld.names_postfix + '_nll') for var in chld_cost_terms] cost_terms += chld_cost_terms cg = ComputationGraph([cost, batch_size] + weights_entropy + mean_attended + mean_bottom_output + max_num_characters + mask_density + cost_terms) # Regularization. It is applied explicitly to all variables # of interest, it could not be applied to the cost only as it # would not have effect on auxiliary variables, see Blocks #514. reg_config = config['regularization'] regularized_cg = cg if reg_config.get('dropout'): drop_conf = reg_config['dropout'] bot_drop = drop_conf.get('bottom', 0.0) if bot_drop: logger.info('apply bottom dropout') regularized_cg = apply_dropout(regularized_cg, bottom_output, bot_drop) enc_drop = drop_conf.get('encoder', 0.0) if enc_drop: logger.info('apply encoder dropout') enc_bricks = reduce(lambda acc,x: acc+list(x), recognizer.all_children().encoder.children.get(), []) enc_states = VariableFilter(bricks=enc_bricks, name_regex='states')(regularized_cg) regularized_cg = apply_dropout(regularized_cg, enc_states, enc_drop) post_merge_drop = drop_conf.get('post_merge', 0.0) if post_merge_drop: logger.info('apply post_merge dropout') pm_bricks = [] for chld in recognizer.children: cpm_bricks = list(chld.generator.readout.post_merge.children) cpm_bricks += cpm_bricks[-1].children cpm_bricks = [b for b in cpm_bricks if isinstance(b, type(chld.post_merge_activation))] pm_bricks += cpm_bricks regularized_cg = apply_dropout( regularized_cg, VariableFilter(bricks=pm_bricks, name='output')(regularized_cg), post_merge_drop) if reg_config.get('noise'): logger.info('apply noise') noise_subjects = [p for p in cg.parameters if p not in attention_params] regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise']) train_cost = regularized_cg.outputs[0] if reg_config.get("penalty_coof", .0) > 0: # big warning!!! 
# here we assume that: # regularized_weights_penalty = regularized_cg.outputs[1] train_cost = (train_cost + reg_config.get("penalty_coof", .0) * regularized_cg.outputs[1] / batch_size) if reg_config.get("decay", .0) > 0: train_cost = (train_cost + reg_config.get("decay", .0) * l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2) train_cost = train_cost.copy(name='train_cost') gradients = None if reg_config.get('adaptive_noise'): logger.info('apply adaptive noise') if ((reg_config.get("penalty_coof", .0) > 0) or (reg_config.get("decay", .0) > 0)): logger.error('using adaptive noise with alignment weight panalty ' 'or weight decay is probably stupid') train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise( cg, cg.outputs[0], variables=cg.parameters, num_examples=data.get_dataset('train').num_examples, parameters=SpeechModel(regularized_cg.outputs[0] ).get_parameter_dict().values(), **reg_config.get('adaptive_noise') ) train_cost.name = 'train_cost' adapt_noise_cg = ComputationGraph(train_cost) model_prior_mean = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_mean')(adapt_noise_cg)[0], 'model_prior_mean') model_cost = rename( VariableFilter(applications=[noise_brick.apply], name='model_cost')(adapt_noise_cg)[0], 'model_cost') model_prior_variance = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_variance')(adapt_noise_cg)[0], 'model_prior_variance') regularized_cg = ComputationGraph( [train_cost, model_cost] + regularized_cg.outputs + [model_prior_mean, model_prior_variance]) observables += [ regularized_cg.outputs[1], # model cost regularized_cg.outputs[2], # task cost regularized_cg.outputs[-2], # model prior mean regularized_cg.outputs[-1]] # model prior variance if len(cost_terms): # Please note - the aggragation (mean) is done in # "attach_aggregation_schemes" ct_names = [v.name for v in cost_terms] for v in regularized_cg.outputs: if v.name in ct_names: observables.append(rename(v.sum()/batch_size, v.name)) for chld in recognizer.children: if chld.train_tags: tags_cost = VariableFilter(applications=[chld.addTagCost], name='output')(regularized_cg)[0] observables += [rename(tags_cost.sum()/batch_size, 'tags_nll'+chld.names_postfix)] # Model is weird class, we spend lots of time arguing with Bart # what it should be. However it can already nice things, e.g. # one extract all the parameters from the computation graphs # and give them hierahical names. This help to notice when a # because of some bug a parameter is not in the computation # graph. 
model = SpeechModel(train_cost) if params: logger.info("Load parameters from " + params) # please note: we cannot use recognizer.load_params # as it builds a new computation graph that dies not have # shapred variables added by adaptive weight noise param_values = load_parameter_values(params) model.set_parameter_values(param_values) parameters = model.get_parameter_dict() logger.info("Parameters:\n" + pprint.pformat( [(key, parameters[key].get_value().shape) for key in sorted(parameters.keys())], width=120)) max_norm_rules = [] if reg_config.get('max_norm', False) > 0: logger.info("Apply MaxNorm") maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters) if reg_config.get('max_norm_exclude_lookup', False): maxnorm_subjects = [v for v in maxnorm_subjects if not isinstance(get_brick(v), LookupTable)] logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat([name for name, p in parameters.items() if p in maxnorm_subjects])) logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([name for name, p in parameters.items() if not p in maxnorm_subjects])) max_norm_rules = [ Restrict(VariableClipping(reg_config['max_norm'], axis=0), maxnorm_subjects)] return { 'observables': observables, 'max_norm_rules': max_norm_rules, 'cg': cg, 'regularized_cg' : regularized_cg, 'train_cost' : train_cost, 'cost' : cost, 'batch_size' : batch_size, 'batch_cost' : batch_cost, 'parameters' : parameters, 'gradients': gradients, 'model' : model, 'data' : data, 'recognizer' : recognizer, 'weights_entropy' : weights_entropy, 'labels_mask' : labels_mask, 'labels' : labels }
def main(save_to): batch_size = 500 image_size = (28, 28) output_size = 10 # The above are from LeCun's paper. The blocks example had: # feature_maps = [20, 50] # mlp_hiddens = [500] # Use ReLUs everywhere and softmax for the final prediction convnet = create_lenet_5() mnist_test = MNIST(("test",), sources=['features', 'targets']) basis_init = create_fair_basis(mnist_test, 10, 2) # b = shared_floatx(basis) # random_init = numpy.rand.random(100, 1000) # r = shared_floatx(random_init) # rn = r / r.norm(axis=1) # x = tensor.dot(rn, tensor.shape_padright(b)) x = shared_floatx(basis_init) # Normalize input and apply the convnet probs = convnet.apply(x) cg = ComputationGraph([probs]) outs = VariableFilter( roles=[OUTPUT], bricks=[Convolutional, Linear])(cg.variables) # Create an interior activation model model = Model([probs] + outs) # Load it with trained parameters params = load_parameters(open(save_to, 'rb')) model.set_parameter_values(params) learning_rate = shared_floatx(0.01, 'learning_rate') unit = shared_floatx(0, 'unit', dtype='int64') negate = False suffix = '_negsynth.jpg' if negate else '_synth.jpg' for output in outs: layer = get_brick(output) # For now, skip masks -for some reason they are always NaN iterations = 10000 layername = layer.parents[0].name + '-' + layer.name # if layername != 'noisylinear_2-linear': # continue dims = layer.get_dims(['output'])[0] if negate: measure = -output else: measure = output measure = measure[(slice(0, basis_init.shape[0]), ) + (slice(None),) * (measure.ndim - 1)] if isinstance(dims, numbers.Integral): dims = (dims, ) costvec = -tensor.log(tensor.nnet.softmax( measure)[:,unit].flatten()) else: flatout = measure.flatten(ndim=3) maxout = flatout.max(axis=2) costvec = -tensor.log(tensor.nnet.softmax( maxout)[:,unit].flatten()) # Add a regularization to favor gray images. # cost = costvec.sum() + (x - 0.5).norm(2) * ( # 10.0 / basis_init.shape[0]) cost = costvec.sum() grad = gradient.grad(cost, x) stepx = x - learning_rate * grad normx = stepx / tensor.shape_padright( stepx.flatten(ndim=2).max(axis=1), n_ones=3) newx = tensor.clip(normx, 0, 1) newx = newx[(slice(0, basis_init.shape[0]), ) + (slice(None),) * (newx.ndim - 1)] fn = theano.function([], [cost], updates=[(x, newx)]) filmstrip = Filmstrip( basis_init.shape[-2:], (dims[0], basis_init.shape[0]), background='red') for u in range(dims[0]): unit.set_value(u) x.set_value(basis_init) print('layer', layername, 'unit', u) for index in range(iterations): c = fn()[0] if index % 1000 == 0: print('cost', c) result = x.get_value() for i2 in range(basis_init.shape[0]): filmstrip.set_image((u, i2), result[i2,:,:,:]) filmstrip.save(layername + suffix) result = x.get_value() for index in range(basis_init.shape[0]): filmstrip.set_image((u, index), result[index,:,:,:]) filmstrip.save(layername + suffix)
def main(save_to): batch_size = 365 feature_maps = [6, 16] mlp_hiddens = [120, 84] conv_sizes = [5, 5] pool_sizes = [2, 2] image_size = (28, 28) output_size = 10 # The above are from LeCun's paper. The blocks example had: # feature_maps = [20, 50] # mlp_hiddens = [500] # Use ReLUs everywhere and softmax for the final prediction conv_activations = [Rectifier() for _ in feature_maps] mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()] convnet = LeNet(conv_activations, 1, image_size, filter_sizes=zip(conv_sizes, conv_sizes), feature_maps=feature_maps, pooling_sizes=zip(pool_sizes, pool_sizes), top_mlp_activations=mlp_activations, top_mlp_dims=mlp_hiddens + [output_size], border_mode='valid', weights_init=Uniform(width=.2), biases_init=Constant(0)) # We push initialization config to set different initialization schemes # for convolutional layers. convnet.push_initialization_config() convnet.layers[0].weights_init = Uniform(width=.2) convnet.layers[1].weights_init = Uniform(width=.09) convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08) convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11) convnet.initialize() logging.info( "Input dim: {} {} {}".format(*convnet.children[0].get_dim('input_'))) for i, layer in enumerate(convnet.layers): if isinstance(layer, Activation): logging.info("Layer {} ({})".format(i, layer.__class__.__name__)) else: logging.info("Layer {} ({}) dim: {} {} {}".format( i, layer.__class__.__name__, *layer.get_dim('output'))) random_init = (numpy.random.rand(100, 1, 28, 28) * 128).astype('float32') layers = [l for l in convnet.layers if isinstance(l, Convolutional)] mnist_test = MNIST(("test", ), sources=['features', 'targets']) basis_init = create_fair_basis(mnist_test, 10, 50) basis_set = make_shifted_basis(basis_init, convnet, layers) for layer, basis in zip(layers, basis_set): # basis is 5d: # (probed_units, base_cases, 1-c, 28-y, 28-x) b = shared_floatx(basis) # coefficients is 2d: # (probed_units, base_cases) coefficients = shared_floatx( numpy.ones(basis.shape[0:2], dtype=theano.config.floatX)) # prod is 5d: (probed_units, base_cases, 1-c, 28-y, 28-x) prod = tensor.shape_padright(coefficients, 3) * b # x is 4d: (probed_units, 1-c, 28-y, 28-x) ux = prod.sum(axis=1) x = tensor.clip( ux / tensor.shape_padright(ux.flatten(ndim=2).max(axis=1), 3), 0, 1) # Normalize input and apply the convnet probs = convnet.apply(x) cg = ComputationGraph([probs]) outs = VariableFilter(roles=[OUTPUT], bricks=[layer])(cg.variables) # Create an interior activation model model = Model([probs] + outs) # Load it with trained parameters params = load_parameters(open(save_to, 'rb')) model.set_parameter_values(params) learning_rate = shared_floatx(0.03, 'learning_rate') # We will try to do all units at once. # unit = shared_floatx(0, 'unit', dtype='int64') # But we are only doing one layer at once. 
output = outs[0] dims = layer.get_dims(['output'])[0] if isinstance(dims, numbers.Integral): # FC case: output is 2d: (probed_units, units) dims = (dims, ) unitrange = tensor.arange(dims[0]) costvec = -tensor.log( tensor.nnet.softmax(output)[unitrange, unitrange].flatten()) else: # Conv case: output is 4d: (probed_units, units, y, x) unitrange = tensor.arange(dims[0]) print('dims is', dims) costvec = -tensor.log( tensor.nnet.softmax(output[unitrange, unitrange, dims[1] // 2, dims[2] // 2]).flatten()) cost = costvec.sum() # grad is dims (probed_units, basis_size) grad = gradient.grad(cost, coefficients) stepc = coefficients # - learning_rate * grad newc = stepc / tensor.shape_padright(stepc.mean(axis=1)) fn = theano.function([], [cost, x], updates=[(coefficients, newc)]) filmstrip = Filmstrip(random_init.shape[-2:], (dims[0], 1), background='red') layer = get_brick(output) learning_rate.set_value(0.1) for index in range(20000): c, result = fn() if index % 1000 == 0: learning_rate.set_value(numpy.cast[theano.config.floatX]( learning_rate.get_value() * 0.8)) print('cost', c) for u in range(dims[0]): filmstrip.set_image((u, 0), result[u, :, :, :]) filmstrip.save(layer.name + '_stroke.jpg') for u in range(dims[0]): filmstrip.set_image((u, 0), result[u, :, :, :]) filmstrip.save(layer.name + '_stroke.jpg')
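# --- Added illustrative sketch (not part of the original code above) ---
# The cost above pairs example i with unit i through "diagonal" fancy
# indexing, softmax(output)[arange(n), arange(n)], so one graph probes all
# units at once instead of looping over a shared `unit` index as in the
# previous function.  A small numpy illustration of that indexing pattern:
import numpy as np

n = 4
output = np.random.randn(n, n)      # one example (row) per probed unit (column)
idx = np.arange(n)
diag = output[idx, idx]             # activation of unit i on example i
assert np.allclose(diag, np.diag(output))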
def __init__(self): srng = MRG_RandomStreams(seed=123) X = T.matrix('features') self.X = X #drop = Dropout(p_drop=0.5) #o = drop.apply(X) o = X self.noisy = o #n_hidden = 64 n_hidden = 128 n_zs = 2 self.n_zs = n_zs self.n_hidden = n_hidden l = Linear(input_dim=28*28, output_dim=n_hidden, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) l.initialize() o = l.apply(o) o = Tanh().apply(o) l = Linear(input_dim=n_hidden, output_dim=n_hidden, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) l.initialize() o = l.apply(o) o = Tanh().apply(o) l = Linear(input_dim=n_hidden, output_dim=n_zs, weights_init=IsotropicGaussian(.101), biases_init=Constant(0)) l.initialize() mu_encoder = l.apply(o) l = Linear(input_dim=n_hidden, output_dim=n_zs, weights_init=IsotropicGaussian(0.1), biases_init=Constant(0)) l.initialize() log_sigma_encoder = l.apply(o) eps = srng.normal(log_sigma_encoder.shape) z = eps * T.exp(log_sigma_encoder) + mu_encoder z_to_h1_decode = Linear(input_dim=n_zs, output_dim=n_hidden, weights_init=IsotropicGaussian(0.1), biases_init=Constant(0)) z_to_h1_decode.initialize() h1_decode_to_h_decode = Linear(input_dim=n_hidden, output_dim=n_hidden, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) h1_decode_to_h_decode.initialize() #o = z_to_h_decode.apply(z) #h_decoder = Tanh().apply(o) h_decode_produce = Linear(input_dim=n_hidden, output_dim=28*28, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), name="linear4") h_decode_produce.initialize() #o = h_decode_produce.apply(h_decoder) #self.produced = Sigmoid().apply(o) seq = Sequence([z_to_h1_decode.apply, Tanh().apply, h1_decode_to_h_decode.apply, Tanh().apply, h_decode_produce.apply, Sigmoid().apply]) seq.initialize() self.produced = seq.apply(z) self.cost = T.sum(T.sqr(self.produced - X)) #regular old mean squared #self.cost = T.sum(T.nnet.binary_crossentropy(self.produced, X)) #T.sum(T.sqr(self.produced - X)) self.cost.name = "cost" # Computed with L = 1, only one sample of produced. logpxz = T.sum(-1 * log_sigma_encoder * T.log(2*np.pi) - T.sqr((self.produced - X) / (2*T.exp(log_sigma_encoder)))) self.variational_cost = - 0.5 * T.sum(1 + 2*log_sigma_encoder - mu_encoder * mu_encoder\ - T.exp(2 * log_sigma_encoder)) + logpxz self.variational_cost.name = "variational_cost" self.Z = T.matrix('z') self.sampled = seq.apply(self.Z) cg = ComputationGraph([self.variational_cost]) bricks = [get_brick(var) for var in cg.variables + cg.scan_variables if get_brick(var)] for i, b in enumerate(bricks): b.name = b.name + "_" + str(i)
def main(save_to): batch_size = 365 feature_maps = [6, 16] mlp_hiddens = [120, 84] conv_sizes = [5, 5] pool_sizes = [2, 2] image_size = (28, 28) output_size = 10 # The above are from LeCun's paper. The blocks example had: # feature_maps = [20, 50] # mlp_hiddens = [500] # Use ReLUs everywhere and softmax for the final prediction conv_activations = [Rectifier() for _ in feature_maps] mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()] convnet = LeNet(conv_activations, 1, image_size, filter_sizes=zip(conv_sizes, conv_sizes), feature_maps=feature_maps, pooling_sizes=zip(pool_sizes, pool_sizes), top_mlp_activations=mlp_activations, top_mlp_dims=mlp_hiddens + [output_size], border_mode='valid', weights_init=Uniform(width=.2), biases_init=Constant(0)) # We push initialization config to set different initialization schemes # for convolutional layers. convnet.push_initialization_config() convnet.layers[0].weights_init = Uniform(width=.2) convnet.layers[1].weights_init = Uniform(width=.09) convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08) convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11) convnet.initialize() logging.info("Input dim: {} {} {}".format( *convnet.children[0].get_dim('input_'))) for i, layer in enumerate(convnet.layers): if isinstance(layer, Activation): logging.info("Layer {} ({})".format( i, layer.__class__.__name__)) else: logging.info("Layer {} ({}) dim: {} {} {}".format( i, layer.__class__.__name__, *layer.get_dim('output'))) x = tensor.tensor4('features') # Normalize input and apply the convnet probs = convnet.apply(x) cg = ComputationGraph([probs]) outs = VariableFilter( roles=[OUTPUT], bricks=[Convolutional, Linear])(cg.variables) # Create an interior activation model model = Model([probs] + outs) # Load it with trained parameters params = load_parameters(open(save_to, 'rb')) model.set_parameter_values(params) algorithm = MaximumActivationSearch(outputs=outs) # Use the mnist test set, unshuffled mnist_test = MNIST(("test",), sources=['features']) mnist_test_stream = DataStream.default_stream( mnist_test, iteration_scheme=SequentialScheme( mnist_test.num_examples, batch_size)) extensions = [Timing(), FinishAfter(after_n_epochs=1), DataStreamMonitoring( [], mnist_test_stream, prefix="test"), Checkpoint("maxact.tar"), ProgressBar(), Printing()] main_loop = MainLoop( algorithm, mnist_test_stream, model=model, extensions=extensions) main_loop.run() examples = mnist_test.get_example_stream() example = examples.get_data(0)[0] layers = convnet.layers for output, record in algorithm.maximum_activations.items(): layer = get_brick(output) activations, indices, snapshots = ( r.get_value() if r else None for r in record[1:]) filmstrip = Filmstrip( example.shape[-2:], (indices.shape[1], indices.shape[0]), background='blue') if layer in layers: fieldmap = layerarray_fieldmap(layers[0:layers.index(layer) + 1]) for unit in range(indices.shape[1]): for index in range(100): mask = make_mask(example.shape[-2:], fieldmap, numpy.clip( snapshots[index, unit, :, :], 0, numpy.inf)) imagenum = indices[index, unit, 0] filmstrip.set_image((unit, index), examples.get_data(imagenum)[0], mask) else: for unit in range(indices.shape[1]): for index in range(100): imagenum = indices[index, unit] filmstrip.set_image((unit, index), examples.get_data(imagenum)[0]) filmstrip.save(layer.name + '_maxact.jpg')
def initialize_all(config, save_path, bokeh_name, params, bokeh_server, bokeh, test_tag, use_load_ext, load_log, fast_start): root_path, extension = os.path.splitext(save_path) data = Data(**config['data']) train_conf = config['training'] recognizer = create_model(config, data, test_tag) # Separate attention_params to be handled differently # when regularization is applied attention = recognizer.generator.transition.attention attention_params = Selector(attention).get_parameters().values() logger.info( "Initialization schemes for all bricks.\n" "Works well only in my branch with __repr__ added to all them,\n" "there is an issue #463 in Blocks to do that properly.") def show_init_scheme(cur): result = dict() for attr in dir(cur): if attr.endswith('_init'): result[attr] = getattr(cur, attr) for child in cur.children: result[child.name] = show_init_scheme(child) return result logger.info(pprint.pformat(show_init_scheme(recognizer))) prediction, prediction_mask = add_exploration(recognizer, data, train_conf) # # Observables: # primary_observables = [] # monitored each batch secondary_observables = [] # monitored every 10 batches validation_observables = [] # monitored on the validation set cg = recognizer.get_cost_graph(batch=True, prediction=prediction, prediction_mask=prediction_mask) labels, = VariableFilter(applications=[recognizer.cost], name='labels')(cg) labels_mask, = VariableFilter(applications=[recognizer.cost], name='labels_mask')(cg) gain_matrix = VariableFilter( theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg) if len(gain_matrix): gain_matrix, = gain_matrix primary_observables.append(rename(gain_matrix.min(), 'min_gain')) primary_observables.append(rename(gain_matrix.max(), 'max_gain')) batch_cost = cg.outputs[0].sum() batch_size = rename(recognizer.labels.shape[1], "batch_size") # Assumes constant batch size. `aggregation.mean` is not used because # of Blocks #514. cost = batch_cost / batch_size cost.name = "sequence_total_cost" logger.info("Cost graph is built") # Fetch variables useful for debugging. # It is important not to use any aggregation schemes here, # as it's currently impossible to spread the effect of # regularization on their variables, see Blocks #514. 
cost_cg = ComputationGraph(cost) r = recognizer energies, = VariableFilter(applications=[r.generator.readout.readout], name="output_0")(cost_cg) bottom_output = VariableFilter( # We need name_regex instead of name because LookupTable calls its output output_0 applications=[r.bottom.apply], name_regex="output")(cost_cg)[-1] attended, = VariableFilter(applications=[r.generator.transition.apply], name="attended")(cost_cg) attended_mask, = VariableFilter(applications=[ r.generator.transition.apply ], name="attended_mask")(cost_cg) weights, = VariableFilter(applications=[r.generator.evaluate], name="weights")(cost_cg) from blocks.roles import AUXILIARY l2_cost, = VariableFilter(roles=[AUXILIARY], theano_name='l2_cost_aux')(cost_cg) cost_forward, = VariableFilter(roles=[AUXILIARY], theano_name='costs_forward_aux')(cost_cg) max_recording_length = rename(bottom_output.shape[0], "max_recording_length") # To exclude subsampling related bugs max_attended_mask_length = rename(attended_mask.shape[0], "max_attended_mask_length") max_attended_length = rename(attended.shape[0], "max_attended_length") max_num_phonemes = rename(labels.shape[0], "max_num_phonemes") min_energy = rename(energies.min(), "min_energy") max_energy = rename(energies.max(), "max_energy") mean_attended = rename(abs(attended).mean(), "mean_attended") mean_bottom_output = rename( abs(bottom_output).mean(), "mean_bottom_output") weights_penalty = rename(monotonicity_penalty(weights, labels_mask), "weights_penalty") weights_entropy = rename(entropy(weights, labels_mask), "weights_entropy") mask_density = rename(labels_mask.mean(), "mask_density") cg = ComputationGraph([ cost, weights_penalty, weights_entropy, min_energy, max_energy, mean_attended, mean_bottom_output, batch_size, max_num_phonemes, mask_density ]) # Regularization. It is applied explicitly to all variables # of interest, it could not be applied to the cost only as it # would not have effect on auxiliary variables, see Blocks #514. reg_config = config.get('regularization', dict()) regularized_cg = cg if reg_config.get('dropout'): logger.info('apply dropout') regularized_cg = apply_dropout(cg, [bottom_output], 0.5) if reg_config.get('noise'): logger.info('apply noise') noise_subjects = [ p for p in cg.parameters if p not in attention_params ] regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise']) train_cost = regularized_cg.outputs[0] if reg_config.get("penalty_coof", .0) > 0: # big warning!!!
# here we assume that: # regularized_weights_penalty = regularized_cg.outputs[1] train_cost = (train_cost + reg_config.get("penalty_coof", .0) * regularized_cg.outputs[1] / batch_size) if reg_config.get("decay", .0) > 0: train_cost = ( train_cost + reg_config.get("decay", .0) * l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters))**2) train_cost = rename(train_cost, 'train_cost') gradients = None if reg_config.get('adaptive_noise'): logger.info('apply adaptive noise') if ((reg_config.get("penalty_coof", .0) > 0) or (reg_config.get("decay", .0) > 0)): logger.error('using adaptive noise with alignment weight penalty ' 'or weight decay is probably stupid') train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise( cg, cg.outputs[0], variables=cg.parameters, num_examples=data.get_dataset('train').num_examples, parameters=Model( regularized_cg.outputs[0]).get_parameter_dict().values(), **reg_config.get('adaptive_noise')) train_cost.name = 'train_cost' adapt_noise_cg = ComputationGraph(train_cost) model_prior_mean = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_mean')(adapt_noise_cg)[0], 'model_prior_mean') model_cost = rename( VariableFilter(applications=[noise_brick.apply], name='model_cost')(adapt_noise_cg)[0], 'model_cost') model_prior_variance = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_variance')(adapt_noise_cg)[0], 'model_prior_variance') regularized_cg = ComputationGraph( [train_cost, model_cost] + regularized_cg.outputs + [model_prior_mean, model_prior_variance]) primary_observables += [ regularized_cg.outputs[1], # model cost regularized_cg.outputs[2], # task cost regularized_cg.outputs[-2], # model prior mean regularized_cg.outputs[-1] ] # model prior variance model = Model(train_cost) if params: logger.info("Load parameters from " + params) # please note: we cannot use recognizer.load_params # as it builds a new computation graph that does not have # shared variables added by adaptive weight noise with open(params, 'r') as src: param_values = load_parameters(src) model.set_parameter_values(param_values) parameters = model.get_parameter_dict() logger.info("Parameters:\n" + pprint.pformat([(key, parameters[key].get_value().shape) for key in sorted(parameters.keys())], width=120)) # Define the training algorithm.
clipping = StepClipping(train_conf['gradient_threshold']) clipping.threshold.name = "gradient_norm_threshold" rule_names = train_conf.get('rules', ['momentum']) core_rules = [] if 'momentum' in rule_names: logger.info("Using scaling and momentum for training") core_rules.append(Momentum(train_conf['scale'], train_conf['momentum'])) if 'adadelta' in rule_names: logger.info("Using AdaDelta for training") core_rules.append( AdaDelta(train_conf['decay_rate'], train_conf['epsilon'])) max_norm_rules = [] if reg_config.get('max_norm', False) > 0: logger.info("Apply MaxNorm") maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters) if reg_config.get('max_norm_exclude_lookup', False): maxnorm_subjects = [ v for v in maxnorm_subjects if not isinstance(get_brick(v), LookupTable) ] logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat( [name for name, p in parameters.items() if p in maxnorm_subjects])) logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([ name for name, p in parameters.items() if not p in maxnorm_subjects ])) max_norm_rules = [ Restrict(VariableClipping(reg_config['max_norm'], axis=0), maxnorm_subjects) ] burn_in = [] if train_conf.get('burn_in_steps', 0): burn_in.append(BurnIn(num_steps=train_conf['burn_in_steps'])) algorithm = GradientDescent( cost=train_cost, parameters=parameters.values(), gradients=gradients, step_rule=CompositeRule( [clipping] + core_rules + max_norm_rules + # Parameters are not changed at all # when nans are encountered. [RemoveNotFinite(0.0)] + burn_in), on_unused_sources='warn') logger.debug("Scan Ops in the gradients") gradient_cg = ComputationGraph(algorithm.gradients.values()) for op in ComputationGraph(gradient_cg).scans: logger.debug(op) # More variables for debugging: some of them can be added only # after the `algorithm` object is created. secondary_observables += list(regularized_cg.outputs) if not 'train_cost' in [v.name for v in secondary_observables]: secondary_observables += [train_cost] secondary_observables += [ algorithm.total_step_norm, algorithm.total_gradient_norm, clipping.threshold ] for name, param in parameters.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements**0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements**0.5 step_norm = algorithm.steps[param].norm(2) / num_elements**0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' secondary_observables.append(stats) primary_observables += [ train_cost, algorithm.total_gradient_norm, algorithm.total_step_norm, clipping.threshold, max_recording_length, max_attended_length, max_attended_mask_length ] validation_observables += [ rename(aggregation.mean(batch_cost, batch_size), cost.name), rename(aggregation.sum_(batch_size), 'num_utterances'), weights_entropy, weights_penalty ] def attach_aggregation_schemes(variables): # Aggregation specification has to be factored out as a separate # function as it has to be applied at the very last stage # separately to training and validation observables. result = [] for var in variables: if var.name == 'weights_penalty': result.append( rename(aggregation.mean(var, batch_size), 'weights_penalty_per_recording')) elif var.name == 'weights_entropy': result.append( rename(aggregation.mean(var, labels_mask.sum()), 'weights_entropy_per_label')) else: result.append(var) return result mon_conf = config['monitoring'] # Build main loop. 
logger.info("Initialize extensions") extensions = [] if use_load_ext and params: extensions.append( Load(params, load_iteration_state=True, load_log=True)) if load_log and params: extensions.append(LoadLog(params)) extensions += [ Timing(after_batch=True), CGStatistics(), #CodeVersion(['lvsr']), ] extensions.append( TrainingDataMonitoring(primary_observables + [l2_cost, cost_forward], after_batch=True)) average_monitoring = TrainingDataMonitoring( attach_aggregation_schemes(secondary_observables), prefix="average", every_n_batches=10) extensions.append(average_monitoring) validation = DataStreamMonitoring( attach_aggregation_schemes(validation_observables + [l2_cost, cost_forward]), data.get_stream("valid", shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['validate_every_epochs'], every_n_batches=mon_conf['validate_every_batches'], after_training=False) extensions.append(validation) per = PhonemeErrorRate(recognizer, data, **config['monitoring']['search']) per_monitoring = DataStreamMonitoring( [per], data.get_stream("valid", batches=False, shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['search_every_epochs'], every_n_batches=mon_conf['search_every_batches'], after_training=False) extensions.append(per_monitoring) track_the_best_per = TrackTheBest( per_monitoring.record_name(per)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_cost = TrackTheBest( validation.record_name(cost)).set_conditions(before_first_epoch=True, after_epoch=True) extensions += [track_the_best_cost, track_the_best_per] extensions.append( AdaptiveClipping(algorithm.total_gradient_norm.name, clipping, train_conf['gradient_threshold'], decay_rate=0.998, burnin_period=500)) extensions += [ SwitchOffLengthFilter( data.length_filter, after_n_batches=train_conf.get('stop_filtering')), FinishAfter(after_n_batches=train_conf.get('num_batches'), after_n_epochs=train_conf.get('num_epochs')).add_condition( ["after_batch"], _gradient_norm_is_none), ] channels = [ # Plot 1: training and validation costs [ average_monitoring.record_name(train_cost), validation.record_name(cost) ], # Plot 2: gradient norm, [ average_monitoring.record_name(algorithm.total_gradient_norm), average_monitoring.record_name(clipping.threshold) ], # Plot 3: phoneme error rate [per_monitoring.record_name(per)], # Plot 4: training and validation mean weight entropy [ average_monitoring._record_name('weights_entropy_per_label'), validation._record_name('weights_entropy_per_label') ], # Plot 5: training and validation monotonicity penalty [ average_monitoring._record_name('weights_penalty_per_recording'), validation._record_name('weights_penalty_per_recording') ] ] if bokeh: extensions += [ Plot(bokeh_name if bokeh_name else os.path.basename(save_path), channels, every_n_batches=10, server_url=bokeh_server), ] extensions += [ Checkpoint(save_path, before_first_epoch=not fast_start, after_epoch=True, every_n_batches=train_conf.get('save_every_n_batches'), save_separately=["model", "log"], use_cpickle=True).add_condition( ['after_epoch'], OnLogRecord(track_the_best_per.notification_name), (root_path + "_best" + extension, )).add_condition( ['after_epoch'], OnLogRecord(track_the_best_cost.notification_name), (root_path + "_best_ll" + extension, )), ProgressBar() ] extensions.append(EmbedIPython(use_main_loop_run_caller_env=True)) if config['net']['criterion']['name'].startswith('mse'): extensions.append( 
LogInputsGains(labels, cg, recognizer.generator.readout.emitter, data)) if train_conf.get('patience'): patience_conf = train_conf['patience'] if not patience_conf.get('notification_names'): # setdefault will not work for empty list patience_conf['notification_names'] = [ track_the_best_per.notification_name, track_the_best_cost.notification_name ] extensions.append(Patience(**patience_conf)) extensions.append( Printing(every_n_batches=1, attribute_filter=PrintingFilterList())) return model, algorithm, data, extensions
def __init__(self): srng = MRG_RandomStreams(seed=123) X = T.matrix('features') self.X = X #drop = Dropout(p_drop=0.5) #o = drop.apply(X) o = X self.noisy = o #n_hidden = 64 n_hidden = 128 n_zs = 2 self.n_zs = n_zs self.n_hidden = n_hidden l = Linear(input_dim=28 * 28, output_dim=n_hidden, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) l.initialize() o = l.apply(o) o = Tanh().apply(o) l = Linear(input_dim=n_hidden, output_dim=n_hidden, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) l.initialize() o = l.apply(o) o = Tanh().apply(o) l = Linear(input_dim=n_hidden, output_dim=n_zs, weights_init=IsotropicGaussian(.101), biases_init=Constant(0)) l.initialize() mu_encoder = l.apply(o) l = Linear(input_dim=n_hidden, output_dim=n_zs, weights_init=IsotropicGaussian(0.1), biases_init=Constant(0)) l.initialize() log_sigma_encoder = l.apply(o) eps = srng.normal(log_sigma_encoder.shape) z = eps * T.exp(log_sigma_encoder) + mu_encoder z_to_h1_decode = Linear(input_dim=n_zs, output_dim=n_hidden, weights_init=IsotropicGaussian(0.1), biases_init=Constant(0)) z_to_h1_decode.initialize() h1_decode_to_h_decode = Linear(input_dim=n_hidden, output_dim=n_hidden, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) h1_decode_to_h_decode.initialize() #o = z_to_h_decode.apply(z) #h_decoder = Tanh().apply(o) h_decode_produce = Linear(input_dim=n_hidden, output_dim=28 * 28, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), name="linear4") h_decode_produce.initialize() #o = h_decode_produce.apply(h_decoder) #self.produced = Sigmoid().apply(o) seq = Sequence([ z_to_h1_decode.apply, Tanh().apply, h1_decode_to_h_decode.apply, Tanh().apply, h_decode_produce.apply, Sigmoid().apply ]) seq.initialize() self.produced = seq.apply(z) self.cost = T.sum(T.sqr(self.produced - X)) #regular old mean squared #self.cost = T.sum(T.nnet.binary_crossentropy(self.produced, X)) #T.sum(T.sqr(self.produced - X)) self.cost.name = "cost" # Computed with L = 1, only one sample of produced. logpxz = T.sum(-1 * log_sigma_encoder * T.log(2 * np.pi) - T.sqr((self.produced - X) / (2 * T.exp(log_sigma_encoder)))) self.variational_cost = - 0.5 * T.sum(1 + 2*log_sigma_encoder - mu_encoder * mu_encoder\ - T.exp(2 * log_sigma_encoder)) + logpxz self.variational_cost.name = "variational_cost" self.Z = T.matrix('z') self.sampled = seq.apply(self.Z) cg = ComputationGraph([self.variational_cost]) bricks = [ get_brick(var) for var in cg.variables + cg.scan_variables if get_brick(var) ] for i, b in enumerate(bricks): b.name = b.name + "_" + str(i)
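# --- Added illustrative sketch (not part of the original code above) ---
# The encoder above uses the reparameterization trick, z = mu + eps * sigma,
# and the closed-form KL term -0.5 * sum(1 + 2*log_sigma - mu**2 - sigma**2)
# for KL(N(mu, sigma^2) || N(0, 1)).  A small numpy check that this closed
# form agrees with a Monte-Carlo estimate for a single latent dimension:
import numpy as np

mu, log_sigma = 0.7, -0.3
sigma = np.exp(log_sigma)
closed_form = -0.5 * (1 + 2 * log_sigma - mu ** 2 - sigma ** 2)

eps = np.random.randn(1000000)
z = mu + eps * sigma                                    # reparameterized samples from q(z)
log_q = -0.5 * np.log(2 * np.pi) - log_sigma - 0.5 * ((z - mu) / sigma) ** 2
log_p = -0.5 * np.log(2 * np.pi) - 0.5 * z ** 2
monte_carlo = np.mean(log_q - log_p)                    # E_q[log q(z) - log p(z)]

print(closed_form, monte_carlo)                         # the two should agree closely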
def main(save_to, num_epochs, weight_decay=0.0001, noise_pressure=0, subset=None, num_batches=None, batch_size=None, histogram=None, resume=False): output_size = 10 prior_noise_level = -10 noise_step_rule = Scale(1e-6) noise_rate = theano.shared(numpy.asarray(1e-5, dtype=theano.config.floatX)) convnet = create_res_net(out_noise=True, tied_noise=True, tied_sigma=True, noise_rate=noise_rate, prior_noise_level=prior_noise_level) x = tensor.tensor4('features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet test_probs = convnet.apply(x) test_cost = (CategoricalCrossEntropy().apply(y.flatten(), test_probs) .copy(name='cost')) test_error_rate = (MisclassificationRate().apply(y.flatten(), test_probs) .copy(name='error_rate')) test_confusion = (ConfusionMatrix().apply(y.flatten(), test_probs) .copy(name='confusion')) test_confusion.tag.aggregation_scheme = Sum(test_confusion) test_cg = ComputationGraph([test_cost, test_error_rate]) # Apply dropout to all layer outputs except final softmax # dropout_vars = VariableFilter( # roles=[OUTPUT], bricks=[Convolutional], # theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables) # drop_cg = apply_dropout(test_cg, dropout_vars, 0.5) # Apply 0.2 dropout to the pre-averaging layer # dropout_vars_2 = VariableFilter( # roles=[OUTPUT], bricks=[Convolutional], # theano_name_regex="^conv_8_apply_output$")(test_cg.variables) # train_cg = apply_dropout(test_cg, dropout_vars_2, 0.2) # Apply 0.2 dropout to the input, as in the paper # train_cg = apply_dropout(test_cg, [x], 0.2) # train_cg = drop_cg # train_cg = apply_batch_normalization(test_cg) # train_cost, train_error_rate, train_components = train_cg.outputs with batch_normalization(convnet): with training_noise(convnet): train_probs = convnet.apply(x) train_cost = (CategoricalCrossEntropy().apply(y.flatten(), train_probs) .copy(name='cost')) train_components = (ComponentwiseCrossEntropy().apply(y.flatten(), train_probs).copy(name='components')) train_error_rate = (MisclassificationRate().apply(y.flatten(), train_probs).copy(name='error_rate')) train_cg = ComputationGraph([train_cost, train_error_rate, train_components]) population_updates = get_batch_normalization_updates(train_cg) bn_alpha = 0.9 extra_updates = [(p, p * bn_alpha + m * (1 - bn_alpha)) for p, m in population_updates] # for annealing nit_penalty = theano.shared(numpy.asarray(noise_pressure, dtype=theano.config.floatX)) nit_penalty.name = 'nit_penalty' # Compute noise rates for training graph train_logsigma = VariableFilter(roles=[LOG_SIGMA])(train_cg.variables) train_mean_log_sigma = tensor.concatenate([n.flatten() for n in train_logsigma]).mean() train_mean_log_sigma.name = 'mean_log_sigma' train_nits = VariableFilter(roles=[NITS])(train_cg.auxiliary_variables) train_nit_rate = tensor.concatenate([n.flatten() for n in train_nits]).mean() train_nit_rate.name = 'nit_rate' train_nit_regularization = nit_penalty * train_nit_rate train_nit_regularization.name = 'nit_regularization' # Apply regularization to the cost trainable_parameters = VariableFilter(roles=[WEIGHT, BIAS])( train_cg.parameters) mask_parameters = [p for p in trainable_parameters if get_brick(p).name == 'mask'] noise_parameters = VariableFilter(roles=[NOISE])(train_cg.parameters) biases = VariableFilter(roles=[BIAS])(train_cg.parameters) weights = VariableFilter(roles=[WEIGHT])(train_cg.variables) nonmask_weights = [p for p in weights if get_brick(p).name != 'mask'] l2_norm = sum([(W ** 2).sum() for W in nonmask_weights]) l2_norm.name = 'l2_norm' 
l2_regularization = weight_decay * l2_norm l2_regularization.name = 'l2_regularization' # testversion test_cost = test_cost + l2_regularization test_cost.name = 'cost_with_regularization' # Training version of cost train_cost_without_regularization = train_cost train_cost_without_regularization.name = 'cost_without_regularization' train_cost = train_cost + l2_regularization + train_nit_regularization train_cost.name = 'cost_with_regularization' cifar10_train = CIFAR10(("train",)) cifar10_train_stream = RandomPadCropFlip( NormalizeBatchLevels(DataStream.default_stream( cifar10_train, iteration_scheme=ShuffledScheme( cifar10_train.num_examples, batch_size)), which_sources=('features',)), (32, 32), pad=4, which_sources=('features',)) test_batch_size = 128 cifar10_test = CIFAR10(("test",)) cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream( cifar10_test, iteration_scheme=ShuffledScheme( cifar10_test.num_examples, test_batch_size)), which_sources=('features',)) momentum = Momentum(0.01, 0.9) # Create a step rule that doubles the learning rate of biases, like Caffe. # scale_bias = Restrict(Scale(2), biases) # step_rule = CompositeRule([scale_bias, momentum]) # Create a step rule that reduces the learning rate of noise scale_mask = Restrict(noise_step_rule, mask_parameters) step_rule = CompositeRule([scale_mask, momentum]) # from theano.compile.nanguardmode import NanGuardMode # Train with simple SGD algorithm = GradientDescent( cost=train_cost, parameters=trainable_parameters, step_rule=step_rule) algorithm.add_updates(extra_updates) #, # theano_func_kwargs={ # 'mode': NanGuardMode( # nan_is_error=True, inf_is_error=True, big_is_error=True)}) exp_name = save_to.replace('.%d', '') # `Timing` extension reports time for reading data, aggregating a batch # and monitoring; # `ProgressBar` displays a nice progress bar during training. 
extensions = [Timing(), FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches), EpochSchedule(momentum.learning_rate, [ (0, 0.01), # Warm up with 0.01 learning rate (50, 0.1), # Then go back to 0.1 (100, 0.01), (150, 0.001) # (83, 0.01), # Follow the schedule in the paper # (125, 0.001) ]), EpochSchedule(noise_step_rule.learning_rate, [ (0, 1e-2), (2, 1e-1), (4, 1) # (0, 1e-6), # (2, 1e-5), # (4, 1e-4) ]), EpochSchedule(noise_rate, [ (0, 1e-2), (2, 1e-1), (4, 1) # (0, 1e-6), # (2, 1e-5), # (4, 1e-4), # (6, 3e-4), # (8, 1e-3), # Causes nit rate to jump # (10, 3e-3), # (12, 1e-2), # (15, 3e-2), # (19, 1e-1), # (24, 3e-1), # (30, 1) ]), NoiseExtension( noise_parameters=noise_parameters), NoisyDataStreamMonitoring( [test_cost, test_error_rate, test_confusion], cifar10_test_stream, noise_parameters=noise_parameters, prefix="test"), TrainingDataMonitoring( [train_cost, train_error_rate, train_nit_rate, train_cost_without_regularization, l2_regularization, train_nit_regularization, momentum.learning_rate, train_mean_log_sigma, aggregation.mean(algorithm.total_gradient_norm)], prefix="train", every_n_batches=17), # after_epoch=True), Plot('Training performance for ' + exp_name, channels=[ ['train_cost_with_regularization', 'train_cost_without_regularization', 'train_nit_regularization', 'train_l2_regularization'], ['train_error_rate'], ['train_total_gradient_norm'], ['train_mean_log_sigma'], ], every_n_batches=17), Plot('Test performance for ' + exp_name, channels=[[ 'train_error_rate', 'test_error_rate', ]], after_epoch=True), EpochCheckpoint(save_to, use_cpickle=True, after_epoch=True), ProgressBar(), Printing()] if histogram: attribution = AttributionExtension( components=train_components, parameters=cg.parameters, components_size=output_size, after_batch=True) extensions.insert(0, attribution) if resume: extensions.append(Load(exp_name, True, True)) model = Model(train_cost) main_loop = MainLoop( algorithm, cifar10_train_stream, model=model, extensions=extensions) main_loop.run() if histogram: save_attributions(attribution, filename=histogram) with open('execution-log.json', 'w') as outfile: json.dump(main_loop.log, outfile, cls=NumpyEncoder)
def __init__(self, save_to): batch_size = 500 image_size = (28, 28) output_size = 10 convnet = create_lenet_5() layers = convnet.layers logging.info("Input dim: {} {} {}".format( *convnet.children[0].get_dim('input_'))) for i, layer in enumerate(convnet.layers): if isinstance(layer, Activation): logging.info("Layer {} ({})".format( i, layer.__class__.__name__)) else: logging.info("Layer {} ({}) dim: {} {} {}".format( i, layer.__class__.__name__, *layer.get_dim('output'))) mnist_test = MNIST(("test",), sources=['features', 'targets']) basis = create_fair_basis(mnist_test, 10, 10) x = tensor.tensor4('features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet probs = convnet.apply(x) cg = ComputationGraph([probs]) def full_brick_name(brick): return '/'.join([''] + [b.name for b in brick.get_unique_path()]) # Find layer outputs to probe outs = OrderedDict((full_brick_name(get_brick(out)), out) for out in VariableFilter( roles=[OUTPUT], bricks=[Convolutional, Linear])( cg.variables)) # Normalize input and apply the convnet error_rate = (MisclassificationRate().apply(y.flatten(), probs) .copy(name='error_rate')) confusion = (ConfusionMatrix().apply(y.flatten(), probs) .copy(name='confusion')) confusion.tag.aggregation_scheme = Sum(confusion) confusion_image = (ConfusionImage().apply(y.flatten(), probs, x) .copy(name='confusion_image')) confusion_image.tag.aggregation_scheme = Sum(confusion_image) model = Model( [error_rate, confusion, confusion_image] + list(outs.values())) # Load it with trained parameters params = load_parameters(open(save_to, 'rb')) model.set_parameter_values(params) mnist_test = MNIST(("test",)) mnist_test_stream = DataStream.default_stream( mnist_test, iteration_scheme=SequentialScheme( mnist_test.num_examples, batch_size)) self.model = model self.mnist_test_stream = mnist_test_stream self.evaluator = DatasetEvaluator( [error_rate, confusion, confusion_image]) self.base_results = self.evaluator.evaluate(mnist_test_stream) # TODO: allow target layer to be parameterized self.target_layer = '/lenet/mlp/linear_0' self.next_layer_param = '/lenet/mlp/linear_1.W' self.base_sample = extract_sample( outs[self.target_layer], mnist_test_stream) self.base_param_value = ( model.get_parameter_dict()[ self.next_layer_param].get_value().copy())
o = Softmax().apply(o) Y = T.imatrix(name="targets") cost = CategoricalCrossEntropy().apply(Y.flatten(), o) cost.name = "cost" miss_class = 1.0 - MisclassificationRate().apply(Y.flatten(), o) miss_class.name = "accuracy" cg = ComputationGraph(cost) print(cg.shared_variables) bricks = [get_brick(var) for var in cg.variables if get_brick(var)] for i, b in enumerate(bricks): b.name += str(i) step_rule = AdaM() algorithm = GradientDescent(cost=cost, step_rule=step_rule) print("Loading data") mnist_train = MNIST("train") train_stream = DataStream( dataset=mnist_train, iteration_scheme=SequentialScheme(num_examples=mnist_train.num_examples, batch_size=128)) #iteration_scheme= SequentialScheme(num_examples= 1000, batch_size= 128)) mnist_test = MNIST("test") test_stream = DataStream(
def initialize_all(config, save_path, bokeh_name, params, bokeh_server, bokeh, test_tag, use_load_ext, load_log, fast_start): root_path, extension = os.path.splitext(save_path) data = Data(**config['data']) train_conf = config['training'] recognizer = create_model(config, data, test_tag) # Separate attention_params to be handled differently # when regularization is applied attention = recognizer.generator.transition.attention attention_params = Selector(attention).get_parameters().values() logger.info( "Initialization schemes for all bricks.\n" "Works well only in my branch with __repr__ added to all them,\n" "there is an issue #463 in Blocks to do that properly.") def show_init_scheme(cur): result = dict() for attr in dir(cur): if attr.endswith('_init'): result[attr] = getattr(cur, attr) for child in cur.children: result[child.name] = show_init_scheme(child) return result logger.info(pprint.pformat(show_init_scheme(recognizer))) prediction, prediction_mask = add_exploration(recognizer, data, train_conf) # # Observables: # primary_observables = [] # monitored each batch secondary_observables = [] # monitored every 10 batches validation_observables = [] # monitored on the validation set cg = recognizer.get_cost_graph( batch=True, prediction=prediction, prediction_mask=prediction_mask) labels, = VariableFilter( applications=[recognizer.cost], name='labels')(cg) labels_mask, = VariableFilter( applications=[recognizer.cost], name='labels_mask')(cg) gain_matrix = VariableFilter( theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg) if len(gain_matrix): gain_matrix, = gain_matrix primary_observables.append( rename(gain_matrix.min(), 'min_gain')) primary_observables.append( rename(gain_matrix.max(), 'max_gain')) batch_cost = cg.outputs[0].sum() batch_size = rename(recognizer.labels.shape[1], "batch_size") # Assumes constant batch size. `aggregation.mean` is not used because # of Blocks #514. cost = batch_cost / batch_size cost.name = "sequence_total_cost" logger.info("Cost graph is built") # Fetch variables useful for debugging. # It is important not to use any aggregation schemes here, # as it's currently impossible to spread the effect of # regularization on their variables, see Blocks #514. 
cost_cg = ComputationGraph(cost) r = recognizer energies, = VariableFilter( applications=[r.generator.readout.readout], name="output_0")( cost_cg) bottom_output = VariableFilter( # We need name_regex instead of name because LookupTable calls its output output_0 applications=[r.bottom.apply], name_regex="output")( cost_cg)[-1] attended, = VariableFilter( applications=[r.generator.transition.apply], name="attended")( cost_cg) attended_mask, = VariableFilter( applications=[r.generator.transition.apply], name="attended_mask")( cost_cg) weights, = VariableFilter( applications=[r.generator.evaluate], name="weights")( cost_cg) max_recording_length = rename(bottom_output.shape[0], "max_recording_length") # To exclude subsampling related bugs max_attended_mask_length = rename(attended_mask.shape[0], "max_attended_mask_length") max_attended_length = rename(attended.shape[0], "max_attended_length") max_num_phonemes = rename(labels.shape[0], "max_num_phonemes") min_energy = rename(energies.min(), "min_energy") max_energy = rename(energies.max(), "max_energy") mean_attended = rename(abs(attended).mean(), "mean_attended") mean_bottom_output = rename(abs(bottom_output).mean(), "mean_bottom_output") weights_penalty = rename(monotonicity_penalty(weights, labels_mask), "weights_penalty") weights_entropy = rename(entropy(weights, labels_mask), "weights_entropy") mask_density = rename(labels_mask.mean(), "mask_density") cg = ComputationGraph([ cost, weights_penalty, weights_entropy, min_energy, max_energy, mean_attended, mean_bottom_output, batch_size, max_num_phonemes, mask_density]) # Regularization. It is applied explicitly to all variables # of interest, it could not be applied to the cost only as it # would not have effect on auxiliary variables, see Blocks #514. reg_config = config.get('regularization', dict()) regularized_cg = cg if reg_config.get('dropout'): logger.info('apply dropout') regularized_cg = apply_dropout(cg, [bottom_output], 0.5) if reg_config.get('noise'): logger.info('apply noise') noise_subjects = [p for p in cg.parameters if p not in attention_params] regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise']) train_cost = regularized_cg.outputs[0] if reg_config.get("penalty_coof", .0) > 0: # big warning!!!
# here we assume that: # regularized_weights_penalty = regularized_cg.outputs[1] train_cost = (train_cost + reg_config.get("penalty_coof", .0) * regularized_cg.outputs[1] / batch_size) if reg_config.get("decay", .0) > 0: train_cost = (train_cost + reg_config.get("decay", .0) * l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2) train_cost = rename(train_cost, 'train_cost') gradients = None if reg_config.get('adaptive_noise'): logger.info('apply adaptive noise') if ((reg_config.get("penalty_coof", .0) > 0) or (reg_config.get("decay", .0) > 0)): logger.error('using adaptive noise with alignment weight penalty ' 'or weight decay is probably stupid') train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise( cg, cg.outputs[0], variables=cg.parameters, num_examples=data.get_dataset('train').num_examples, parameters=Model(regularized_cg.outputs[0]).get_parameter_dict().values(), **reg_config.get('adaptive_noise') ) train_cost.name = 'train_cost' adapt_noise_cg = ComputationGraph(train_cost) model_prior_mean = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_mean')(adapt_noise_cg)[0], 'model_prior_mean') model_cost = rename( VariableFilter(applications=[noise_brick.apply], name='model_cost')(adapt_noise_cg)[0], 'model_cost') model_prior_variance = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_variance')(adapt_noise_cg)[0], 'model_prior_variance') regularized_cg = ComputationGraph( [train_cost, model_cost] + regularized_cg.outputs + [model_prior_mean, model_prior_variance]) primary_observables += [ regularized_cg.outputs[1], # model cost regularized_cg.outputs[2], # task cost regularized_cg.outputs[-2], # model prior mean regularized_cg.outputs[-1]] # model prior variance model = Model(train_cost) if params: logger.info("Load parameters from " + params) # please note: we cannot use recognizer.load_params # as it builds a new computation graph that does not have # shared variables added by adaptive weight noise with open(params, 'r') as src: param_values = load_parameters(src) model.set_parameter_values(param_values) parameters = model.get_parameter_dict() logger.info("Parameters:\n" + pprint.pformat( [(key, parameters[key].get_value().shape) for key in sorted(parameters.keys())], width=120)) # Define the training algorithm.
clipping = StepClipping(train_conf['gradient_threshold']) clipping.threshold.name = "gradient_norm_threshold" rule_names = train_conf.get('rules', ['momentum']) core_rules = [] if 'momentum' in rule_names: logger.info("Using scaling and momentum for training") core_rules.append(Momentum(train_conf['scale'], train_conf['momentum'])) if 'adadelta' in rule_names: logger.info("Using AdaDelta for training") core_rules.append(AdaDelta(train_conf['decay_rate'], train_conf['epsilon'])) max_norm_rules = [] if reg_config.get('max_norm', False) > 0: logger.info("Apply MaxNorm") maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters) if reg_config.get('max_norm_exclude_lookup', False): maxnorm_subjects = [v for v in maxnorm_subjects if not isinstance(get_brick(v), LookupTable)] logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat([name for name, p in parameters.items() if p in maxnorm_subjects])) logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([name for name, p in parameters.items() if not p in maxnorm_subjects])) max_norm_rules = [ Restrict(VariableClipping(reg_config['max_norm'], axis=0), maxnorm_subjects)] burn_in = [] if train_conf.get('burn_in_steps', 0): burn_in.append( BurnIn(num_steps=train_conf['burn_in_steps'])) algorithm = GradientDescent( cost=train_cost, parameters=parameters.values(), gradients=gradients, step_rule=CompositeRule( [clipping] + core_rules + max_norm_rules + # Parameters are not changed at all # when nans are encountered. [RemoveNotFinite(0.0)] + burn_in), on_unused_sources='warn') logger.debug("Scan Ops in the gradients") gradient_cg = ComputationGraph(algorithm.gradients.values()) for op in ComputationGraph(gradient_cg).scans: logger.debug(op) # More variables for debugging: some of them can be added only # after the `algorithm` object is created. secondary_observables += list(regularized_cg.outputs) if not 'train_cost' in [v.name for v in secondary_observables]: secondary_observables += [train_cost] secondary_observables += [ algorithm.total_step_norm, algorithm.total_gradient_norm, clipping.threshold] for name, param in parameters.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements ** 0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5 step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' secondary_observables.append(stats) primary_observables += [ train_cost, algorithm.total_gradient_norm, algorithm.total_step_norm, clipping.threshold, max_recording_length, max_attended_length, max_attended_mask_length] validation_observables += [ rename(aggregation.mean(batch_cost, batch_size), cost.name), rename(aggregation.sum_(batch_size), 'num_utterances'), weights_entropy, weights_penalty] def attach_aggregation_schemes(variables): # Aggregation specification has to be factored out as a separate # function as it has to be applied at the very last stage # separately to training and validation observables. result = [] for var in variables: if var.name == 'weights_penalty': result.append(rename(aggregation.mean(var, batch_size), 'weights_penalty_per_recording')) elif var.name == 'weights_entropy': result.append(rename(aggregation.mean(var, labels_mask.sum()), 'weights_entropy_per_label')) else: result.append(var) return result mon_conf = config['monitoring'] # Build main loop. 
logger.info("Initialize extensions") extensions = [] if use_load_ext and params: extensions.append(Load(params, load_iteration_state=True, load_log=True)) if load_log and params: extensions.append(LoadLog(params)) extensions += [ Timing(after_batch=True), CGStatistics(), #CodeVersion(['lvsr']), ] extensions.append(TrainingDataMonitoring( primary_observables, after_batch=True)) average_monitoring = TrainingDataMonitoring( attach_aggregation_schemes(secondary_observables), prefix="average", every_n_batches=10) extensions.append(average_monitoring) validation = DataStreamMonitoring( attach_aggregation_schemes(validation_observables), data.get_stream("valid", shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['validate_every_epochs'], every_n_batches=mon_conf['validate_every_batches'], after_training=False) extensions.append(validation) per = PhonemeErrorRate(recognizer, data, **config['monitoring']['search']) per_monitoring = DataStreamMonitoring( [per], data.get_stream("valid", batches=False, shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['search_every_epochs'], every_n_batches=mon_conf['search_every_batches'], after_training=False) extensions.append(per_monitoring) track_the_best_per = TrackTheBest( per_monitoring.record_name(per)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_cost = TrackTheBest( validation.record_name(cost)).set_conditions( before_first_epoch=True, after_epoch=True) extensions += [track_the_best_cost, track_the_best_per] extensions.append(AdaptiveClipping( algorithm.total_gradient_norm.name, clipping, train_conf['gradient_threshold'], decay_rate=0.998, burnin_period=500)) extensions += [ SwitchOffLengthFilter( data.length_filter, after_n_batches=train_conf.get('stop_filtering')), FinishAfter(after_n_batches=train_conf.get('num_batches'), after_n_epochs=train_conf.get('num_epochs')) .add_condition(["after_batch"], _gradient_norm_is_none), ] channels = [ # Plot 1: training and validation costs [average_monitoring.record_name(train_cost), validation.record_name(cost)], # Plot 2: gradient norm, [average_monitoring.record_name(algorithm.total_gradient_norm), average_monitoring.record_name(clipping.threshold)], # Plot 3: phoneme error rate [per_monitoring.record_name(per)], # Plot 4: training and validation mean weight entropy [average_monitoring._record_name('weights_entropy_per_label'), validation._record_name('weights_entropy_per_label')], # Plot 5: training and validation monotonicity penalty [average_monitoring._record_name('weights_penalty_per_recording'), validation._record_name('weights_penalty_per_recording')]] if bokeh: extensions += [ Plot(bokeh_name if bokeh_name else os.path.basename(save_path), channels, every_n_batches=10, server_url=bokeh_server),] extensions += [ Checkpoint(save_path, before_first_epoch=not fast_start, after_epoch=True, every_n_batches=train_conf.get('save_every_n_batches'), save_separately=["model", "log"], use_cpickle=True) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_per.notification_name), (root_path + "_best" + extension,)) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_cost.notification_name), (root_path + "_best_ll" + extension,)), ProgressBar()] extensions.append(EmbedIPython(use_main_loop_run_caller_env=True)) if config['net']['criterion']['name'].startswith('mse'): extensions.append( LogInputsGains( labels, cg, recognizer.generator.readout.emitter, data)) if 
train_conf.get('patience'): patience_conf = train_conf['patience'] if not patience_conf.get('notification_names'): # setdefault will not work for empty list patience_conf['notification_names'] = [ track_the_best_per.notification_name, track_the_best_cost.notification_name] extensions.append(Patience(**patience_conf)) extensions.append(Printing(every_n_batches=1, attribute_filter=PrintingFilterList())) return model, algorithm, data, extensions
def train_snli_model(new_training_job, config, save_path, params, fast_start, fuel_server, seed, model='simple'): if config['exclude_top_k'] > config['num_input_words'] and config[ 'num_input_words'] > 0: raise Exception("Some words have neither word nor def embedding") c = config logger = configure_logger(name="snli_baseline_training", log_file=os.path.join(save_path, "log.txt")) if not os.path.exists(save_path): logger.info("Start a new job") os.mkdir(save_path) else: logger.info("Continue an existing job") with open(os.path.join(save_path, "cmd.txt"), "w") as f: f.write(" ".join(sys.argv)) # Make data paths nice for path in [ 'dict_path', 'embedding_def_path', 'embedding_path', 'vocab', 'vocab_def', 'vocab_text' ]: if c.get(path, ''): if not os.path.isabs(c[path]): c[path] = os.path.join(fuel.config.data_path[0], c[path]) main_loop_path = os.path.join(save_path, 'main_loop.tar') main_loop_best_val_path = os.path.join(save_path, 'main_loop_best_val.tar') stream_path = os.path.join(save_path, 'stream.pkl') # Save config to save_path json.dump(config, open(os.path.join(save_path, "config.json"), "w")) if model == 'simple': nli_model, data, used_dict, used_retrieval, _ = _initialize_simple_model_and_data( c) elif model == 'esim': nli_model, data, used_dict, used_retrieval, _ = _initialize_esim_model_and_data( c) else: raise NotImplementedError() # Compute cost s1, s2 = T.lmatrix('sentence1'), T.lmatrix('sentence2') if c['dict_path']: assert os.path.exists(c['dict_path']) s1_def_map, s2_def_map = T.lmatrix('sentence1_def_map'), T.lmatrix( 'sentence2_def_map') def_mask = T.fmatrix("def_mask") defs = T.lmatrix("defs") else: s1_def_map, s2_def_map = None, None def_mask = None defs = None s1_mask, s2_mask = T.fmatrix('sentence1_mask'), T.fmatrix('sentence2_mask') y = T.ivector('label') cg = {} for train_phase in [True, False]: # NOTE: Please don't change outputs of cg if train_phase: with batch_normalization(nli_model): pred = nli_model.apply(s1, s1_mask, s2, s2_mask, def_mask=def_mask, defs=defs, s1_def_map=s1_def_map, s2_def_map=s2_def_map, train_phase=train_phase) else: pred = nli_model.apply(s1, s1_mask, s2, s2_mask, def_mask=def_mask, defs=defs, s1_def_map=s1_def_map, s2_def_map=s2_def_map, train_phase=train_phase) cost = CategoricalCrossEntropy().apply(y.flatten(), pred) error_rate = MisclassificationRate().apply(y.flatten(), pred) cg[train_phase] = ComputationGraph([cost, error_rate]) # Weight decay (TODO: Make it less bug prone) if model == 'simple': weights_to_decay = VariableFilter( bricks=[dense for dense, relu, bn in nli_model._mlp], roles=[WEIGHT])(cg[True].variables) weight_decay = np.float32(c['l2']) * sum( (w**2).sum() for w in weights_to_decay) elif model == 'esim': weight_decay = 0.0 else: raise NotImplementedError() final_cost = cg[True].outputs[0] + weight_decay final_cost.name = 'final_cost' # Add updates for population parameters if c.get("bn", True): pop_updates = get_batch_normalization_updates(cg[True]) extra_updates = [(p, m * 0.1 + p * (1 - 0.1)) for p, m in pop_updates] else: pop_updates = [] extra_updates = [] if params: logger.debug("Load parameters from {}".format(params)) with open(params) as src: loaded_params = load_parameters(src) cg[True].set_parameter_values(loaded_params) for param, m in pop_updates: param.set_value(loaded_params[get_brick( param).get_hierarchical_name(param)]) if os.path.exists(os.path.join(save_path, "main_loop.tar")): logger.warning("Manually loading BN stats :(") with open(os.path.join(save_path, "main_loop.tar")) as src: loaded_params = 
load_parameters(src) for param, m in pop_updates: param.set_value( loaded_params[get_brick(param).get_hierarchical_name(param)]) if theano.config.compute_test_value != 'off': test_value_data = next( data.get_stream('train', batch_size=4).get_epoch_iterator()) s1.tag.test_value = test_value_data[0] s1_mask.tag.test_value = test_value_data[1] s2.tag.test_value = test_value_data[2] s2_mask.tag.test_value = test_value_data[3] y.tag.test_value = test_value_data[4] # Freeze embeddings if not c['train_emb']: frozen_params = [ p for E in nli_model.get_embeddings_lookups() for p in E.parameters ] train_params = [p for p in cg[True].parameters] assert len(set(frozen_params) & set(train_params)) > 0 else: frozen_params = [] if not c.get('train_def_emb', 1): frozen_params_def = [ p for E in nli_model.get_def_embeddings_lookups() for p in E.parameters ] train_params = [p for p in cg[True].parameters] assert len(set(frozen_params_def) & set(train_params)) > 0 frozen_params += frozen_params_def train_params = [p for p in cg[True].parameters if p not in frozen_params] train_params_keys = [ get_brick(p).get_hierarchical_name(p) for p in train_params ] # Optimizer algorithm = GradientDescent(cost=final_cost, on_unused_sources='ignore', parameters=train_params, step_rule=Adam(learning_rate=c['lr'])) algorithm.add_updates(extra_updates) m = Model(final_cost) parameters = m.get_parameter_dict() # Blocks version mismatch logger.info("Trainable parameters" + "\n" + pprint.pformat([(key, parameters[key].get_value().shape) for key in sorted(train_params_keys)], width=120)) logger.info("# of parameters {}".format( sum([ np.prod(parameters[key].get_value().shape) for key in sorted(train_params_keys) ]))) ### Monitored args ### train_monitored_vars = [final_cost] + cg[True].outputs monitored_vars = cg[False].outputs val_acc = monitored_vars[1] to_monitor_names = [ 'def_unk_ratio', 's1_merged_input_rootmean2', 's1_def_mean_rootmean2', 's1_gate_rootmean2', 's1_compose_gate_rootmean2' ] for k in to_monitor_names: train_v, valid_v = VariableFilter(name=k)( cg[True]), VariableFilter(name=k)(cg[False]) if len(train_v): logger.info("Adding {} tracking".format(k)) train_monitored_vars.append(train_v[0]) monitored_vars.append(valid_v[0]) else: logger.warning("Didnt find {} in cg".format(k)) if c['monitor_parameters']: for name in train_params_keys: param = parameters[name] num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements grad_norm = algorithm.gradients[param].norm(2) / num_elements step_norm = algorithm.steps[param].norm(2) / num_elements stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' train_monitored_vars.append(stats) regular_training_stream = data.get_stream('train', batch_size=c['batch_size'], seed=seed) if fuel_server: # the port will be configured by the StartFuelServer extension training_stream = ServerDataStream( sources=regular_training_stream.sources, hwm=100, produces_examples=regular_training_stream.produces_examples) else: training_stream = regular_training_stream ### Build extensions ### extensions = [ # Load(main_loop_path, load_iteration_state=True, load_log=True) # .set_conditions(before_training=not new_training_job), StartFuelServer(regular_training_stream, stream_path, hwm=100, script_path=os.path.join( os.path.dirname(__file__), "../bin/start_fuel_server.py"), before_training=fuel_server), Timing(every_n_batches=c['mon_freq']), ProgressBar(), RetrievalPrintStats(retrieval=used_retrieval, 
every_n_batches=c['mon_freq_valid'], before_training=not fast_start), Timestamp(), TrainingDataMonitoring(train_monitored_vars, prefix="train", every_n_batches=c['mon_freq']), ] if c['layout'] == 'snli': validation = DataStreamMonitoring(monitored_vars, data.get_stream('valid', batch_size=14, seed=seed), before_training=not fast_start, on_resumption=True, after_training=True, every_n_batches=c['mon_freq_valid'], prefix='valid') extensions.append(validation) elif c['layout'] == 'mnli': validation = DataStreamMonitoring(monitored_vars, data.get_stream('valid_matched', batch_size=14, seed=seed), every_n_batches=c['mon_freq_valid'], on_resumption=True, after_training=True, prefix='valid_matched') validation_mismatched = DataStreamMonitoring( monitored_vars, data.get_stream('valid_mismatched', batch_size=14, seed=seed), every_n_batches=c['mon_freq_valid'], before_training=not fast_start, on_resumption=True, after_training=True, prefix='valid_mismatched') extensions.extend([validation, validation_mismatched]) else: raise NotImplementedError() # Similarity trackers for embeddings if len(c.get('vocab_def', '')): retrieval_vocab = Vocabulary(c['vocab_def']) else: retrieval_vocab = data.vocab retrieval_all = Retrieval(vocab_text=retrieval_vocab, dictionary=used_dict, max_def_length=c['max_def_length'], exclude_top_k=0, max_def_per_word=c['max_def_per_word']) for name in [ 's1_word_embeddings', 's1_dict_word_embeddings', 's1_translated_word_embeddings' ]: variables = VariableFilter(name=name)(cg[False]) if len(variables): s1_emb = variables[0] logger.info("Adding similarity tracking for " + name) # A bit sloppy about downcast if "dict" in name: embedder = construct_dict_embedder(theano.function( [s1, defs, def_mask, s1_def_map], s1_emb, allow_input_downcast=True), vocab=data.vocab, retrieval=retrieval_all) extensions.append( SimilarityWordEmbeddingEval( embedder=embedder, prefix=name, every_n_batches=c['mon_freq_valid'], before_training=not fast_start)) else: embedder = construct_embedder(theano.function( [s1], s1_emb, allow_input_downcast=True), vocab=data.vocab) extensions.append( SimilarityWordEmbeddingEval( embedder=embedder, prefix=name, every_n_batches=c['mon_freq_valid'], before_training=not fast_start)) track_the_best = TrackTheBest(validation.record_name(val_acc), before_training=not fast_start, every_n_epochs=c['save_freq_epochs'], after_training=not fast_start, every_n_batches=c['mon_freq_valid'], choose_best=min) extensions.append(track_the_best) # Special care for serializing embeddings if len(c.get('embedding_path', '')) or len(c.get('embedding_def_path', '')): extensions.insert( 0, LoadNoUnpickling(main_loop_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job)) extensions.append( Checkpoint(main_loop_path, parameters=train_params + [p for p, m in pop_updates], save_main_loop=False, save_separately=['log', 'iteration_state'], before_training=not fast_start, every_n_epochs=c['save_freq_epochs'], after_training=not fast_start).add_condition( ['after_batch', 'after_epoch'], OnLogRecord(track_the_best.notification_name), (main_loop_best_val_path, ))) else: extensions.insert( 0, Load(main_loop_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job)) extensions.append( Checkpoint(main_loop_path, parameters=cg[True].parameters + [p for p, m in pop_updates], before_training=not fast_start, every_n_epochs=c['save_freq_epochs'], after_training=not fast_start).add_condition( ['after_batch', 
'after_epoch'], OnLogRecord(track_the_best.notification_name), (main_loop_best_val_path, ))) extensions.extend([ DumpCSVSummaries(save_path, every_n_batches=c['mon_freq_valid'], after_training=True), DumpTensorflowSummaries(save_path, after_epoch=True, every_n_batches=c['mon_freq_valid'], after_training=True), Printing(every_n_batches=c['mon_freq_valid']), PrintMessage(msg="save_path={}".format(save_path), every_n_batches=c['mon_freq']), FinishAfter(after_n_batches=c['n_batches']).add_condition( ['after_batch'], OnLogStatusExceed('iterations_done', c['n_batches'])) ]) logger.info(extensions) ### Run training ### if "VISDOM_SERVER" in os.environ: print("Running visdom server") ret = subprocess.Popen([ os.path.join(os.path.dirname(__file__), "../visdom_plotter.py"), "--visdom-server={}".format(os.environ['VISDOM_SERVER']), "--folder={}".format(save_path) ]) time.sleep(0.1) if ret.returncode is not None: raise Exception() atexit.register(lambda: os.kill(ret.pid, signal.SIGINT)) model = Model(cost) for p, m in pop_updates: model._parameter_dict[get_brick(p).get_hierarchical_name(p)] = p main_loop = MainLoop(algorithm, training_stream, model=model, extensions=extensions) assert os.path.exists(save_path) main_loop.run()
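# The embedding-freezing logic in the training script above works purely by
# exclusion: frozen parameters are simply left out of the parameter list handed
# to GradientDescent, so no update is ever computed for them. Below is a
# minimal Theano-only sketch of that freeze-by-exclusion idea; the variable
# names are illustrative and not taken from the script.
import numpy
import theano
import theano.tensor as tensor

floatX = theano.config.floatX
emb = theano.shared(numpy.random.randn(10, 4).astype(floatX), name='emb')  # frozen
W = theano.shared(numpy.random.randn(4, 2).astype(floatX), name='W')       # trainable

idx = tensor.lvector('idx')
cost = tensor.dot(emb[idx], W).sum()

frozen_params = [emb]
train_params = [p for p in [emb, W] if p not in frozen_params]

# Gradients (and hence updates) are taken only w.r.t. the trainable subset,
# so `emb` keeps its initial value no matter how many steps are run.
grads = tensor.grad(cost, train_params)
updates = [(p, p - 0.01 * g) for p, g in zip(train_params, grads)]
step = theano.function([idx], cost, updates=updates)
step(numpy.array([0, 3, 7]))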
def create_main_loop(save_to, num_epochs, unit_order=None, batch_size=500, num_batches=None): image_size = (28, 28) output_size = 10 convnet = create_lenet_5() x = tensor.tensor4('features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet probs = convnet.apply(x) case_costs = CasewiseCrossEntropy().apply(y.flatten(), probs) cost = case_costs.mean().copy(name='cost') # cost = (CategoricalCrossEntropy().apply(y.flatten(), probs) # .copy(name='cost')) error_rate = (MisclassificationRate().apply(y.flatten(), probs) .copy(name='error_rate')) cg = ComputationGraph([cost, error_rate]) # Apply regularization to the cost weights = VariableFilter(roles=[WEIGHT])(cg.variables) cost = cost + sum([0.0003 * (W ** 2).sum() for W in weights]) cost.name = 'cost_with_regularization' mnist_train = MNIST(("train",)) mnist_train_stream = DataStream.default_stream( mnist_train, iteration_scheme=ShuffledScheme( mnist_train.num_examples, batch_size)) mnist_test = MNIST(("test",)) mnist_test_stream = DataStream.default_stream( mnist_test, iteration_scheme=ShuffledScheme( mnist_test.num_examples, batch_size)) # Generate pics for biases biases = VariableFilter(roles=[BIAS])(cg.parameters) # Train with simple SGD algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=AdaDelta()) # Find layer outputs to probe outs = OrderedDict(reversed(list((get_brick(out).name, out) for out in VariableFilter( roles=[OUTPUT], bricks=[Convolutional, Linear])( cg.variables)))) actpic_extension = ActpicExtension( actpic_variables=outs, case_labels=y, pics=x, label_count=output_size, rectify=-1, data_stream=mnist_test_stream, after_batch=True) synpic_extension = SynpicExtension( synpic_parameters=biases, case_costs=case_costs, case_labels=y, pics=x, batch_size=batch_size, pic_size=image_size, label_count=output_size, after_batch=True) # Impose an orderint for the SaveImages extension if unit_order is not None: with open(unit_order, 'rb') as handle: histograms = pickle.load(handle) unit_order = compute_unit_order(histograms) # `Timing` extension reports time for reading data, aggregating a batch # and monitoring; # `ProgressBar` displays a nice progress bar during training. extensions = [Timing(), FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches), actpic_extension, synpic_extension, SaveImages(picsources=[synpic_extension, actpic_extension], title="LeNet-5: batch {i}, " + "cost {cost_with_regularization:.2f}, " + "trainerr {error_rate:.3f}", data=[cost, error_rate], graph='error_rate', graph_len=500, unit_order=unit_order, after_batch=True), DataStreamMonitoring( [cost, error_rate], mnist_test_stream, prefix="test"), TrainingDataMonitoring( [cost, error_rate, aggregation.mean(algorithm.total_gradient_norm)], prefix="train", after_epoch=True), Checkpoint(save_to), ProgressBar(), Printing()] model = Model(cost) main_loop = MainLoop( algorithm, mnist_train_stream, model=model, extensions=extensions) return main_loop
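# CasewiseCrossEntropy is used above (instead of CategoricalCrossEntropy) because
# the actpic/synpic extensions need one cost value per example rather than a single
# batch mean. A small Theano sketch of what a per-case cross-entropy amounts to;
# this illustrates the idea only and is not the brick's actual implementation.
import numpy
import theano
import theano.tensor as tensor

probs = tensor.matrix('probs')  # (batch, classes), rows sum to one
y = tensor.lvector('targets')   # (batch,)

# One negative log-likelihood per example; .mean() recovers the usual scalar cost.
case_costs = -tensor.log(probs[tensor.arange(y.shape[0]), y])
cost = case_costs.mean()

f = theano.function([probs, y], [case_costs, cost])
p = numpy.full((3, 10), 0.1, dtype=theano.config.floatX)
print(f(p, numpy.array([0, 1, 2])))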
def train(config, save_path, bokeh_name, params, bokeh_server, test_tag, use_load_ext, load_log, fast_start, validation_epochs, validation_batches, per_epochs, per_batches): root_path, extension = os.path.splitext(save_path) data = Data(**config['data']) # Build the main brick and initialize all parameters. recognizer = SpeechRecognizer( data.recordings_source, data.labels_source, data.eos_label, data.num_features, data.num_labels, name="recognizer", data_prepend_eos=data.prepend_eos, character_map=data.character_map, **config["net"]) for brick_path, attribute_dict in sorted( config['initialization'].items(), key=lambda (k, v): -k.count('/')): for attribute, value in attribute_dict.items(): brick, = Selector(recognizer).select(brick_path).bricks setattr(brick, attribute, value) brick.push_initialization_config() recognizer.initialize() # Separate attention_params to be handled differently # when regularization is applied attention = recognizer.generator.transition.attention attention_params = Selector(attention).get_parameters().values() logger.info( "Initialization schemes for all bricks.\n" "Works well only in my branch with __repr__ added to all them,\n" "there is an issue #463 in Blocks to do that properly.") def show_init_scheme(cur): result = dict() for attr in dir(cur): if attr.endswith('_init'): result[attr] = getattr(cur, attr) for child in cur.children: result[child.name] = show_init_scheme(child) return result logger.info(pprint.pformat(show_init_scheme(recognizer))) if params: logger.info("Load parameters from " + params) recognizer.load_params(params) if test_tag: tensor.TensorVariable.__str__ = tensor.TensorVariable.__repr__ __stream = data.get_stream("train") __data = next(__stream.get_epoch_iterator(as_dict=True)) recognizer.recordings.tag.test_value = __data[data.recordings_source] recognizer.recordings_mask.tag.test_value = __data[data.recordings_source + '_mask'] recognizer.labels.tag.test_value = __data[data.labels_source] recognizer.labels_mask.tag.test_value = __data[data.labels_source + '_mask'] theano.config.compute_test_value = 'warn' batch_cost = recognizer.get_cost_graph().sum() batch_size = named_copy(recognizer.recordings.shape[1], "batch_size") # Assumes constant batch size. `aggregation.mean` is not used because # of Blocks #514. cost = batch_cost / batch_size cost.name = "sequence_log_likelihood" logger.info("Cost graph is built") # Fetch variables useful for debugging. # It is important not to use any aggregation schemes here, # as it's currently impossible to spread the effect of # regularization on their variables, see Blocks #514. 
    cost_cg = ComputationGraph(cost)
    r = recognizer
    energies, = VariableFilter(
        applications=[r.generator.readout.readout], name="output_0")(
            cost_cg)
    bottom_output, = VariableFilter(
        applications=[r.bottom.apply], name="output")(
            cost_cg)
    attended, = VariableFilter(
        applications=[r.generator.transition.apply], name="attended")(
            cost_cg)
    attended_mask, = VariableFilter(
        applications=[r.generator.transition.apply], name="attended_mask")(
            cost_cg)
    weights, = VariableFilter(
        applications=[r.generator.evaluate], name="weights")(
            cost_cg)
    max_recording_length = named_copy(r.recordings.shape[0],
                                      "max_recording_length")
    # To exclude subsampling related bugs
    max_attended_mask_length = named_copy(attended_mask.shape[0],
                                          "max_attended_mask_length")
    max_attended_length = named_copy(attended.shape[0],
                                     "max_attended_length")
    max_num_phonemes = named_copy(r.labels.shape[0],
                                  "max_num_phonemes")
    min_energy = named_copy(energies.min(), "min_energy")
    max_energy = named_copy(energies.max(), "max_energy")
    mean_attended = named_copy(abs(attended).mean(),
                               "mean_attended")
    mean_bottom_output = named_copy(abs(bottom_output).mean(),
                                    "mean_bottom_output")
    weights_penalty = named_copy(monotonicity_penalty(weights, r.labels_mask),
                                 "weights_penalty")
    weights_entropy = named_copy(entropy(weights, r.labels_mask),
                                 "weights_entropy")
    mask_density = named_copy(r.labels_mask.mean(),
                              "mask_density")
    cg = ComputationGraph([
        cost, weights_penalty, weights_entropy,
        min_energy, max_energy,
        mean_attended, mean_bottom_output,
        batch_size, max_num_phonemes,
        mask_density])

    # Regularization. It is applied explicitly to all variables
    # of interest; it cannot be applied to the cost only, as that
    # would have no effect on the auxiliary variables, see Blocks #514.
    reg_config = config['regularization']
    regularized_cg = cg
    if reg_config.get('dropout'):
        logger.info('apply dropout')
        regularized_cg = apply_dropout(cg, [bottom_output], 0.5)
    if reg_config.get('noise'):
        logger.info('apply noise')
        noise_subjects = [p for p in cg.parameters if p not in attention_params]
        regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise'])
    regularized_cost = regularized_cg.outputs[0]
    regularized_weights_penalty = regularized_cg.outputs[1]

    # Model is a weird class; we spent lots of time arguing with Bart
    # about what it should be. However, it can already do nice things,
    # e.g. extract all the parameters from the computation graph and
    # give them hierarchical names. This helps to notice when, because
    # of some bug, a parameter is not in the computation graph.
    model = SpeechModel(regularized_cost)
    params = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, params[key].get_value().shape) for key
                     in sorted(params.keys())],
                    width=120))

    # Define the training algorithm.
train_conf = config['training'] clipping = StepClipping(train_conf['gradient_threshold']) clipping.threshold.name = "gradient_norm_threshold" rule_names = train_conf.get('rules', ['momentum']) core_rules = [] if 'momentum' in rule_names: logger.info("Using scaling and momentum for training") core_rules.append(Momentum(train_conf['scale'], train_conf['momentum'])) if 'adadelta' in rule_names: logger.info("Using AdaDelta for training") core_rules.append(AdaDelta(train_conf['decay_rate'], train_conf['epsilon'])) max_norm_rules = [] if reg_config.get('max_norm', False): logger.info("Apply MaxNorm") maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters) if reg_config.get('max_norm_exclude_lookup', False): maxnorm_subjects = [v for v in maxnorm_subjects if not isinstance(get_brick(v), LookupTable)] logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat([name for name, p in params.items() if p in maxnorm_subjects])) logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([name for name, p in params.items() if not p in maxnorm_subjects])) max_norm_rules = [ Restrict(VariableClipping(reg_config['max_norm'], axis=0), maxnorm_subjects)] algorithm = GradientDescent( cost=regularized_cost + reg_config.get("penalty_coof", .0) * regularized_weights_penalty / batch_size + reg_config.get("decay", .0) * l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2, parameters=params.values(), step_rule=CompositeRule( [clipping] + core_rules + max_norm_rules + # Parameters are not changed at all # when nans are encountered. [RemoveNotFinite(0.0)])) # More variables for debugging: some of them can be added only # after the `algorithm` object is created. observables = regularized_cg.outputs observables += [ algorithm.total_step_norm, algorithm.total_gradient_norm, clipping.threshold] for name, param in params.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements ** 0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5 step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' observables.append(stats) def attach_aggregation_schemes(variables): # Aggregation specification has to be factored out as a separate # function as it has to be applied at the very last stage # separately to training and validation observables. result = [] for var in variables: if var.name == 'weights_penalty': result.append(named_copy(aggregation.mean(var, batch_size), 'weights_penalty_per_recording')) elif var.name == 'weights_entropy': result.append(named_copy(aggregation.mean( var, recognizer.labels_mask.sum()), 'weights_entropy_per_label')) else: result.append(var) return result # Build main loop. 
logger.info("Initialize extensions") extensions = [] if use_load_ext and params: extensions.append(Load(params, load_iteration_state=True, load_log=True)) if load_log and params: extensions.append(LoadLog(params)) extensions += [ Timing(after_batch=True), CGStatistics(), #CodeVersion(['lvsr']), ] extensions.append(TrainingDataMonitoring( [observables[0], algorithm.total_gradient_norm, algorithm.total_step_norm, clipping.threshold, max_recording_length, max_attended_length, max_attended_mask_length], after_batch=True)) average_monitoring = TrainingDataMonitoring( attach_aggregation_schemes(observables), prefix="average", every_n_batches=10) extensions.append(average_monitoring) validation = DataStreamMonitoring( attach_aggregation_schemes([cost, weights_entropy, weights_penalty]), data.get_stream("valid"), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=validation_epochs, every_n_batches=validation_batches, after_training=False) extensions.append(validation) recognizer.init_beam_search(10) per = PhonemeErrorRate(recognizer, data.get_dataset("valid")) per_monitoring = DataStreamMonitoring( [per], data.get_stream("valid", batches=False, shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=per_epochs, every_n_batches=per_batches, after_training=False) extensions.append(per_monitoring) track_the_best_per = TrackTheBest( per_monitoring.record_name(per)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_likelihood = TrackTheBest( validation.record_name(cost)).set_conditions( before_first_epoch=True, after_epoch=True) extensions += [track_the_best_likelihood, track_the_best_per] extensions.append(AdaptiveClipping( algorithm.total_gradient_norm.name, clipping, train_conf['gradient_threshold'], decay_rate=0.998, burnin_period=500)) extensions += [ SwitchOffLengthFilter(data.length_filter, after_n_batches=train_conf.get('stop_filtering')), FinishAfter(after_n_batches=train_conf['num_batches'], after_n_epochs=train_conf['num_epochs']) .add_condition(["after_batch"], _gradient_norm_is_none), # Live plotting: requires launching `bokeh-server` # and allows to see what happens online. 
Plot(bokeh_name if bokeh_name else os.path.basename(save_path), [# Plot 1: training and validation costs [average_monitoring.record_name(regularized_cost), validation.record_name(cost)], # Plot 2: gradient norm, [average_monitoring.record_name(algorithm.total_gradient_norm), average_monitoring.record_name(clipping.threshold)], # Plot 3: phoneme error rate [per_monitoring.record_name(per)], # Plot 4: training and validation mean weight entropy [average_monitoring._record_name('weights_entropy_per_label'), validation._record_name('weights_entropy_per_label')], # Plot 5: training and validation monotonicity penalty [average_monitoring._record_name('weights_penalty_per_recording'), validation._record_name('weights_penalty_per_recording')]], every_n_batches=10, server_url=bokeh_server), Checkpoint(save_path, before_first_epoch=not fast_start, after_epoch=True, every_n_batches=train_conf.get('save_every_n_batches'), save_separately=["model", "log"], use_cpickle=True) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_per.notification_name), (root_path + "_best" + extension,)) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_likelihood.notification_name), (root_path + "_best_ll" + extension,)), ProgressBar(), Printing(every_n_batches=1, attribute_filter=PrintingFilterList() )] # Save the config into the status log = TrainingLog() log.status['_config'] = repr(config) main_loop = MainLoop( model=model, log=log, algorithm=algorithm, data_stream=data.get_stream("train"), extensions=extensions) main_loop.run()
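# The per-parameter monitoring block used in these training scripts stacks, for
# every parameter, its norm, the gradient norm, the step norm and the step/gradient
# ratio into one named vector that the monitoring extensions can track. A
# self-contained Theano sketch of that pattern outside of Blocks' GradientDescent;
# the toy cost and the fixed learning rate are illustrative assumptions.
import numpy
import theano
import theano.tensor as tensor

floatX = theano.config.floatX
W = theano.shared(numpy.random.randn(3, 4).astype(floatX), name='W')
x = tensor.matrix('x')
cost = tensor.sqr(tensor.dot(x, W)).mean()

grad = tensor.grad(cost, W)
step = -0.1 * grad  # stand-in for algorithm.steps[W]

num_elements = numpy.prod(W.get_value().shape)
norm = W.norm(2) / num_elements ** 0.5
grad_norm = grad.norm(2) / num_elements ** 0.5
step_norm = step.norm(2) / num_elements ** 0.5
stats = tensor.stack([norm, grad_norm, step_norm, step_norm / grad_norm])
stats.name = W.name + '_stats'

f = theano.function([x], stats)
print(f(numpy.random.randn(5, 3).astype(floatX)))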
def __init__(self, save_to): batch_size = 500 image_size = (28, 28) output_size = 10 convnet = create_lenet_5() layers = convnet.layers mnist_test = MNIST(("test", ), sources=['features', 'targets']) x = tensor.tensor4('features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet probs = convnet.apply(x) cg = ComputationGraph([probs]) def full_brick_name(brick): return '/'.join([''] + [b.name for b in brick.get_unique_path()]) # Find layer outputs to probe outmap = OrderedDict( (full_brick_name(get_brick(out)), out) for out in VariableFilter( roles=[OUTPUT], bricks=[Convolutional, Linear])(cg.variables)) # Generate pics for biases biases = VariableFilter(roles=[BIAS])(cg.parameters) # Generate parallel array, in the same order, for outputs outs = [outmap[full_brick_name(get_brick(b))] for b in biases] # Figure work count error_rate = (MisclassificationRate().apply( y.flatten(), probs).copy(name='error_rate')) sensitive_unit_count = (SensitiveUnitCount().apply( y.flatten(), probs, biases).copy(name='sensitive_unit_count')) sensitive_unit_count.tag.aggregation_scheme = ( Concatenate(sensitive_unit_count)) active_unit_count = (ActiveUnitCount().apply(outs).copy( name='active_unit_count')) active_unit_count.tag.aggregation_scheme = ( Concatenate(active_unit_count)) ignored_unit_count = (IgnoredUnitCount().apply( y.flatten(), probs, biases, outs).copy(name='ignored_unit_count')) ignored_unit_count.tag.aggregation_scheme = ( Concatenate(ignored_unit_count)) model = Model([ error_rate, sensitive_unit_count, active_unit_count, ignored_unit_count ]) # Load it with trained parameters params = load_parameters(open(save_to, 'rb')) model.set_parameter_values(params) mnist_test = MNIST(("test", )) mnist_test_stream = DataStream.default_stream( mnist_test, iteration_scheme=SequentialScheme(mnist_test.num_examples, batch_size)) evaluator = DatasetEvaluator([ error_rate, sensitive_unit_count, active_unit_count, ignored_unit_count ]) results = evaluator.evaluate(mnist_test_stream) def save_ranked_image(scores, filename): sorted_instances = scores.argsort() filmstrip = Filmstrip(image_shape=(28, 28), grid_shape=(100, 100)) for i, index in enumerate(sorted_instances): filmstrip.set_image((i // 100, i % 100), mnist_test.get_data(request=index)[0]) filmstrip.save(filename) save_ranked_image(results['sensitive_unit_count'], 'sensitive.jpg') save_ranked_image(results['active_unit_count'], 'active.jpg') save_ranked_image(results['ignored_unit_count'], 'ignored.jpg')
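# save_ranked_image above sorts the test set by a per-example statistic and lays
# the digits out on a 100x100 grid. A minimal numpy-only version of that
# ranking-and-tiling step, with a random scores array standing in for the
# evaluator output and a plain array replacing Filmstrip.
import numpy

def ranked_grid(images, scores, grid=(100, 100)):
    # images: (N, 28, 28); scores: (N,). Lowest score ends up in the top-left cell.
    order = scores.argsort()
    rows, cols = grid
    canvas = numpy.zeros((rows * 28, cols * 28), dtype=images.dtype)
    for i, idx in enumerate(order[:rows * cols]):
        r, c = i // cols, i % cols
        canvas[r * 28:(r + 1) * 28, c * 28:(c + 1) * 28] = images[idx]
    return canvas

images = numpy.random.rand(10000, 28, 28).astype('float32')
scores = numpy.random.rand(10000)
print(ranked_grid(images, scores).shape)  # (2800, 2800)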
def main(save_to):
    batch_size = 365
    feature_maps = [6, 16]
    mlp_hiddens = [120, 84]
    conv_sizes = [5, 5]
    pool_sizes = [2, 2]
    image_size = (28, 28)
    output_size = 10

    # The above are from LeCun's paper. The blocks example had:
    #   feature_maps = [20, 50]
    #   mlp_hiddens = [500]

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    convnet = LeNet(conv_activations, 1, image_size,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='valid',
                    weights_init=Uniform(width=.2),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.push_initialization_config()
    convnet.layers[0].weights_init = Uniform(width=.2)
    convnet.layers[1].weights_init = Uniform(width=.09)
    convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08)
    convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11)
    convnet.initialize()
    logging.info("Input dim: {} {} {}".format(
        *convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        if isinstance(layer, Activation):
            logging.info("Layer {} ({})".format(
                i, layer.__class__.__name__))
        else:
            logging.info("Layer {} ({}) dim: {} {} {}".format(
                i, layer.__class__.__name__, *layer.get_dim('output')))

    random_init = (numpy.random.rand(100, 1, 28, 28) * 128).astype('float32')
    layers = [l for l in convnet.layers if isinstance(l, Convolutional)]
    mnist_test = MNIST(("test",), sources=['features', 'targets'])
    basis_init = create_fair_basis(mnist_test, 10, 50)
    basis_set = make_shifted_basis(basis_init, convnet, layers)

    for layer, basis in zip(layers, basis_set):
        # basis is 5d:
        # (probed_units, base_cases, 1-c, 28-y, 28-x)
        b = shared_floatx(basis)
        # coefficients is 2d:
        # (probed_units, base_cases)
        coefficients = shared_floatx(
            numpy.ones(basis.shape[0:2], dtype=theano.config.floatX))
        # prod is 5d: (probed_units, base_cases, 1-c, 28-y, 28-x)
        prod = tensor.shape_padright(coefficients, 3) * b
        # x is 4d: (probed_units, 1-c, 28-y, 28-x)
        ux = prod.sum(axis=1)
        x = tensor.clip(
            ux / tensor.shape_padright(ux.flatten(ndim=2).max(axis=1), 3),
            0, 1)

        # Normalize input and apply the convnet
        probs = convnet.apply(x)
        cg = ComputationGraph([probs])
        outs = VariableFilter(
            roles=[OUTPUT], bricks=[layer])(cg.variables)

        # Create an interior activation model
        model = Model([probs] + outs)

        # Load it with trained parameters
        params = load_parameters(open(save_to, 'rb'))
        model.set_parameter_values(params)

        learning_rate = shared_floatx(0.03, 'learning_rate')
        # We will try to do all units at once.
        # unit = shared_floatx(0, 'unit', dtype='int64')
        # But we are only doing one layer at once.
        output = outs[0]
        dims = layer.get_dims(['output'])[0]
        if isinstance(dims, numbers.Integral):
            # FC case: output is 2d: (probed_units, units)
            dims = (dims, )
            unitrange = tensor.arange(dims[0])
            costvec = -tensor.log(
                tensor.nnet.softmax(output)[unitrange, unitrange].
                flatten())
        else:
            # Conv case: output is 4d: (probed_units, units, y, x)
            unitrange = tensor.arange(dims[0])
            print('dims is', dims)
            costvec = -tensor.log(tensor.nnet.softmax(output[
                unitrange, unitrange, dims[1] // 2, dims[2] // 2]).
                flatten())
        cost = costvec.sum()
        # grad is dims (probed_units, basis_size)
        grad = gradient.grad(cost, coefficients)
        stepc = coefficients  # - learning_rate * grad
        newc = stepc / tensor.shape_padright(stepc.mean(axis=1))
        fn = theano.function([], [cost, x], updates=[(coefficients, newc)])
        filmstrip = Filmstrip(
            random_init.shape[-2:], (dims[0], 1),
            background='red')
        layer = get_brick(output)

        learning_rate.set_value(0.1)
        for index in range(20000):
            c, result = fn()
            if index % 1000 == 0:
                learning_rate.set_value(numpy.cast[theano.config.floatX](
                    learning_rate.get_value() * 0.8))
                print('cost', c)
                for u in range(dims[0]):
                    filmstrip.set_image((u, 0), result[u, :, :, :])
                filmstrip.save(layer.name + '_stroke.jpg')
        for u in range(dims[0]):
            filmstrip.set_image((u, 0), result[u, :, :, :])
        filmstrip.save(layer.name + '_stroke.jpg')
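# The loop above is an activation-maximization procedure: the cost is taken on the
# diagonal of the softmax outputs (one probed unit per row of the batch) and the
# basis coefficients are repeatedly renormalised. A stripped-down Theano sketch of
# the underlying technique, plain gradient ascent on the input of a fixed network;
# the tiny linear "network" here is a stand-in and not LeNet.
import numpy
import theano
import theano.tensor as tensor

floatX = theano.config.floatX
rng = numpy.random.RandomState(0)

W = theano.shared(rng.randn(784, 10).astype(floatX), name='W')  # frozen network weights
x = theano.shared(rng.rand(1, 784).astype(floatX), name='x')    # the input being optimized

unit = 3
score = tensor.nnet.softmax(tensor.dot(x, W))[0, unit]
grad = tensor.grad(score, x)

# Gradient ascent on the input, clipped back into the valid pixel range.
step = theano.function([], score,
                       updates=[(x, tensor.clip(x + 0.1 * grad, 0, 1))])
for _ in range(200):
    score_value = step()
print('final unit activation', score_value)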
def main(save_to, hist_file): batch_size = 365 feature_maps = [6, 16] mlp_hiddens = [120, 84] conv_sizes = [5, 5] pool_sizes = [2, 2] image_size = (28, 28) output_size = 10 # The above are from LeCun's paper. The blocks example had: # feature_maps = [20, 50] # mlp_hiddens = [500] # Use ReLUs everywhere and softmax for the final prediction conv_activations = [Rectifier() for _ in feature_maps] mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()] convnet = LeNet(conv_activations, 1, image_size, filter_sizes=zip(conv_sizes, conv_sizes), feature_maps=feature_maps, pooling_sizes=zip(pool_sizes, pool_sizes), top_mlp_activations=mlp_activations, top_mlp_dims=mlp_hiddens + [output_size], border_mode='valid', weights_init=Uniform(width=.2), biases_init=Constant(0)) # We push initialization config to set different initialization schemes # for convolutional layers. convnet.push_initialization_config() convnet.layers[0].weights_init = Uniform(width=.2) convnet.layers[1].weights_init = Uniform(width=.09) convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08) convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11) convnet.initialize() logging.info("Input dim: {} {} {}".format( *convnet.children[0].get_dim('input_'))) for i, layer in enumerate(convnet.layers): if isinstance(layer, Activation): logging.info("Layer {} ({})".format( i, layer.__class__.__name__)) else: logging.info("Layer {} ({}) dim: {} {} {}".format( i, layer.__class__.__name__, *layer.get_dim('output'))) mnist_test = MNIST(("test",), sources=['features', 'targets']) x = tensor.tensor4('features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet probs = convnet.apply(x) error_rate = (MisclassificationRate().apply(y.flatten(), probs) .copy(name='error_rate')) confusion = (ConfusionMatrix().apply(y.flatten(), probs) .copy(name='confusion')) confusion.tag.aggregation_scheme = Sum(confusion) model = Model([error_rate, confusion]) # Load it with trained parameters params = load_parameters(open(save_to, 'rb')) model.set_parameter_values(params) def full_brick_name(brick): return '/'.join([''] + [b.name for b in brick.get_unique_path()]) # Find layer outputs to probe outs = OrderedDict((full_brick_name(get_brick(out)), out) for out in VariableFilter( roles=[OUTPUT], bricks=[Convolutional, Linear])( model.variables)) # Load histogram information with open(hist_file, 'rb') as handle: histograms = pickle.load(handle) # Corpora mnist_train = MNIST(("train",)) mnist_train_stream = DataStream.default_stream( mnist_train, iteration_scheme=ShuffledScheme( mnist_train.num_examples, batch_size)) mnist_test = MNIST(("test",)) mnist_test_stream = DataStream.default_stream( mnist_test, iteration_scheme=ShuffledScheme( mnist_test.num_examples, batch_size)) # Probe the given layer target_layer = '/lenet/mlp/linear_0' next_layer_param = '/lenet/mlp/linear_1.W' sample = extract_sample(outs[target_layer], mnist_test_stream) print('sample shape', sample.shape) # Figure neurons to ablate hist = histograms[('linear_1', 'b')] targets = [i for i in range(hist.shape[1]) if hist[2, i] * hist[7, i] < 0] print('ablating', len(targets), ':', targets) # Now adjust the next layer weights based on the probe param = model.get_parameter_dict()[next_layer_param] print('param shape', param.get_value().shape) new_weights = ablate_inputs( targets, sample, param.get_value(), compensate=False) param.set_value(new_weights) # Evaluation pass evaluator = DatasetEvaluator([error_rate, confusion]) 
print(evaluator.evaluate(mnist_test_stream))
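# ablate_inputs above rewrites the next layer's weight matrix so that the selected
# units of the probed layer stop influencing it (compensate=False in this run).
# Assuming the Blocks convention that a Linear brick's W has shape
# (input_dim, output_dim), the uncompensated case reduces to zeroing rows:
import numpy

def ablate_rows(targets, W):
    # targets: indices of input units to silence; W: (input_dim, output_dim)
    W = W.copy()
    W[targets, :] = 0.0
    return W

W = numpy.random.randn(120, 84).astype('float32')
print(ablate_rows([0, 5, 17], W)[5])  # prints a row of zeros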
def main(save_to): batch_size = 365 feature_maps = [6, 16] mlp_hiddens = [120, 84] conv_sizes = [5, 5] pool_sizes = [2, 2] image_size = (28, 28) output_size = 10 # The above are from LeCun's paper. The blocks example had: # feature_maps = [20, 50] # mlp_hiddens = [500] # Use ReLUs everywhere and softmax for the final prediction conv_activations = [Rectifier() for _ in feature_maps] mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()] convnet = LeNet(conv_activations, 1, image_size, filter_sizes=zip(conv_sizes, conv_sizes), feature_maps=feature_maps, pooling_sizes=zip(pool_sizes, pool_sizes), top_mlp_activations=mlp_activations, top_mlp_dims=mlp_hiddens + [output_size], border_mode='valid', weights_init=Uniform(width=.2), biases_init=Constant(0)) # We push initialization config to set different initialization schemes # for convolutional layers. convnet.push_initialization_config() convnet.layers[0].weights_init = Uniform(width=.2) convnet.layers[1].weights_init = Uniform(width=.09) convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08) convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11) convnet.initialize() layers = convnet.layers logging.info("Input dim: {} {} {}".format( *convnet.children[0].get_dim('input_'))) for i, layer in enumerate(convnet.layers): if isinstance(layer, Activation): logging.info("Layer {} ({})".format( i, layer.__class__.__name__)) else: logging.info("Layer {} ({}) dim: {} {} {}".format( i, layer.__class__.__name__, *layer.get_dim('output'))) random_init = (numpy.random.rand(100, 1, 28, 28) * 128).astype('float32') mnist_test = MNIST(("test",), sources=['features', 'targets']) basis = create_fair_basis(mnist_test, 10, 10) # state = mnist_test.open() # # basis = numpy.zeros((100, 1, 28, 28), dtype=theano.config.floatX) # counters = [0] * 10 # index = 0 # while min(counters) < 10: # feature, target = mnist_test.get_data(state=state, request=[index]) # target = target[0, 0] # feature = feature / 256 # if counters[target] < 10: # basis[target + counters[target] * 10, :, :, :] = feature[0, :, :, :] # counters[target] += 1 # index += 1 # mnist_test.close(state=state) # b = shared_floatx(basis) # random_init = numpy.rand.random(100, 1000) # r = shared_floatx(random_init) # rn = r / r.norm(axis=1) # x = tensor.dot(rn, tensor.shape_padright(b)) x = tensor.tensor4('features') # Normalize input and apply the convnet probs = convnet.apply(x) cg = ComputationGraph([probs]) outs = VariableFilter( roles=[OUTPUT], bricks=[Convolutional, Linear])(cg.variables) # Create an interior activation model model = Model([probs] + outs) # Load it with trained parameters params = load_parameters(open(save_to, 'rb')) model.set_parameter_values(params) fn = theano.function([x], outs) results = fn(basis) for snapshots, output in zip(results, outs): layer = get_brick(output) filmstrip = Filmstrip( basis.shape[-2:], (snapshots.shape[1], snapshots.shape[0]), background='purple') if layer in layers: fieldmap = layerarray_fieldmap(layers[0:layers.index(layer) + 1]) for unit in range(snapshots.shape[1]): for index in range(snapshots.shape[0]): mask = make_mask(basis.shape[-2:], fieldmap, numpy.clip( snapshots[index, unit, :, :], 0, numpy.inf)) filmstrip.set_image((unit, index), basis[index, :, :, :], mask) filmstrip.save(layer.name + '_show.jpg')
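# make_mask above projects a unit's activation map back onto input pixels through
# the receptive field computed by layerarray_fieldmap. The arithmetic such a
# fieldmap has to perform is the standard receptive-field composition, sketched
# here for a (kernel_size, stride) stack; this is an illustration, not the
# project's fieldmap code.
def receptive_field(layers):
    """layers: list of (kernel_size, stride) pairs, ordered from input to output."""
    size, jump = 1, 1
    for kernel, stride in layers:
        size += (kernel - 1) * jump  # each layer widens the field by (k - 1) input steps
        jump *= stride               # and multiplies the spacing between neighbouring units
    return size, jump

# LeNet-5 style stack: conv 5x5 / pool 2x2 / conv 5x5 / pool 2x2
print(receptive_field([(5, 1), (2, 2), (5, 1), (2, 2)]))  # (16, 4)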