# NOTE: the import paths below are assumptions based on the pylearn2/Theano
# framework these classes are written against; adjust them if the surrounding
# package layout differs.
import math
import warnings
from collections import OrderedDict

import numpy as np
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

from pylearn2.costs.mlp import Default
from pylearn2.linear.matrixmul import MatrixMul
from pylearn2.models.mlp import Layer
from pylearn2.models.model import Model
from pylearn2.space import CompositeSpace, IndexSpace, VectorSpace
from pylearn2.utils import as_floatX, sharedX, wraps


class ProjectionLayer(Layer):
    """
    This layer can be used to project discrete labels into a continuous
    space, as done in e.g. language models. It takes labels as an input
    (IndexSpace), maps them to their continuous embeddings and
    concatenates them.

    Parameters
    ----------
    dim : int
        The dimension of the embeddings. Note that this means that the
        output dimension is (dim * number of input labels).
    layer_name : string
        Layer name.
    irange : numeric
        The range of the uniform distribution used to initialize the
        embeddings. Can't be used with istdev.
    istdev : numeric
        The standard deviation of the normal distribution used to
        initialize the embeddings. Can't be used with irange.
    """
    def __init__(self, dim, layer_name, irange=None, istdev=None):
        """
        Initializes a projection layer.
        """
        super(ProjectionLayer, self).__init__()
        self.dim = dim
        self.layer_name = layer_name
        if irange is None and istdev is None:
            raise ValueError("ProjectionLayer needs either irange or "
                             "istdev in order to initialize the projections.")
        elif irange is not None and istdev is not None:
            raise ValueError("ProjectionLayer was passed both irange and "
                             "istdev but needs only one")
        else:
            self._irange = irange
            self._istdev = istdev

    @wraps(Layer.get_monitoring_channels)
    def get_monitoring_channels(self):
        W, = self.transformer.get_params()
        assert W.ndim == 2
        sq_W = T.sqr(W)
        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))
        return OrderedDict([('row_norms_min', row_norms.min()),
                            ('row_norms_mean', row_norms.mean()),
                            ('row_norms_max', row_norms.max()),
                            ('col_norms_min', col_norms.min()),
                            ('col_norms_mean', col_norms.mean()),
                            ('col_norms_max', col_norms.max())])

    @wraps(Layer.set_input_space)
    def set_input_space(self, space):
        if isinstance(space, IndexSpace):
            self.input_dim = space.dim
            self.input_space = space
        else:
            raise ValueError("ProjectionLayer needs an IndexSpace as input")
        self.output_space = VectorSpace(self.dim * self.input_dim)
        rng = self.mlp.rng
        if self._irange is not None:
            W = rng.uniform(-self._irange, self._irange,
                            (space.max_labels, self.dim))
        else:
            W = rng.randn(space.max_labels, self.dim) * self._istdev
        W = sharedX(W)
        W.name = self.layer_name + '_W'
        self.transformer = MatrixMul(W)
        W, = self.transformer.get_params()
        assert W.name is not None

    @wraps(Layer.fprop)
    def fprop(self, state_below):
        z = self.transformer.project(state_below)
        return z

    @wraps(Layer.get_params)
    def get_params(self):
        W, = self.transformer.get_params()
        assert W.name is not None
        params = [W]
        return params

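# ---------------------------------------------------------------------------
# A minimal NumPy sketch (added for illustration; the names `W` and `labels`
# are hypothetical, not part of the layer's API) of what ProjectionLayer's
# fprop amounts to: an embedding lookup followed by concatenation, producing
# the VectorSpace(dim * number of input labels) output described above.
def _projection_layer_sketch():
    import numpy as np

    max_labels, dim, n_labels = 5, 3, 4
    rng = np.random.RandomState(0)
    W = rng.randn(max_labels, dim)                      # embedding matrix
    labels = rng.randint(0, max_labels, size=(2, n_labels))

    # Look up each label's embedding and concatenate along the feature
    # axis, giving a (batch, n_labels * dim) output.
    output = W[labels].reshape(labels.shape[0], n_labels * dim)
    assert output.shape == (2, n_labels * dim)
    return output
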
class vLBLSoft(Model):

    def __init__(self, dict_size, dim, context_length, k, irange=0.1,
                 seed=22):
        super(vLBLSoft, self).__init__()
        rng = np.random.RandomState(seed)
        self.rng = rng
        self.context_length = context_length
        self.dim = dim
        self.dict_size = dict_size

        C_values = np.asarray(rng.normal(0, math.sqrt(irange),
                                         size=(dim, context_length)),
                              dtype=theano.config.floatX)
        self.C = theano.shared(value=C_values, name='C', borrow=True)

        W_context = rng.uniform(-irange, irange, (dict_size, dim))
        W_context = sharedX(W_context, name='W_context')
        W_target = rng.uniform(-irange, irange, (dict_size, dim))
        W_target = sharedX(W_target, name='W_target')
        self.projector_context = MatrixMul(W_context)
        self.projector_target = MatrixMul(W_target)

        self.W_context = W_context
        self.W_target = W_target
        self.W_target = W_context

        b_values = np.asarray(rng.normal(0, math.sqrt(irange),
                                         size=(dict_size,)),
                              dtype=theano.config.floatX)
        self.b = theano.shared(value=b_values, name='b', borrow=True)

        self.input_space = IndexSpace(dim=context_length,
                                      max_labels=dict_size)
        self.output_space = IndexSpace(dim=1, max_labels=dict_size)
        self.allY = T.as_tensor_variable(
            np.arange(dict_size, dtype=np.int64).reshape(dict_size, 1))

    def get_params(self):
        # get W from projector
        rval1 = self.projector_context.get_params()
        rval2 = self.projector_target.get_params()
        # add C, b
        rval1.extend([self.C, self.b])
        rval1.extend(rval2)
        return rval1

    def fprop(self, state_below):
        """
        state_below is r_w?
        """
        state_below = state_below.reshape((state_below.shape[0], self.dim,
                                           self.context_length))
        rval = self.C.dimshuffle('x', 0, 1) * state_below
        rval = rval.sum(axis=2)
        return rval

    def get_default_cost(self):
        return Default()

    def get_monitoring_data_specs(self):
        """
        Returns data specs requiring both inputs and targets.
        """
        space = CompositeSpace((self.get_input_space(),
                                self.get_output_space()))
        source = (self.get_input_source(), self.get_target_source())
        return (space, source)

    def get_monitoring_channels(self, data):
        rval = OrderedDict()
        rval['nll'] = self.cost_from_X(data)
        rval['perplexity'] = 10 ** (rval['nll'] / np.log(10).astype('float32'))
        return rval

    def score(self, all_q_w, q_h):
        sallwh = T.dot(q_h, all_q_w.T) + self.b.dimshuffle('x', 0)
        return sallwh

    def cost_from_X(self, data):
        X, Y = data
        X = self.projector_context.project(X)
        q_h = self.fprop(X)
        rval = self.cost(Y, q_h)
        return rval

    def cost(self, Y, q_h):
        all_q_w = self.W_target
        s = self.score(all_q_w, q_h)
        p_w_given_h = T.nnet.softmax(s)
        return T.cast(
            -T.mean(T.diag(T.log(p_w_given_h)[T.arange(Y.shape[0]), Y])),
            theano.config.floatX)

    def apply_dropout(self, state, include_prob, scale, theano_rng,
                      input_space, mask_value=0, per_example=True):
        """
        per_example : bool, optional
            Sample a different mask value for every example in a batch.
            Defaults to `True`. If `False`, sample one mask per mini-batch.
        """
        if include_prob in [None, 1.0, 1]:
            return state
        assert scale is not None
        if isinstance(state, tuple):
            return tuple(self.apply_dropout(substate, include_prob, scale,
                                            theano_rng, mask_value)
                         for substate in state)
        if per_example:
            mask = theano_rng.binomial(p=include_prob, size=state.shape,
                                       dtype=state.dtype)
        else:
            batch = input_space.get_origin_batch(1)
            mask = theano_rng.binomial(p=include_prob, size=batch.shape,
                                       dtype=state.dtype)
            rebroadcast = T.Rebroadcast(*zip(xrange(batch.ndim),
                                             [s == 1 for s in batch.shape]))
            mask = rebroadcast(mask)
        if mask_value == 0:
            rval = state * mask * scale
        else:
            rval = T.switch(mask, state * scale, mask_value)
        return T.cast(rval, state.dtype)

    def dropout_fprop(self, state_below, default_input_include_prob=0.5,
                      input_include_probs=None, default_input_scale=2.,
                      input_scales=None, per_example=True):
        if input_include_probs is None:
            input_include_probs = {}
        if input_scales is None:
            input_scales = {}
        theano_rng = MRG_RandomStreams(max(self.rng.randint(2 ** 15), 1))
        include_prob = default_input_include_prob
        scale = default_input_scale
        state_below = self.apply_dropout(
            state=state_below,
            include_prob=include_prob,
            theano_rng=theano_rng,
            scale=scale,  # check
            mask_value=0,
            input_space=self.get_input_space(),
            per_example=per_example
        )
        state_below = self.fprop(state_below)
        return state_below

class ProjectionLayer(Layer):
    """
    This layer can be used to project discrete labels into a continuous
    space, as done in e.g. language models. It takes labels as an input
    (IndexSpace), maps them to their continuous embeddings and
    concatenates them.

    Parameters
    ----------
    dim : int
        The dimension of the embeddings. Note that this means that the
        output dimension is (dim * number of input labels).
    layer_name : string
        Layer name.
    irange : numeric
        The range of the uniform distribution used to initialize the
        embeddings. Can't be used with istdev.
    istdev : numeric
        The standard deviation of the normal distribution used to
        initialize the embeddings. Can't be used with irange.
    """
    def __init__(self, dim, layer_name, irange=None, istdev=None):
        """
        Initializes a projection layer.
        """
        super(ProjectionLayer, self).__init__()
        self.dim = dim
        self.layer_name = layer_name
        if irange is None and istdev is None:
            raise ValueError("ProjectionLayer needs either irange or "
                             "istdev in order to initialize the projections.")
        elif irange is not None and istdev is not None:
            raise ValueError("ProjectionLayer was passed both irange and "
                             "istdev but needs only one")
        else:
            self._irange = irange
            self._istdev = istdev

    @wraps(Layer.get_layer_monitoring_channels)
    def get_layer_monitoring_channels(self, *args, **kwargs):
        W, = self.transformer.get_params()
        assert W.ndim == 2
        sq_W = T.sqr(W)
        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))
        return OrderedDict([('row_norms_min', row_norms.min()),
                            ('row_norms_mean', row_norms.mean()),
                            ('row_norms_max', row_norms.max()),
                            ('col_norms_min', col_norms.min()),
                            ('col_norms_mean', col_norms.mean()),
                            ('col_norms_max', col_norms.max())])

    def _check_input_space_and_get_max_labels(self, space):
        if isinstance(space, IndexSpace):
            return space.max_labels
        if isinstance(space, CompositeSpace):
            ml = []
            for c in space.components:
                ml.append(self._check_input_space_and_get_max_labels(c))
            # check that all of them are equal
            if len(set(ml)) != 1:
                raise ValueError("Composite space is empty or contains "
                                 "incompatible index spaces")
            return ml[0]
        raise ValueError("ProjectionLayer needs an IndexSpace or a "
                         "CompositeSpace of them as input")

    def _build_output_space(self, space):
        if isinstance(space, IndexSpace):
            return VectorSpace(self.dim * space.dim)
        if isinstance(space, CompositeSpace):
            return CompositeSpace(
                [self._build_output_space(c) for c in space.components])
        assert False

    @wraps(Layer.set_input_space)
    def set_input_space(self, space):
        max_labels = self._check_input_space_and_get_max_labels(space)
        self.input_space = space
        self.output_space = self._build_output_space(space)
        rng = self.mlp.rng
        if self._irange is not None:
            W = rng.uniform(-self._irange, self._irange,
                            (max_labels, self.dim))
        else:
            W = rng.randn(max_labels, self.dim) * self._istdev
        W = sharedX(W)
        W.name = self.layer_name + '_W'
        self.transformer = MatrixMul(W)
        W, = self.transformer.get_params()
        assert W.name is not None

    def _fprop_recursive(self, state_below):
        if isinstance(state_below, tuple):
            return tuple(self._fprop_recursive(s) for s in state_below)
        return self.transformer.project(state_below)

    @wraps(Layer.fprop)
    def fprop(self, state_below):
        return self._fprop_recursive(state_below)

    @wraps(Layer.get_params)
    def get_params(self):
        W, = self.transformer.get_params()
        assert W.name is not None
        params = [W]
        return params

    @wraps(Layer.get_weight_decay)
    def get_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        W, = self.transformer.get_params()
        return coeff * T.sqr(W).sum()

    @wraps(Layer.get_l1_weight_decay)
    def get_l1_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        W, = self.transformer.get_params()
        return coeff * abs(W).sum()

class vLBL(Model):

    def __init__(self, dict_size, dim, context_length, k, irange=0.1,
                 seed=22, max_row_norm=None, max_col_norm=None):
        rng = np.random.RandomState(seed)
        self.rng = rng
        self.k = k
        self.context_length = context_length
        self.dim = dim
        self.dict_size = dict_size

        C = rng.randn(dim, context_length)
        self.C = sharedX(C)
        W = rng.uniform(-irange, irange, (dict_size, dim))
        W = sharedX(W)
        self.W = W
        # TODO maybe have another projector for targets
        self.projector = MatrixMul(W)
        self.b = sharedX(np.zeros((dict_size,)), name='vLBL_b')

        self.input_space = IndexSpace(dim=context_length,
                                      max_labels=dict_size)
        # self.output_space = IndexSpace(dim=1, max_labels=dict_size)
        self.output_space = VectorSpace(dim=dict_size)
        self.max_row_norm = max_row_norm
        self.max_col_norm = max_col_norm

    def get_params(self):
        rval = self.projector.get_params()
        rval.extend([self.C, self.b])
        return rval

    def get_default_cost(self):
        return Default()

    def fprop(self, state_below):
        """q^(h) from EQ. 2"""
        state_below = state_below.reshape((state_below.shape[0], self.dim,
                                           self.context_length))
        rval = self.C.dimshuffle('x', 0, 1) * state_below
        rval = rval.sum(axis=2)
        return rval

    def _modify_updates(self, updates):
        # for param in self.get_params():
        if self.max_row_norm is not None:
            W = self.W
            if W in updates:
                updated_W = updates[W]
                row_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=1))
                desired_norms = T.clip(row_norms, 0, self.max_row_norm)
                scales = desired_norms / (1e-7 + row_norms)
                updates[W] = updated_W * scales.dimshuffle(0, 'x')
        if self.max_col_norm is not None:
            assert self.max_row_norm is None
            W = self.W
            if W in updates:
                updated_W = updates[W]
                col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0))
                desired_norms = T.clip(col_norms, 0, self.max_col_norm)
                updates[W] = updated_W * (desired_norms / (1e-7 + col_norms))

    def score(self, q_h):
        q_w = self.projector._W
        rval = T.dot(q_h, q_w.T) + self.b.dimshuffle('x', 0)
        return rval

    def cost(self, Y, q_h):
        z = self.score(q_h)
        z = z - z.max(axis=1).dimshuffle(0, 'x')
        log_prob = z - T.log(T.exp(z).sum(axis=1).dimshuffle(0, 'x'))
        log_prob_of = (Y * log_prob).sum(axis=1)
        assert log_prob_of.ndim == 1
        rval = as_floatX(log_prob_of.mean())
        return -rval

    def cost_from_X(self, data):
        X, Y = data
        X = self.projector.project(X)
        q_h = self.fprop(X)
        return self.cost(Y, q_h)

    def get_monitoring_data_specs(self):
        space = CompositeSpace((self.get_input_space(),
                                self.get_output_space()))
        source = (self.get_input_source(), self.get_target_source())
        return (space, source)

    def get_monitoring_channels(self, data):
        X, Y = data
        rval = OrderedDict()
        W_context = self.W
        W_target = self.W
        b = self.b
        C = self.C
        sq_W_context = T.sqr(W_context)
        # sq_W_target = T.sqr(W_target)
        sq_b = T.sqr(b)
        sq_c = T.sqr(C)
        row_norms_W_context = T.sqrt(sq_W_context.sum(axis=1))
        col_norms_W_context = T.sqrt(sq_W_context.sum(axis=0))
        # row_norms_W_target = T.sqrt(sq_W_target.sum(axis=1))
        # col_norms_W_target = T.sqrt(sq_W_target.sum(axis=0))
        col_norms_b = T.sqrt(sq_b.sum(axis=0))
        col_norms_c = T.sqrt(sq_c.sum(axis=0))
        rval = OrderedDict([
            ('W_context_row_norms_min', row_norms_W_context.min()),
            ('W_context_row_norms_mean', row_norms_W_context.mean()),
            ('W_context_row_norms_max', row_norms_W_context.max()),
            ('W_context_col_norms_min', col_norms_W_context.min()),
            ('W_context_col_norms_mean', col_norms_W_context.mean()),
            ('W_context_col_norms_max', col_norms_W_context.max()),
            # ('W_target_row_norms_min', row_norms_W_target.min()),
            # ('W_target_row_norms_mean', row_norms_W_target.mean()),
            # ('W_target_row_norms_max', row_norms_W_target.max()),
            # ('W_target_col_norms_min', col_norms_W_target.min()),
            # ('W_target_col_norms_mean', col_norms_W_target.mean()),
            # ('W_target_col_norms_max', col_norms_W_target.max()),
            ('b_col_norms_min', col_norms_b.min()),
            ('b_col_norms_mean', col_norms_b.mean()),
            ('b_col_norms_max', col_norms_b.max()),
            ('c_col_norms_min', col_norms_c.min()),
            ('c_col_norms_mean', col_norms_c.mean()),
            ('c_col_norms_max', col_norms_c.max()),
        ])
        nll = self.cost_from_X(data)
        rval['perplexity'] = as_floatX(10 ** (nll / np.log(10)))
        return rval

    def apply_dropout(self, state, include_prob, scale, theano_rng,
                      input_space, mask_value=0, per_example=True):
        """
        per_example : bool, optional
            Sample a different mask value for every example in a batch.
            Defaults to `True`. If `False`, sample one mask per mini-batch.
        """
        if include_prob in [None, 1.0, 1]:
            return state
        assert scale is not None
        if isinstance(state, tuple):
            return tuple(self.apply_dropout(substate, include_prob, scale,
                                            theano_rng, mask_value)
                         for substate in state)
        if per_example:
            mask = theano_rng.binomial(p=include_prob, size=state.shape,
                                       dtype=state.dtype)
        else:
            batch = input_space.get_origin_batch(1)
            mask = theano_rng.binomial(p=include_prob, size=batch.shape,
                                       dtype=state.dtype)
            rebroadcast = T.Rebroadcast(*zip(xrange(batch.ndim),
                                             [s == 1 for s in batch.shape]))
            mask = rebroadcast(mask)
        if mask_value == 0:
            rval = state * mask * scale
        else:
            rval = T.switch(mask, state * scale, mask_value)
        return T.cast(rval, state.dtype)

    def dropout_fprop(self, state_below, default_input_include_prob=0.5,
                      input_include_probs=None, default_input_scale=2.,
                      input_scales=None, per_example=True):
        if input_include_probs is None:
            input_include_probs = {}
        if input_scales is None:
            input_scales = {}
        theano_rng = MRG_RandomStreams(max(self.rng.randint(2 ** 15), 1))
        include_prob = default_input_include_prob
        scale = default_input_scale
        state_below = self.apply_dropout(
            state=state_below,
            include_prob=include_prob,
            theano_rng=theano_rng,
            scale=scale,  # check
            mask_value=0,
            input_space=self.get_input_space(),
            per_example=per_example
        )
        state_below = self.fprop(state_below)
        return state_below

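# ---------------------------------------------------------------------------
# A minimal NumPy sketch (illustrative array names) of the max-row-norm
# constraint applied in _modify_updates: rows of the updated weight matrix
# whose L2 norm exceeds the limit are rescaled back onto the norm ball.
def _max_row_norm_sketch(max_row_norm=1.0):
    import numpy as np

    rng = np.random.RandomState(0)
    W = rng.randn(5, 3) * 2.0                   # a freshly updated matrix

    row_norms = np.sqrt((W ** 2).sum(axis=1))
    desired = np.clip(row_norms, 0, max_row_norm)
    W_clipped = W * (desired / (1e-7 + row_norms))[:, None]

    # rows within the limit are (almost) unchanged; the rest end up with
    # norm approximately equal to max_row_norm
    assert np.all(np.sqrt((W_clipped ** 2).sum(axis=1)) <= max_row_norm + 1e-6)
    return W_clipped
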
class vLBL(Model):

    def __init__(self, dict_size, dim, context_length, k, irange=0.1,
                 seed=22):
        rng = np.random.RandomState(seed)
        self.rng = rng
        self.k = k
        self.context_length = context_length
        self.dim = dim
        self.dict_size = dict_size

        C = rng.randn(dim, context_length)
        self.C = sharedX(C)
        W = rng.uniform(-irange, irange, (dict_size, dim))
        W = sharedX(W)
        # TODO maybe have another projector for targets
        self.projector = MatrixMul(W)
        self.b = sharedX(np.zeros((dict_size,)), name='vLBL_b')

        self.set_spaces()
        self.rng = np.random.RandomState(2014)

    def set_spaces(self):
        self.input_space = IndexSpace(dim=self.context_length,
                                      max_labels=self.dict_size)
        self.output_space = VectorSpace(dim=self.dict_size)

    def get_params(self):
        rval = self.projector.get_params()
        rval.extend([self.C, self.b])
        return rval

    def context(self, state_below):
        """q^(h) from EQ. 2"""
        state_below = state_below.reshape((state_below.shape[0], self.dim,
                                           self.context_length))
        rval = self.C.dimshuffle('x', 0, 1) * state_below
        rval = rval.sum(axis=2)
        return rval

    def score(self, X, Y=None):
        X = self.projector.project(X)
        q_h = self.context(X)
        # this is used during training
        if Y is not None:
            q_w = self.projector.project(Y).reshape((Y.shape[0], self.dim))
            rval = (q_w * q_h).sum(axis=1) + self.b[Y].flatten()
        # during nll
        else:
            q_w = self.projector._W
            rval = T.dot(q_h, q_w.T) + self.b.dimshuffle('x', 0)
        return rval

    def cost_from_X(self, data):
        X, Y = data
        z = self.score(X)
        z = z - z.max(axis=1).dimshuffle(0, 'x')
        log_prob = z - T.log(T.exp(z).sum(axis=1).dimshuffle(0, 'x'))
        log_prob_of = (Y * log_prob).sum(axis=1)
        assert log_prob_of.ndim == 1
        rval = as_floatX(log_prob_of.mean())
        return -rval

    def get_monitoring_data_specs(self):
        space = CompositeSpace((self.get_input_space(),
                                self.get_output_space()))
        source = (self.get_input_source(), self.get_target_source())
        return (space, source)

    def get_monitoring_channels(self, data):
        X, Y = data
        rval = OrderedDict()
        nll = self.cost_from_X(data)
        rval['perplexity'] = as_floatX(10 ** (nll / np.log(10)))
        return rval

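# ---------------------------------------------------------------------------
# A minimal NumPy sketch (illustrative names) of the two branches of
# vLBL.score: with targets Y it returns one score per example (the dot
# product with that example's target embedding plus its bias); without Y it
# scores every word in the vocabulary. Selecting each example's target
# column from the full score matrix recovers the per-target scores.
def _vlbl_score_sketch():
    import numpy as np

    V, D, batch = 7, 4, 3
    rng = np.random.RandomState(1)
    W = rng.randn(V, D)                         # shared embeddings
    b = np.zeros(V)
    q_h = rng.randn(batch, D)                   # predicted representations
    Y = rng.randint(0, V, size=batch)           # target word indices

    per_target = (W[Y] * q_h).sum(axis=1) + b[Y]         # shape (batch,)
    all_words = q_h.dot(W.T) + b                          # shape (batch, V)
    assert np.allclose(per_target, all_words[np.arange(batch), Y])
    return per_target, all_words
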
class vLBLSoft(Model):

    def __init__(self, dict_size, dim, context_length, k, irange=0.1,
                 seed=22):
        super(vLBLSoft, self).__init__()
        rng = np.random.RandomState(seed)
        self.rng = rng
        self.context_length = context_length
        self.dim = dim
        self.dict_size = dict_size

        C_values = np.asarray(rng.normal(0, math.sqrt(irange),
                                         size=(dim, context_length)),
                              dtype=theano.config.floatX)
        self.C = theano.shared(value=C_values, name="C", borrow=True)

        W_context = rng.uniform(-irange, irange, (dict_size, dim))
        W_context = sharedX(W_context, name="W_context")
        W_target = rng.uniform(-irange, irange, (dict_size, dim))
        W_target = sharedX(W_target, name="W_target")
        self.projector_context = MatrixMul(W_context)
        self.projector_target = MatrixMul(W_target)

        self.W_context = W_context
        self.W_target = W_target
        self.W_target = W_context

        b_values = np.asarray(rng.normal(0, math.sqrt(irange),
                                         size=(dict_size,)),
                              dtype=theano.config.floatX)
        self.b = theano.shared(value=b_values, name="b", borrow=True)

        self.input_space = IndexSpace(dim=context_length,
                                      max_labels=dict_size)
        self.output_space = IndexSpace(dim=1, max_labels=dict_size)
        self.allY = T.as_tensor_variable(
            np.arange(dict_size, dtype=np.int64).reshape(dict_size, 1))

    def get_params(self):
        # get W from projector
        rval1 = self.projector_context.get_params()
        rval2 = self.projector_target.get_params()
        # add C, b
        rval1.extend([self.C, self.b])
        rval1.extend(rval2)
        return rval1

    def fprop(self, state_below):
        """
        state_below is r_w?
        """
        state_below = state_below.reshape((state_below.shape[0], self.dim,
                                           self.context_length))
        rval = self.C.dimshuffle("x", 0, 1) * state_below
        rval = rval.sum(axis=2)
        return rval

    def get_default_cost(self):
        return Default()

    def get_monitoring_data_specs(self):
        """
        Returns data specs requiring both inputs and targets.

        Returns
        -------
        data_specs : TODO
            The data specifications for both inputs and targets.
        """
        space = CompositeSpace((self.get_input_space(),
                                self.get_output_space()))
        source = (self.get_input_source(), self.get_target_source())
        return (space, source)

    def get_monitoring_channels(self, data):
        rval = OrderedDict()
        # W_context = self.W_context
        # W_target = self.W_target
        # b = self.b
        # C = self.C
        # sq_W_context = T.sqr(W_context)
        # sq_W_target = T.sqr(W_target)
        # sq_b = T.sqr(b)
        # sq_c = T.sqr(C)
        # row_norms_W_context = T.sqrt(sq_W_context.sum(axis=1))
        # col_norms_W_context = T.sqrt(sq_W_context.sum(axis=0))
        # row_norms_W_target = T.sqrt(sq_W_target.sum(axis=1))
        # col_norms_W_target = T.sqrt(sq_W_target.sum(axis=0))
        # col_norms_b = T.sqrt(sq_b.sum(axis=0))
        # col_norms_c = T.sqrt(sq_c.sum(axis=0))
        # rval = OrderedDict([
        #     ('W_context_row_norms_min', row_norms_W_context.min()),
        #     ('W_context_row_norms_mean', row_norms_W_context.mean()),
        #     ('W_context_row_norms_max', row_norms_W_context.max()),
        #     ('W_context_col_norms_min', col_norms_W_context.min()),
        #     ('W_context_col_norms_mean', col_norms_W_context.mean()),
        #     ('W_context_col_norms_max', col_norms_W_context.max()),
        #     ('W_target_row_norms_min', row_norms_W_target.min()),
        #     ('W_target_row_norms_mean', row_norms_W_target.mean()),
        #     ('W_target_row_norms_max', row_norms_W_target.max()),
        #     ('W_target_col_norms_min', col_norms_W_target.min()),
        #     ('W_target_col_norms_mean', col_norms_W_target.mean()),
        #     ('W_target_col_norms_max', col_norms_W_target.max()),
        #     ('b_col_norms_min', col_norms_b.min()),
        #     ('b_col_norms_mean', col_norms_b.mean()),
        #     ('b_col_norms_max', col_norms_b.max()),
        #     ('c_col_norms_min', col_norms_c.min()),
        #     ('c_col_norms_mean', col_norms_c.mean()),
        #     ('c_col_norms_max', col_norms_c.max()),
        # ])
        rval["nll"] = self.cost_from_X(data)
        rval["perplexity"] = 10 ** (rval["nll"] / np.log(10).astype("float32"))
        return rval

    def score(self, all_q_w, q_h):
        sallwh = T.dot(q_h, all_q_w.T) + self.b.dimshuffle("x", 0)
        return sallwh

    def cost_from_X(self, data):
        X, Y = data
        X = self.projector_context.project(X)
        q_h = self.fprop(X)
        rval = self.cost(Y, q_h)
        return rval

    def cost(self, Y, q_h):
        all_q_w = self.projector_target.project(self.allY)
        s = self.score(all_q_w, q_h)
        p_w_given_h = T.nnet.softmax(s)
        return T.cast(
            -T.mean(T.diag(T.log(p_w_given_h)[T.arange(Y.shape[0]), Y])),
            theano.config.floatX)

    def apply_dropout(self, state, include_prob, scale, theano_rng,
                      input_space, mask_value=0, per_example=True):
        """
        Parameters
        ----------
        state : WRITEME
        include_prob : WRITEME
        scale : WRITEME
        theano_rng : WRITEME
        input_space : WRITEME
        mask_value : WRITEME
        per_example : bool, optional
            Sample a different mask value for every example in a batch.
            Defaults to `True`. If `False`, sample one mask per mini-batch.
        """
        if include_prob in [None, 1.0, 1]:
            return state
        assert scale is not None
        if isinstance(state, tuple):
            return tuple(self.apply_dropout(substate, include_prob, scale,
                                            theano_rng, mask_value)
                         for substate in state)
        # TODO: all of this assumes that if it's not a tuple, it's
        # a dense tensor. It hasn't been tested with sparse types.
        # A method to format the mask (or any other values) as
        # the given symbolic type should be added to the Spaces
        # interface.
        if per_example:
            mask = theano_rng.binomial(p=include_prob, size=state.shape,
                                       dtype=state.dtype)
        else:
            batch = input_space.get_origin_batch(1)
            mask = theano_rng.binomial(p=include_prob, size=batch.shape,
                                       dtype=state.dtype)
            rebroadcast = T.Rebroadcast(*zip(xrange(batch.ndim),
                                             [s == 1 for s in batch.shape]))
            mask = rebroadcast(mask)
        if mask_value == 0:
            rval = state * mask * scale
        else:
            rval = T.switch(mask, state * scale, mask_value)
        return T.cast(rval, state.dtype)

    def dropout_fprop(self, state_below, default_input_include_prob=0.5,
                      input_include_probs=None, default_input_scale=2.0,
                      input_scales=None, per_example=True):
        """
        Returns the output of the MLP, when applying dropout to the input
        and intermediate layers.

        Parameters
        ----------
        state_below : WRITEME
            The input to the MLP
        default_input_include_prob : WRITEME
        input_include_probs : WRITEME
        default_input_scale : WRITEME
        input_scales : WRITEME
        per_example : bool, optional
            Sample a different mask value for every example in a batch.
            Defaults to `True`. If `False`, sample one mask per mini-batch.

        Notes
        -----
        Each input to each layer is randomly included or excluded for
        each example. The probability of inclusion is independent for
        each input and each example. Each layer uses
        `default_input_include_prob` unless that layer's name appears as
        a key in `input_include_probs`, in which case the input inclusion
        probability is given by the corresponding value.

        Each feature is also multiplied by a scale factor. The scale
        factor for each layer's input scale is determined by the same
        scheme as the input probabilities.
        """
        warnings.warn("dropout doesn't use fixed_var_descr so it won't work "
                      "with algorithms that make more than one theano "
                      "function call per batch, such as BGD. Implementing "
                      "fixed_var_descr could increase the memory usage "
                      "though.")
        if input_include_probs is None:
            input_include_probs = {}
        if input_scales is None:
            input_scales = {}
        # self._validate_layer_names(list(input_include_probs.keys()))
        # self._validate_layer_names(list(input_scales.keys()))
        theano_rng = MRG_RandomStreams(max(self.rng.randint(2 ** 15), 1))
        # for layer in self.layers:
        #     layer_name = layer.layer_name
        #     if layer_name in input_include_probs:
        #         include_prob = input_include_probs[layer_name]
        #     else:
        include_prob = default_input_include_prob
        # if layer_name in input_scales:
        #     scale = input_scales[layer_name]
        # else:
        scale = default_input_scale
        state_below = self.apply_dropout(
            state=state_below,
            include_prob=include_prob,
            theano_rng=theano_rng,
            scale=scale,  # check
            mask_value=0,
            input_space=self.get_input_space(),
            per_example=per_example
        )
        state_below = self.fprop(state_below)
        return state_below

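# ---------------------------------------------------------------------------
# A minimal NumPy sketch (illustrative names) of the inverted dropout used
# by apply_dropout: units are kept with probability include_prob and the
# survivors are scaled by 1 / include_prob (2.0 for the default keep
# probability of 0.5) so the expected activation is unchanged.
def _dropout_sketch():
    import numpy as np

    rng = np.random.RandomState(0)
    include_prob = 0.5
    scale = 1.0 / include_prob                  # matches default_input_scale

    x = rng.randn(4, 6)                         # a batch of activations
    mask = rng.binomial(n=1, p=include_prob, size=x.shape)
    dropped = x * mask * scale                  # mask_value == 0 branch

    # E[mask * scale] = include_prob * (1 / include_prob) = 1
    return dropped
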