def softmax(z):
  assert z.ndim >= 1
  if z.ndim <= 2:
    return T.nnet.softmax(z)
  else:
    from TheanoUtil import time_batch_make_flat
    z_flat = time_batch_make_flat(z)
    assert z_flat.ndim == 2
    return T.reshape(T.nnet.softmax(z_flat), z.shape)
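For tensors with more than two dimensions, the function flattens time and batch into one axis, applies the 2D softmax, and reshapes back, which amounts to a softmax over the last axis. Below is a minimal numpy sketch of that behaviour, assuming the usual (time, batch, dim) layout; flat_softmax is a hypothetical stand-in for illustration only, not part of the codebase.

import numpy

def flat_softmax(z):
  # analogous to time_batch_make_flat + T.nnet.softmax + reshape in the Theano code
  assert z.ndim >= 1
  z_flat = z.reshape((-1, z.shape[-1]))
  e = numpy.exp(z_flat - z_flat.max(axis=1, keepdims=True))
  return (e / e.sum(axis=1, keepdims=True)).reshape(z.shape)

z = numpy.random.randn(7, 3, 5).astype("float32")  # (time, batch, dim)
p = flat_softmax(z)
assert numpy.allclose(p.sum(axis=-1), 1.0, atol=1e-5)  # each frame sums to 1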
def add_layer(self, layer):
  """
  :type layer: NetworkHiddenLayer.Layer
  :rtype NetworkHiddenLayer.Layer
  """
  assert layer.name
  layer_errors = layer.errors()
  if isinstance(layer, OutputLayer) or layer.name == "output" or layer_errors is not None:
    is_output_layer = True
    self.output[layer.name] = layer
  else:
    is_output_layer = False
    self.hidden[layer.name] = layer
  self.add_cost_and_constraints(layer)
  if layer_errors is not None:
    self.errors[layer.name] = layer_errors
  if is_output_layer:
    if getattr(layer, "p_y_given_x", None) is None and layer.output:
      # Small little hack for layers which we use as output-layers which don't set this.
      from TheanoUtil import time_batch_make_flat
      layer.p_y_given_x = time_batch_make_flat(layer.output)
    self.declare_train_params()
  return layer
def add_layer(self, layer):
  """
  :type layer: NetworkHiddenLayer.Layer
  :rtype NetworkHiddenLayer.Layer
  """
  assert layer.name
  layer_errors = layer.errors()
  if isinstance(layer, OutputLayer) or layer.name == "output" or layer_errors is not None:
    is_output_layer = True
    self.output[layer.name] = layer
  else:
    is_output_layer = False
    self.hidden[layer.name] = layer
  if layer_errors is not None:
    self.errors[layer.name] = layer_errors
  if is_output_layer:
    if getattr(layer, "p_y_given_x", None) is None and layer.output:
      # Small little hack for layers which we use as output-layers which don't set this.
      from TheanoUtil import time_batch_make_flat
      layer.p_y_given_x = layer.output
      layer.p_y_given_x_flat = time_batch_make_flat(layer.output)
    self.declare_train_params()
  return layer
def __init__(self, sources, n_out, index, y_in=None, target=None, target_index=None, sparse=False,
             cost_scale=1.0,
             L1=0.0, L2=0.0, L2_eye=None, varreg=0.0,
             output_L2_reg=0.0, output_entropy_reg=0.0, output_entropy_exp_reg=0.0,
             with_bias=True,
             mask="unity", dropout=0.0, batch_norm=False, layer_drop=0.0, residual=False, carry=False,
             sparse_filtering=False, gradient_scale=1.0, trainable=True, device=None,
             dtype='float32',
             **kwargs):
  """
  :param list[NetworkBaseLayer.Layer] sources: list of source layers
  :param int n_out: output dim of W_in and dim of bias
  :param float L1: l1-param-norm regularization
  :param float L2: l2-param-norm regularization
  :param str mask: "unity" or "dropout"
  :type dropout: float
  """
  super(Layer, self).__init__(**kwargs)
  self.index = index
  self.sources = sources; ":type: list[Layer]"
  self.num_sources = len(sources)
  if mask is None: mask = 'none'
  self.set_attr('mask', mask)
  self.set_attr('dropout', dropout)
  self.set_attr('sparse', sparse)
  self.set_attr('sparse_filtering', sparse_filtering)
  if not trainable:
    self.set_attr('trainable', trainable)  # only store if not default
    self.gradient_scale = 0.0  # just to be sure
  else:
    self.gradient_scale = gradient_scale
  if gradient_scale != 1.0:
    self.set_attr('gradient_scale', gradient_scale)
  self.set_attr('layer_drop', layer_drop)
  assert not carry, "not supported anymore"
  self.set_attr('residual', residual)
  self.set_attr('n_out', n_out)
  self.set_attr('L1', L1)
  self.set_attr('L2', L2)
  if L2_eye:
    self.set_attr('L2_eye', L2_eye)
  self.device = device  # if device else str(theano.config.device)
  for s in self.sources:
    s.transfer_output(self.device)
  self.set_attr('varreg', varreg)
  if output_L2_reg:
    self.set_attr('output_L2_reg', output_L2_reg)
  if output_entropy_reg:
    self.set_attr('output_entropy_reg', output_entropy_reg)
  if output_entropy_exp_reg:
    self.set_attr('output_entropy_exp_reg', output_entropy_exp_reg)
  self.set_attr('batch_norm', batch_norm)
  if y_in is not None:
    self.y_in = {}
    for k in y_in:
      if not isinstance(y_in[k], T.Variable): continue
      self.y_in[k] = time_batch_make_flat(y_in[k])  # TODO: better not flatten here...
      self.y_in[k].n_out = getattr(y_in[k], "n_out", None)
  else:
    self.y_in = None
  self.constraints = T.constant(0)
  if target:
    self.set_attr('target', target)
  if target_index:
    self.set_attr('target_index', target_index)
    assert target_index in self.network.j
    self.index = index = self.network.j[target_index]
  if cost_scale != 1:
    self.set_attr("cost_scale", cost_scale)
  if with_bias:
    self.b = self.add_param(self.create_bias(n_out), 'b_%s' % self.name)
  else:
    self.set_attr('with_bias', False)
    self.b = numpy.float32(0)
  self.mass = T.constant(1., name="mass_%s" % self.name, dtype='float32')
  self.masks = [None] * len(self.sources)
  assert mask in ['dropout', 'unity', 'none'], "invalid mask: %s" % mask
  if mask == "dropout" or (mask == 'none' and dropout > 0):
    assert 0.0 < dropout < 1.0
    # If we apply this mass during training then we don't need any mask or mass for testing.
    # The expected weight should be 1 in
    #   E[x] = mass * (1 - dropout)
    # so mass has to be 1 / (1 - dropout).
    self.mass = T.constant(1.0 / (1.0 - dropout), dtype='float32')
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    srng = RandomStreams(self.rng.randint(1234) + 1)
    if self.depth > 1:
      self.masks = [T.cast(srng.binomial(n=1, p=1 - dropout, size=(s.attrs['n_out'], self.depth)), theano.config.floatX)
                    for s in self.sources]
    else:
      self.masks = [T.cast(srng.binomial(n=1, p=1 - dropout, size=(s.attrs['n_out'],)), theano.config.floatX)
                    for s in self.sources]
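The dropout branch above scales the kept activations by mass = 1 / (1 - dropout) so that the expected activation matches the unscaled input and no rescaling is needed at test time. A small numpy check of that expectation argument (illustrative only, not part of the layer code):

import numpy

# E[mass * mask * x] = mass * (1 - dropout) * x = x  when mass = 1 / (1 - dropout)
rng = numpy.random.RandomState(1234)
dropout = 0.3
mass = 1.0 / (1.0 - dropout)
x = numpy.ones(100000, dtype="float32")
mask = rng.binomial(n=1, p=1 - dropout, size=x.shape).astype("float32")
print((mass * mask * x).mean())  # close to 1.0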
def __init__(self, loss, y, dtype=None, copy_input=None, copy_output=None, time_limit=0,
             grad_clip_z=None, grad_discard_out_of_bound_z=None,
             **kwargs):
  """
  :param theano.Variable index: index for batches
  :param str loss: e.g. 'ce'
  """
  super(OutputLayer, self).__init__(**kwargs)
  if dtype:
    self.set_attr('dtype', dtype)
  if copy_input:
    self.set_attr("copy_input", copy_input.name)
  if grad_clip_z is not None:
    self.set_attr("grad_clip_z", grad_clip_z)
  if grad_discard_out_of_bound_z is not None:
    self.set_attr("grad_discard_out_of_bound_z", grad_discard_out_of_bound_z)
  if not copy_input:
    self.z = self.b
    self.W_in = [self.add_param(self.create_forward_weights(source.attrs['n_out'], self.attrs['n_out'],
                                                            name="W_in_%s_%s" % (source.name, self.name)))
                 for source in self.sources]
    assert len(self.sources) == len(self.masks) == len(self.W_in)
    assert len(self.sources) > 0
    for source, m, W in zip(self.sources, self.masks, self.W_in):
      source_output = source.output
      # 4D input from TwoD Layers -> collapse height dimension
      if source_output.ndim == 4:
        source_output = source_output.sum(axis=0)
      if source.attrs['sparse']:
        if source.output.ndim == 3:
          input = source_output[:, :, 0]  # old sparse format
        else:
          assert source_output.ndim == 2
          input = source.output
        self.z += W[T.cast(input, 'int32')]
      elif m is None:
        self.z += self.dot(source_output, W)
      else:
        self.z += self.dot(self.mass * m * source_output, W)
  else:
    self.z = copy_input.output
  assert self.z.ndim == 3
  if grad_clip_z is not None:
    grad_clip_z = numpy.float32(grad_clip_z)
    self.z = theano.gradient.grad_clip(self.z, -grad_clip_z, grad_clip_z)
  if grad_discard_out_of_bound_z is not None:
    grad_discard_out_of_bound_z = numpy.float32(grad_discard_out_of_bound_z)
    self.z = grad_discard_out_of_bound(self.z, -grad_discard_out_of_bound_z, grad_discard_out_of_bound_z)
  if not copy_output:
    self.y = y
  else:
    self.index = copy_output.index
    self.y = copy_output.y_out
  if isinstance(y, T.Variable):
    self.y_data_flat = time_batch_make_flat(y)
  else:
    assert self.attrs.get("target", "").endswith("[sparse:coo]")
    assert isinstance(self.y, tuple)
    assert len(self.y) == 3
    s0, s1, weight = self.y
    from NativeOp import max_and_argmax_sparse
    n_time = self.z.shape[0]
    n_batch = self.z.shape[1]
    mask = self.network.j[self.attrs.get("target", "").replace("[sparse:coo]", "[sparse:coo:2:0]")]
    out_arg = T.zeros((n_time, n_batch), dtype="float32")
    out_max = T.zeros((n_time, n_batch), dtype="float32") - numpy.float32(1e16)
    out_arg, out_max = max_and_argmax_sparse(s0, s1, weight, mask, out_arg, out_max)
    assert out_arg.ndim == 2
    self.y_data_flat = out_arg.astype("int32")

  self.norm = numpy.float32(1)
  self.target_index = self.index
  if time_limit == 'inf':
    #target_length = self.index.shape[0]
    #mass = T.cast(T.sum(self.index),'float32')
    #self.index = theano.ifelse.ifelse(T.gt(self.z.shape[0],target_length),self.sources[0].index,self.index)
    #self.norm = mass / T.cast(T.sum(self.index),'float32')
    num = T.cast(T.sum(self.index), 'float32')
    if self.eval_flag:
      self.index = self.sources[0].index
    else:
      import theano.ifelse
      padx = T.zeros((T.abs_(self.index.shape[0] - self.z.shape[0]), self.index.shape[1], self.z.shape[2]),
                     'float32') + self.z[-1]
      pady = T.zeros((T.abs_(self.index.shape[0] - self.z.shape[0]), self.index.shape[1]), 'int32')  # + y[-1]
      padi = T.ones((T.abs_(self.index.shape[0] - self.z.shape[0]), self.index.shape[1]), 'int8')
      self.z = theano.ifelse.ifelse(T.lt(self.z.shape[0], self.index.shape[0]),
                                    T.concatenate([self.z, padx], axis=0), self.z)
      #self.z = theano.ifelse.ifelse(T.gt(self.z.shape[0], self.index.shape[0]), self.z[:self.index.shape[0]], self.z)
      self.y_data_flat = time_batch_make_flat(theano.ifelse.ifelse(T.gt(self.z.shape[0], self.index.shape[0]),
                                                                   T.concatenate([y, pady], axis=0), y))
      #self.index = theano.ifelse.ifelse(T.gt(self.z.shape[0], self.index.shape[0]), T.concatenate([T.ones((self.z.shape[0] - self.index.shape[0],self.z.shape[1]),'int8'), self.index], axis=0), self.index)
      self.index = theano.ifelse.ifelse(T.gt(self.z.shape[0], self.index.shape[0]),
                                        T.concatenate([padi, self.index], axis=0), self.index)
      self.norm = num / T.cast(T.sum(self.index), 'float32')
  elif time_limit > 0:
    end = T.min([self.z.shape[0], T.constant(time_limit, 'int32')])
    nom = T.cast(T.sum(self.index), 'float32')
    self.index = T.set_subtensor(self.index[end:], T.zeros_like(self.index[end:]))
    self.norm = nom / T.cast(T.sum(self.index), 'float32')
    self.z = T.set_subtensor(self.z[end:], T.zeros_like(self.z[end:]))

  #xs = [s.output for s in self.sources]
  #self.z = AccumulatorOpInstance(*[self.b] + xs + self.W_in)
  #outputs_info = None  #[ T.alloc(numpy.cast[theano.config.floatX](0), index.shape[1], self.attrs['n_out']) ]

  #self.z, _ = theano.scan(step,
  #                        sequences = [s.output for s in self.sources],
  #                        non_sequences = self.W_in + [self.b])

  self.set_attr('from', ",".join([s.name for s in self.sources]))
  self.i = (self.index.flatten() > 0).nonzero()
  self.j = ((1 - self.index.flatten()) > 0).nonzero()
  self.loss = loss.encode("utf8")
  self.attrs['loss'] = self.loss
  if self.loss == 'priori':
    self.priori = self.shared(value=numpy.ones((self.attrs['n_out'],), dtype=theano.config.floatX), borrow=True)
  #self.make_output(self.z, collapse = False)
  # Note that self.output is going to be overwritten in our derived classes.
  self.output = self.make_consensus(self.z) if self.depth > 1 else self.z
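In the time_limit > 0 branch above, frames past the limit are zeroed out in both index and z, and self.norm compensates for the removed frames. A small numpy sketch of that bookkeeping, assuming index is a (time, batch) 0/1 matrix and z is (time, batch, dim); the concrete shapes here are made up for illustration:

import numpy

time_limit = 4
index = numpy.ones((7, 2), dtype="int8")            # (time, batch), all frames valid
z = numpy.random.randn(7, 2, 5).astype("float32")   # (time, batch, dim)
end = min(z.shape[0], time_limit)
num = float(index.sum())                            # frames before truncation: 14
index[end:] = 0                                     # mask out frames past the limit
z[end:] = 0
norm = num / float(index.sum())                     # 14 / 8 = 1.75, rescales the loss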
def __init__(self, sources, n_out, index, y_in=None, target=None, target_index=None, sparse=False,
             cost_scale=1.0, input_scale=1.0,
             L1=0.0, L2=0.0, L2_eye=None, varreg=0.0,
             output_L2_reg=0.0, output_entropy_reg=0.0, output_entropy_exp_reg=0.0,
             with_bias=True,
             mask="unity", dropout=0.0, batch_drop=False, batch_norm=False, bn_use_sample=False,
             layer_drop=0.0, residual=False, carry=False,
             sparse_filtering=False, gradient_scale=1.0, trainable=True, device=None,
             dtype='float32',
             **kwargs):
  """
  :param list[NetworkBaseLayer.Layer] sources: list of source layers
  :param int n_out: output dim of W_in and dim of bias
  :param float L1: l1-param-norm regularization
  :param float L2: l2-param-norm regularization
  :param str mask: "unity" or "dropout"
  :type dropout: float
  """
  super(Layer, self).__init__(**kwargs)
  self.index = index
  self.sources = sources; ":type: list[Layer]"
  self.num_sources = len(sources)
  self.D = max([s.D for s in sources if isinstance(s, Layer)] + [0])
  if mask is None: mask = 'none'
  self.set_attr('mask', mask)
  self.set_attr('dropout', dropout)
  self.set_attr('sparse', sparse)
  self.set_attr('bn_use_sample', bn_use_sample)
  self.set_attr('sparse_filtering', sparse_filtering)
  if not trainable:
    self.set_attr('trainable', trainable)  # only store if not default
    self.gradient_scale = 0.0  # just to be sure
  else:
    self.gradient_scale = gradient_scale
  if gradient_scale != 1.0:
    self.set_attr('gradient_scale', gradient_scale)
  self.set_attr('layer_drop', layer_drop)
  assert not carry, "not supported anymore"
  self.set_attr('residual', residual)
  self.set_attr('n_out', n_out)
  self.set_attr('L1', L1)
  self.set_attr('L2', L2)
  if L2_eye:
    self.set_attr('L2_eye', L2_eye)
  self.device = device  # if device else str(theano.config.device)
  for s in self.sources:
    s.transfer_output(self.device)
  self.set_attr('varreg', varreg)
  if output_L2_reg:
    self.set_attr('output_L2_reg', output_L2_reg)
  if output_entropy_reg:
    self.set_attr('output_entropy_reg', output_entropy_reg)
  if output_entropy_exp_reg:
    self.set_attr('output_entropy_exp_reg', output_entropy_exp_reg)
  self.set_attr('batch_norm', batch_norm)
  self.set_attr('input_scale', input_scale)
  if y_in is not None:
    self.y_in = {}
    for k in y_in:
      if not isinstance(y_in[k], T.Variable): continue
      self.y_in[k] = time_batch_make_flat(y_in[k])  # TODO: better not flatten here...
      self.y_in[k].n_out = getattr(y_in[k], "n_out", None)
  else:
    self.y_in = None
  self.constraints = T.constant(0)
  if target:
    self.set_attr('target', target)
  if target_index:
    self.set_attr('target_index', target_index)
    assert target_index in self.network.j
    self.index = index = self.network.j[target_index]
  if cost_scale != 1:
    self.set_attr("cost_scale", cost_scale)
  if with_bias:
    self.b = self.add_param(self.create_bias(n_out), 'b_%s' % self.name)
  else:
    self.set_attr('with_bias', False)
    self.b = numpy.float32(0)
  self.mass = T.constant(1., name="mass_%s" % self.name, dtype='float32')
  self.masks = [None] * len(self.sources)
  assert mask in ['dropout', 'unity', 'none'], "invalid mask: %s" % mask
  if mask == "dropout" or (mask == 'none' and dropout > 0):
    assert 0.0 < dropout < 1.0
    # If we apply this mass during training then we don't need any mask or mass for testing.
    # The expected weight should be 1 in
    #   E[x] = mass * (1 - dropout)
    # so mass has to be 1 / (1 - dropout).
    self.mass = T.constant(1.0 / (1.0 - dropout), dtype='float32')
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    srng = RandomStreams(self.rng.randint(1234) + 1)
    if self.depth > 1:
      self.masks = [T.cast(srng.binomial(n=1, p=1 - dropout, size=(s.attrs['n_out'], self.depth)), theano.config.floatX)
                    for s in self.sources]
    else:
      if batch_drop:
        self.masks = [T.cast(srng.binomial(n=1, p=1 - dropout, size=s.output.shape), theano.config.floatX)
                      for s in self.sources]
      else:
        self.masks = [T.cast(srng.binomial(n=1, p=1 - dropout, size=(s.attrs['n_out'],)), theano.config.floatX)
                      for s in self.sources]
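The batch_drop flag above changes only the shape of the sampled dropout mask: without it the mask has shape (n_out,) and is broadcast over all time steps and batch entries, while with batch_drop one Bernoulli draw is made per element of the source output. A small numpy illustration of the two shapes, assuming a (time, batch, n_out) output layout; the concrete sizes are made up:

import numpy

rng = numpy.random.RandomState(0)
dropout, n_out = 0.1, 5
# per-feature mask, shared across time and batch (batch_drop=False)
per_feature = rng.binomial(1, 1 - dropout, size=(n_out,))
# independent mask per output element (batch_drop=True)
per_element = rng.binomial(1, 1 - dropout, size=(7, 3, n_out))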
def __init__(self, loss, y, dtype=None, copy_input=None, copy_output=None, time_limit=0,
             use_source_index=False,
             compute_priors=False, compute_priors_exp_average=0,
             compute_distortions=False,
             softmax_smoothing=1.0, grad_clip_z=None, grad_discard_out_of_bound_z=None, normalize_length=False,
             exclude_labels=[],
             apply_softmax=True,
             substract_prior_from_output=False,
             input_output_similarity=None,
             input_output_similarity_scale=1,
             **kwargs):
  """
  :param theano.Variable index: index for batches
  :param str loss: e.g. 'ce'
  """
  super(OutputLayer, self).__init__(**kwargs)
  self.set_attr("normalize_length", normalize_length)
  if dtype:
    self.set_attr('dtype', dtype)
  if copy_input:
    self.set_attr("copy_input", copy_input.name)
  if grad_clip_z is not None:
    self.set_attr("grad_clip_z", grad_clip_z)
  if compute_distortions:
    self.set_attr("compute_distortions", compute_distortions)
  if grad_discard_out_of_bound_z is not None:
    self.set_attr("grad_discard_out_of_bound_z", grad_discard_out_of_bound_z)
  if not apply_softmax:
    self.set_attr("apply_softmax", apply_softmax)
  if substract_prior_from_output:
    self.set_attr("substract_prior_from_output", substract_prior_from_output)
  if input_output_similarity:
    self.set_attr("input_output_similarity", input_output_similarity)
    self.set_attr("input_output_similarity_scale", input_output_similarity_scale)
  if use_source_index:
    self.set_attr("use_source_index", use_source_index)
    src_index = self.sources[0].index
    self.index = src_index
  if not copy_input:
    self.z = self.b
    self.W_in = [self.add_param(self.create_forward_weights(source.attrs['n_out'], self.attrs['n_out'],
                                                            name="W_in_%s_%s" % (source.name, self.name)))
                 for source in self.sources]
    assert len(self.sources) == len(self.masks) == len(self.W_in)
    assert len(self.sources) > 0
    for source, m, W in zip(self.sources, self.masks, self.W_in):
      source_output = source.output
      # 4D input from TwoD Layers -> collapse height dimension
      if source_output.ndim == 4:
        source_output = source_output.sum(axis=0)
      if source.attrs['sparse']:
        if source.output.ndim == 3:
          input = source_output[:, :, 0]  # old sparse format
        else:
          assert source_output.ndim == 2
          input = source.output
        self.z += W[T.cast(input, 'int32')]
      elif m is None:
        self.z += self.dot(source_output, W)
      else:
        self.z += self.dot(self.mass * m * source_output, W)
  else:
    self.z = copy_input.output
  assert self.z.ndim == 3
  if grad_clip_z is not None:
    grad_clip_z = numpy.float32(grad_clip_z)
    self.z = theano.gradient.grad_clip(self.z, -grad_clip_z, grad_clip_z)
  if grad_discard_out_of_bound_z is not None:
    grad_discard_out_of_bound_z = numpy.float32(grad_discard_out_of_bound_z)
    self.z = grad_discard_out_of_bound(self.z, -grad_discard_out_of_bound_z, grad_discard_out_of_bound_z)
  if not copy_output:
    self.y = y
  else:
    self.index = copy_output.index
    self.y = copy_output.y_out
  if y is None:
    self.y_data_flat = None
  elif isinstance(y, T.Variable):
    self.y_data_flat = time_batch_make_flat(y)
  else:
    assert self.attrs.get("target", "").endswith("[sparse:coo]")
    assert isinstance(self.y, tuple)
    assert len(self.y) == 3
    s0, s1, weight = self.y
    from NativeOp import max_and_argmax_sparse
    n_time = self.z.shape[0]
    n_batch = self.z.shape[1]
    mask = self.network.j[self.attrs.get("target", "").replace("[sparse:coo]", "[sparse:coo:2:0]")]
    out_arg = T.zeros((n_time, n_batch), dtype="float32")
    out_max = T.zeros((n_time, n_batch), dtype="float32") - numpy.float32(1e16)
    out_arg, out_max = max_and_argmax_sparse(s0, s1, weight, mask, out_arg, out_max)
    assert out_arg.ndim == 2
    self.y_data_flat = out_arg.astype("int32")

  self.norm = numpy.float32(1)
  self.target_index = self.index
  if time_limit == 'inf':
    # target_length = self.index.shape[0]
    # mass = T.cast(T.sum(self.index),'float32')
    # self.index = theano.ifelse.ifelse(T.gt(self.z.shape[0],target_length),self.sources[0].index,self.index)
    # self.norm = mass / T.cast(T.sum(self.index),'float32')
    num = T.cast(T.sum(self.index), 'float32')
    if self.eval_flag:
      self.index = self.sources[0].index
    else:
      import theano.ifelse
      padx = T.zeros((T.abs_(self.index.shape[0] - self.z.shape[0]), self.index.shape[1], self.z.shape[2]),
                     'float32') + self.z[-1]
      pady = T.zeros((T.abs_(self.index.shape[0] - self.z.shape[0]), self.index.shape[1]), 'int32')  # + y[-1]
      padi = T.ones((T.abs_(self.index.shape[0] - self.z.shape[0]), self.index.shape[1]), 'int8')
      self.z = theano.ifelse.ifelse(T.lt(self.z.shape[0], self.index.shape[0]),
                                    T.concatenate([self.z, padx], axis=0), self.z)
      # self.z = theano.ifelse.ifelse(T.gt(self.z.shape[0], self.index.shape[0]), self.z[:self.index.shape[0]], self.z)
      self.y_data_flat = time_batch_make_flat(theano.ifelse.ifelse(T.gt(self.z.shape[0], self.index.shape[0]),
                                                                   T.concatenate([y, pady], axis=0), y))
      # self.index = theano.ifelse.ifelse(T.gt(self.z.shape[0], self.index.shape[0]), T.concatenate([T.ones((self.z.shape[0] - self.index.shape[0],self.z.shape[1]),'int8'), self.index], axis=0), self.index)
      self.index = theano.ifelse.ifelse(T.gt(self.z.shape[0], self.index.shape[0]),
                                        T.concatenate([padi, self.index], axis=0), self.index)
      self.norm *= num / T.cast(T.sum(self.index), 'float32')
  elif time_limit > 0:
    end = T.min([self.z.shape[0], T.constant(time_limit, 'int32')])
    num = T.cast(T.sum(self.index), 'float32')
    self.index = T.set_subtensor(self.index[end:], T.zeros_like(self.index[end:]))
    self.norm = num / T.cast(T.sum(self.index), 'float32')
    self.z = T.set_subtensor(self.z[end:], T.zeros_like(self.z[end:]))

  # xs = [s.output for s in self.sources]
  # self.z = AccumulatorOpInstance(*[self.b] + xs + self.W_in)
  # outputs_info = None  #[ T.alloc(numpy.cast[theano.config.floatX](0), index.shape[1], self.attrs['n_out']) ]

  # self.z, _ = theano.scan(step,
  #                         sequences = [s.output for s in self.sources],
  #                         non_sequences = self.W_in + [self.b])

  self.set_attr('from', ",".join([s.name for s in self.sources]))
  index_flat = self.index.flatten()
  for label in exclude_labels:
    index_flat = T.set_subtensor(index_flat[(T.eq(self.y_data_flat, label) > 0).nonzero()], numpy.int8(0))
  self.i = (index_flat > 0).nonzero()
  self.j = ((numpy.int32(1) - index_flat) > 0).nonzero()
  self.loss = as_str(loss.encode("utf8"))
  self.attrs['loss'] = self.loss
  if compute_priors:
    self.set_attr('compute_priors', compute_priors)
    if compute_priors_exp_average:
      self.set_attr('compute_priors_exp_average', compute_priors_exp_average)
  if softmax_smoothing != 1.0:
    self.attrs['softmax_smoothing'] = softmax_smoothing
    print >> log.v4, "Logits before the softmax scaled with factor ", softmax_smoothing
    self.z *= numpy.float32(softmax_smoothing)
  if self.loss == 'priori':
    self.priori = self.shared(value=numpy.ones((self.attrs['n_out'],), dtype=theano.config.floatX), borrow=True)
  if input_output_similarity:
    # First a self-similarity of input and output,
    # and then add -similarity or distance between those to the constraints,
    # so that the input and output correlate on a frame-by-frame basis.
    # Here some other similarities/distances we could try:
    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
    # https://brenocon.com/blog/2012/03/cosine-similarity-pearson-correlation-and-ols-coefficients/
    from TheanoUtil import self_similarity_cosine
    self_similarity = self_similarity_cosine  # maybe other
    data_layer = self.find_data_layer()
    assert data_layer
    assert data_layer.output.ndim == 3
    n_time = data_layer.output.shape[0]
    n_batch = data_layer.output.shape[1]
    findex = T.cast(self.output_index(), "float32")
    findex_bc = findex.reshape((n_time * n_batch,)).dimshuffle(0, 'x')
    findex_sum = T.sum(findex)
    data = data_layer.output.reshape((n_time * n_batch, data_layer.output.shape[2])) * findex_bc
    assert self.z.ndim == 3
    z = self.z.reshape((n_time * n_batch, self.z.shape[2])) * findex_bc
    data_self_sim = T.flatten(self_similarity(data))
    z_self_sim = T.flatten(self_similarity(z))
    assert data_self_sim.ndim == z_self_sim.ndim == 1
    sim = T.dot(data_self_sim, z_self_sim)  # maybe others make sense
    assert sim.ndim == 0
    # sim is ~ proportional to T * T, so divide by T.
    sim *= numpy.float32(input_output_similarity_scale) / findex_sum
    self.constraints -= sim
  # self.make_output(self.z, collapse = False)
  # Note that self.output is going to be overwritten in our derived classes.
  self.output = self.make_consensus(self.z) if self.depth > 1 else self.z
  self.y_m = None  # flat log(self.p_y_given_x)
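The input_output_similarity constraint above dots the flattened frame self-similarity of the data layer with that of the logits and subtracts the scaled result from the constraints. The exact behaviour of TheanoUtil.self_similarity_cosine is not reproduced here; the numpy sketch below uses a hypothetical cosine_self_sim that computes all pairwise frame cosine similarities, only to illustrate the shape of the computation:

import numpy

def cosine_self_sim(x):
  # all pairwise cosine similarities between rows (frames), flattened to a vector
  norm = numpy.linalg.norm(x, axis=1, keepdims=True) + 1e-8
  xn = x / norm
  return numpy.dot(xn, xn.T).flatten()

# (time*batch, dim) matrices with masked frames already zeroed, as in the code above
data = numpy.random.randn(21, 10).astype("float32")
z = numpy.random.randn(21, 8).astype("float32")
sim = float(numpy.dot(cosine_self_sim(data), cosine_self_sim(z)))
constraint_term = -1.0 / 21.0 * sim  # scaled by input_output_similarity_scale / findex_sum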