def __init__(self, numpy_rng, n_ins=784, n_outs=24, l1_reg = None, l2_reg = None,
             hidden_layers_sizes=None, hidden_activation='tanh', output_activation='linear',
             var_floor=0.01, n_component=1, beta_opt=False, use_rprop=0,
             rprop_init_update=0.001, eff_sample_size=0.8, mean_log_det=-100.0):
    """Build a multi-stream mixture-density network (MDN) on top of a stack of
    tanh hidden layers, wiring up the Theano symbolic graph for its training cost.

    :param numpy_rng: numpy RandomState used to initialise all layer weights
    :param n_ins: dimensionality of the input features
    :param n_outs: dimensionality of the output (per mixture component)
    :param l1_reg: L1 regularisation weight, or None to disable (currently unused here)
    :param l2_reg: L2 regularisation weight, or None to disable
    :param hidden_layers_sizes: list of hidden-layer widths; defaults to [500, 500]
    :param hidden_activation: accepted for interface compatibility; the layers
        below are hard-wired to T.tanh regardless of this value
    :param output_activation: stored on the instance; not used in this constructor
    :param var_floor: variance floor passed through to the MDN output layer
    :param n_component: number of mixture components
    :param beta_opt: if True, use the beta-optimisation objective instead of
        maximum likelihood (only supported for n_component == 1)
    :param use_rprop, rprop_init_update: Rprop training settings, stored for later use
    :param eff_sample_size: effective-sample-size target used to derive beta
    :param mean_log_det: normalising constant for the log-determinant term
    """
    # Avoid the shared-mutable-default pitfall; behaviour is unchanged for callers.
    if hidden_layers_sizes is None:
        hidden_layers_sizes = [500, 500]

    logger = logging.getLogger("Multi-stream DNN initialization")

    self.sigmoid_layers = []
    self.params = []
    self.delta_params = []
    self.final_layers = []
    self.n_outs = n_outs
    self.n_layers = len(hidden_layers_sizes)
    self.output_activation = output_activation
    self.var_floor = var_floor
    self.use_rprop = use_rprop
    self.rprop_init_update = rprop_init_update
    self.l1_reg = l1_reg
    self.l2_reg = l2_reg
    self.beta_opt = beta_opt
    self.eff_sample_size = eff_sample_size
    self.mean_log_det = mean_log_det

    assert self.n_layers > 0

    # allocate symbolic variables for the data
    self.x = T.matrix('x')
    self.y = T.matrix('y')

    # Stack the hidden layers; layer 0 reads the input matrix, later layers
    # read the previous layer's output.
    for i in xrange(self.n_layers):
        if i == 0:
            input_size = n_ins
        else:
            input_size = hidden_layers_sizes[i - 1]
        if i == 0:
            layer_input = self.x
        else:
            layer_input = self.sigmoid_layers[-1].output

        sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                    input=layer_input,
                                    n_in=input_size,
                                    n_out=hidden_layers_sizes[i],
                                    activation=T.tanh)  ##T.nnet.sigmoid) #
        self.sigmoid_layers.append(sigmoid_layer)
        self.params.extend(sigmoid_layer.params)
        self.delta_params.extend(sigmoid_layer.delta_params)

    # Mixture-density output layer on top of the last hidden layer.
    hidden_output_size = hidden_layers_sizes[-1]
    self.final_layer = MixtureDensityOutputLayer(rng = numpy_rng,
                                                 input = sigmoid_layer.output,
                                                 n_in = hidden_output_size,
                                                 n_out = self.n_outs,
                                                 n_component = n_component,
                                                 var_floor = self.var_floor)
    self.params.extend(self.final_layer.params)
    self.delta_params.extend(self.final_layer.delta_params)

    ### Maximum likelihood
    self.finetune_cost = 0.0
    self.errors = 0.0

    # Derive beta from the target effective sample size; used only by beta_opt.
    epsd = self.eff_sample_size**(-2.0/(n_outs + 2.0))
    beta = (epsd - 1.0) + math.sqrt(epsd*(epsd - 1.0))

    if self.beta_opt:
        assert n_component == 1, "beta optimisation only implemented for single-component MDNs"
        for i in xrange(n_component):  #n_component
            sigma = self.final_layer.sigma[:, i*n_outs:(i+1)*n_outs]
            mu = self.final_layer.mu[:, i*n_outs:(i+1)*n_outs]
            mix_weight = self.final_layer.mix[:, i]

            xEx = -0.5 * beta * T.sum(((self.y - mu)**2) * T.inv(sigma), axis=1)
            exponent = (0.5 * (n_outs + 2.0) * T.log(1 + beta)) + xEx
            point_fit = T.exp(exponent) - beta
            log_det_mult = -0.5 * beta * T.sum(T.log(sigma), axis=1)
            log_det_mult += (0.5 * beta * self.mean_log_det)  # normalise by mean_log_det
            beta_obj = (mix_weight**2) * point_fit * T.exp(log_det_mult)
            self.finetune_cost += -T.mean(beta_obj)

        # lines to compute debugging information for later printing
        #self.errors = T.min(T.min(T.log(sigma), axis=1))
        #self.errors = T.mean(T.sum(T.log(sigma), axis=1)) # computes mean_log_det
        #self.errors = -xEx # (vector quantity) should be about 0.5 * beta * n_outs
        #self.errors = point_fit # (vector quantity) should be about one
        #self.errors = T.mean(T.exp(exponent)) / T.exp(T.max(exponent)) # fraction of the data used, should be about efficiency
        #self.errors = T.mean(point_fit) # should be about one
        #self.errors = log_det_mult # (vector quantity) about zero, or always less if using Rprop
        #self.errors = beta_obj # (vector quantity) objective function terms
        #self.errors = self.finetune_cost # disable this line below when debugging
    else:
        # Negative log-likelihood of the mixture, computed with the
        # log-sum-exp trick for numerical stability.
        all_mix_prob = []
        logger.debug("n_component = %d", n_component)  # was a stray debug `print`
        for i in xrange(n_component):  #n_component
            sigma = self.final_layer.sigma[:, i*n_outs:(i+1)*n_outs]
            mu = self.final_layer.mu[:, i*n_outs:(i+1)*n_outs]
            mix_weight = self.final_layer.mix[:, i]

            xEx = -0.5 * T.sum(((self.y - mu)**2) * T.inv(sigma), axis=1)
            normaliser = 0.5 * ( n_outs * T.log(2 * numpy.pi) + T.sum(T.log(sigma), axis=1))
            exponent = xEx + T.log(mix_weight) - normaliser
            all_mix_prob.append(exponent)

        max_exponent = T.max(all_mix_prob, axis=0, keepdims=True)
        mod_exponent = T.as_tensor_variable(all_mix_prob) - max_exponent
        self.finetune_cost = - T.mean(max_exponent + T.log(T.sum(T.exp(mod_exponent), axis=0)))

    if self.l2_reg is not None:
        # NOTE(review): this loop stops at n_layers-1, so the last hidden
        # layer's W is not regularised, whereas the plain-DNN variant in this
        # file regularises all n_layers hidden weights — confirm intentional.
        for i in xrange(self.n_layers-1):
            W = self.params[i * 2]
            self.finetune_cost += self.l2_reg * T.sqr(W).sum()
        self.finetune_cost += self.l2_reg * T.sqr(self.final_layer.W_mu).sum()
        self.finetune_cost += self.l2_reg * T.sqr(self.final_layer.W_sigma).sum()
        self.finetune_cost += self.l2_reg * T.sqr(self.final_layer.W_mix).sum()

    self.errors = self.finetune_cost # disable this line if debugging beta_opt
def __init__(self, numpy_rng, theano_rng=None, n_ins=784, n_outs=10, l1_reg = None, l2_reg = None,
             hidden_layers_sizes=None, hidden_activation='tanh', output_activation='linear',
             projection_insize=100, projection_outsize=10, first_layer_split=True,
             expand_by_minibatch=False, initial_projection_distrib='gaussian',
             use_rprop=0, rprop_init_update=0.001):
    """Build a token-projection DNN (TP-DNN): a learned projection of token
    identities is concatenated with the acoustic/linguistic input and fed
    through a stack of tanh hidden layers to a linear or sigmoid output.

    :param numpy_rng: numpy RandomState used to initialise all layer weights
    :param theano_rng: Theano RandomStreams; created from numpy_rng if None
    :param n_ins: dimensionality of the main (non-projected) input
    :param n_outs: dimensionality of the output
    :param l1_reg, l2_reg: stored on the instance; NOTE(review) no
        regularisation term is added to the cost in this constructor,
        unlike the plain-DNN variant — confirm this is intentional
    :param hidden_layers_sizes: list of hidden-layer widths; defaults to [500, 500]
    :param hidden_activation: accepted for interface compatibility; layers are
        hard-wired to T.tanh regardless of this value
    :param output_activation: 'linear' or 'sigmoid'; anything else aborts
    :param projection_insize: number of distinct tokens (one-hot width)
    :param projection_outsize: dimensionality of the learned projection
    :param first_layer_split: if True, the first hidden layer treats the main
        input and the projection as two separately-weighted streams
    :param expand_by_minibatch: if True, x_proj is an int vector of token
        indexes expanded to one-hot symbolically; otherwise a ready-made matrix
    :param initial_projection_distrib: initialisation scheme for the projection
    :param use_rprop, rprop_init_update: Rprop training settings, stored for later use
    """
    ## beginning at label index 1, 5 blocks of 49 inputs each to be projected to 10 dim.
    # Avoid the shared-mutable-default pitfall; behaviour is unchanged for callers.
    if hidden_layers_sizes is None:
        hidden_layers_sizes = [500, 500]

    logger = logging.getLogger("TP-DNN initialization")

    self.projection_insize = projection_insize
    self.projection_outsize = projection_outsize

    self.sigmoid_layers = []
    self.params = []
    self.delta_params = []
    self.n_layers = len(hidden_layers_sizes)
    self.output_activation = output_activation
    self.use_rprop = use_rprop
    self.rprop_init_update = rprop_init_update
    self.l1_reg = l1_reg
    self.l2_reg = l2_reg

    assert self.n_layers > 0

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
    self.numpy_rng = numpy_rng

    # allocate symbolic variables for the data
    self.x = T.matrix('x')
    if expand_by_minibatch:
        self.x_proj = T.ivector('x_proj')
    else:
        self.x_proj = T.matrix('x_proj')
    self.y = T.matrix('y')

    if expand_by_minibatch:
        # Expand integer token indexes to a one-hot matrix inside the graph.
        z = theano.tensor.zeros((self.x_proj.shape[0], self.projection_insize))
        indexes = self.x_proj
        one_hot = theano.tensor.set_subtensor(z[theano.tensor.arange(self.x_proj.shape[0]), indexes], 1)
        projection_input = one_hot
    else:
        projection_input = self.x_proj

    ## Make projection layer
    self.projection_layer = TokenProjectionLayer(rng=numpy_rng,
                                                 input=projection_input,
                                                 projection_insize = self.projection_insize,
                                                 projection_outsize = self.projection_outsize,
                                                 initial_projection_distrib=initial_projection_distrib)
    self.params.extend(self.projection_layer.params)
    self.delta_params.extend(self.projection_layer.delta_params)

    first_layer_input = T.concatenate([self.x, self.projection_layer.output], axis=1)

    # Stack the hidden layers; the first one optionally splits its input into
    # the main stream and the projection stream.
    for i in xrange(self.n_layers):
        if i == 0:
            input_size = n_ins + self.projection_outsize
        else:
            input_size = hidden_layers_sizes[i - 1]
        if i == 0:
            layer_input = first_layer_input
        else:
            layer_input = self.sigmoid_layers[-1].output

        if i == 0 and first_layer_split:
            sigmoid_layer = SplitHiddenLayer(rng=numpy_rng,
                                             input=layer_input,
                                             n_in1=n_ins, n_in2=self.projection_outsize,
                                             n_out=hidden_layers_sizes[i],
                                             activation=T.tanh)  ##T.nnet.sigmoid) #
        else:
            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.tanh)  ##T.nnet.sigmoid) #
        self.sigmoid_layers.append(sigmoid_layer)
        self.params.extend(sigmoid_layer.params)
        self.delta_params.extend(sigmoid_layer.delta_params)

    # add final layer
    if self.output_activation == 'linear':
        self.final_layer = LinearLayer(rng = numpy_rng,
                                       input=self.sigmoid_layers[-1].output,
                                       n_in=hidden_layers_sizes[-1],
                                       n_out=n_outs)
    elif self.output_activation == 'sigmoid':
        self.final_layer = SigmoidLayer(rng = numpy_rng,
                                        input=self.sigmoid_layers[-1].output,
                                        n_in=hidden_layers_sizes[-1],
                                        n_out=n_outs,
                                        activation=T.nnet.sigmoid)
    else:
        logger.critical("This output activation function: %s is not supported right now!" %(self.output_activation))
        sys.exit(1)

    self.params.extend(self.final_layer.params)
    self.delta_params.extend(self.final_layer.delta_params)

    ## params for 2 hidden layers, projection, first split layer, will look like this:
    ## [W_proj; W_1a, W_1b, b_1; W_2 b_2; W_o, b_o]

    ### MSE: summed squared error per frame, averaged over the minibatch.
    self.finetune_cost = T.mean(T.sum( (self.final_layer.output-self.y)*(self.final_layer.output-self.y), axis=1 ))
    self.errors = T.mean(T.sum( (self.final_layer.output-self.y)*(self.final_layer.output-self.y), axis=1 ))
def __init__(self, numpy_rng, theano_rng=None, n_ins=784, n_outs=10, l1_reg = None, l2_reg = None,
             hidden_layers_sizes=None, hidden_activation='tanh', output_activation='linear',
             use_rprop=0, rprop_init_update=0.001):
    """Build a plain feed-forward DNN: a stack of tanh hidden layers followed
    by a linear or sigmoid output layer, trained with an MSE cost plus
    optional L1/L2 regularisation on the hidden-layer weights.

    :param numpy_rng: numpy RandomState used to initialise all layer weights
    :param theano_rng: Theano RandomStreams; created from numpy_rng if None
    :param n_ins: dimensionality of the input features
    :param n_outs: dimensionality of the output
    :param l1_reg, l2_reg: regularisation weights, or None to disable
    :param hidden_layers_sizes: list of hidden-layer widths; defaults to [500, 500]
    :param hidden_activation: accepted for interface compatibility; layers are
        hard-wired to T.tanh regardless of this value
    :param output_activation: 'linear' or 'sigmoid'; anything else aborts
    :param use_rprop, rprop_init_update: Rprop training settings, stored for later use
    """
    # Avoid the shared-mutable-default pitfall; behaviour is unchanged for callers.
    if hidden_layers_sizes is None:
        hidden_layers_sizes = [500, 500]

    logger = logging.getLogger("DNN initialization")

    self.sigmoid_layers = []
    self.params = []
    self.delta_params = []
    self.n_layers = len(hidden_layers_sizes)
    self.output_activation = output_activation
    self.use_rprop = use_rprop
    self.rprop_init_update = rprop_init_update
    self.l1_reg = l1_reg
    self.l2_reg = l2_reg

    assert self.n_layers > 0

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

    # allocate symbolic variables for the data
    self.x = T.matrix('x')
    self.y = T.matrix('y')

    # Stack the hidden layers; layer 0 reads the input matrix, later layers
    # read the previous layer's output.
    for i in xrange(self.n_layers):
        if i == 0:
            input_size = n_ins
        else:
            input_size = hidden_layers_sizes[i - 1]
        if i == 0:
            layer_input = self.x
        else:
            layer_input = self.sigmoid_layers[-1].output

        sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                    input=layer_input,
                                    n_in=input_size,
                                    n_out=hidden_layers_sizes[i],
                                    activation=T.tanh)  ##T.nnet.sigmoid) #
        self.sigmoid_layers.append(sigmoid_layer)
        self.params.extend(sigmoid_layer.params)
        self.delta_params.extend(sigmoid_layer.delta_params)

    # add final layer
    if self.output_activation == 'linear':
        self.final_layer = LinearLayer(rng = numpy_rng,
                                       input=self.sigmoid_layers[-1].output,
                                       n_in=hidden_layers_sizes[-1],
                                       n_out=n_outs)
    elif self.output_activation == 'sigmoid':
        self.final_layer = SigmoidLayer(rng = numpy_rng,
                                        input=self.sigmoid_layers[-1].output,
                                        n_in=hidden_layers_sizes[-1],
                                        n_out=n_outs,
                                        activation=T.nnet.sigmoid)
    else:
        logger.critical("This output activation function: %s is not supported right now!" %(self.output_activation))
        sys.exit(1)

    self.params.extend(self.final_layer.params)
    self.delta_params.extend(self.final_layer.delta_params)

    ### MSE: summed squared error per frame, averaged over the minibatch.
    self.finetune_cost = T.mean(T.sum( (self.final_layer.output-self.y)*(self.final_layer.output-self.y), axis=1 ))
    self.errors = T.mean(T.sum( (self.final_layer.output-self.y)*(self.final_layer.output-self.y), axis=1 ))

    ### L1-norm over hidden-layer weights (params alternate [W, b, W, b, ...]).
    if self.l1_reg is not None:
        for i in xrange(self.n_layers):
            W = self.params[i * 2]
            self.finetune_cost += self.l1_reg * (abs(W).sum())

    ### L2-norm over hidden-layer weights; the output layer's W is not penalised.
    if self.l2_reg is not None:
        for i in xrange(self.n_layers):
            W = self.params[i * 2]
            self.finetune_cost += self.l2_reg * T.sqr(W).sum()