def __init__(self, K, vocab_size, num_chars, W_init, nhidden, embed_dim, dropout,
             train_emb, char_dim, use_feat, gating_fn, save_attn=False):
    """Store the hyper-parameters and compile the train/validate functions.

    The symbolic inputs are created here and exposed as ``self.inps``; their
    order in that list is the positional argument order of ``train_fn`` and
    ``validate_fn``. When ``W_init`` is None a Glorot-normal matrix of shape
    ``(vocab_size, embed_dim)`` is sampled and passed to ``build_network``.
    """
    # Plain hyper-parameters.
    self.nhidden = nhidden
    self.embed_dim = embed_dim
    self.dropout = dropout
    self.train_emb = train_emb
    self.char_dim = char_dim
    self.learning_rate = LEARNING_RATE
    self.num_chars = num_chars
    self.use_feat = use_feat
    self.save_attn = save_attn
    self.gating_fn = gating_fn
    self.use_chars = self.char_dim != 0

    if W_init is None:
        sampler = lasagne.init.GlorotNormal()
        W_init = sampler.sample((vocab_size, self.embed_dim))

    # Symbolic inputs.
    doc_var = T.itensor3('doc')
    query_var = T.itensor3('quer')
    cand_var = T.wtensor3('cand')
    docmask_var = T.bmatrix('doc_mask')
    qmask_var = T.bmatrix('q_mask')
    candmask_var = T.bmatrix('c_mask')
    target_var = T.ivector('ans')
    feat_var = T.imatrix('feat')
    doc_toks = T.imatrix('dchars')
    qry_toks = T.imatrix('qchars')
    tok_var = T.imatrix('tok')
    tok_mask = T.bmatrix('tok_mask')
    cloze_var = T.ivector('cloze')

    # build_network reads the inputs from self.inps, so set it first.
    self.inps = [
        doc_var, doc_toks, query_var, qry_toks, cand_var, target_var,
        docmask_var, qmask_var, tok_var, tok_mask, candmask_var,
        feat_var, cloze_var,
    ]

    built = self.build_network(K, vocab_size, W_init)
    self.predicted_probs, predicted_probs_val, self.network, W_emb, attentions = built

    cross_entropy = T.nnet.categorical_crossentropy
    accuracy = lasagne.objectives.categorical_accuracy

    # Training-time and validation-time objectives.
    self.loss_fn = cross_entropy(self.predicted_probs, target_var).mean()
    self.eval_fn = accuracy(self.predicted_probs, target_var).mean()
    loss_fn_val = cross_entropy(predicted_probs_val, target_var).mean()
    eval_fn_val = accuracy(predicted_probs_val, target_var).mean()

    self.params = L.get_all_params(self.network, trainable=True)
    updates = lasagne.updates.adam(
        self.loss_fn, self.params, learning_rate=self.learning_rate)

    train_outputs = [self.loss_fn, self.eval_fn, self.predicted_probs]
    self.train_fn = theano.function(
        self.inps, train_outputs, updates=updates, on_unused_input='warn')

    # Validation additionally returns every attention output.
    validate_outputs = [loss_fn_val, eval_fn_val, predicted_probs_val] + attentions
    self.validate_fn = theano.function(
        self.inps, validate_outputs, on_unused_input='warn')
def __init__(self, K, vocab_size, W_init, regularizer, rlambda, nhidden, embed_dim,
             dropout, train_emb, subsample):
    """Store the hyper-parameters and compile the train/validate functions.

    The loss is the mean categorical cross-entropy plus
    ``rlambda * norm(W_emb - W_init)``, where ``norm`` is the L2 penalty when
    ``regularizer == 'l2'`` and the L1 penalty otherwise. When ``rlambda`` is
    positive the network is built from a perturbed copy of ``W_init``
    (Glorot-normal noise added), while the penalty still references the
    unperturbed ``W_init``.
    """
    self.nhidden = nhidden
    self.embed_dim = embed_dim
    self.dropout = dropout
    self.train_emb = train_emb
    self.subsample = subsample

    if regularizer == 'l2':
        norm = lasagne.regularization.l2
    else:
        norm = lasagne.regularization.l1

    if W_init is None:
        W_init = lasagne.init.GlorotNormal().sample((vocab_size, self.embed_dim))

    # Symbolic inputs.
    doc_var = T.itensor3('doc')
    query_var = T.itensor3('quer')
    cand_var = T.wtensor3('cand')
    docmask_var = T.bmatrix('doc_mask')
    qmask_var = T.bmatrix('q_mask')
    candmask_var = T.bmatrix('c_mask')
    target_var = T.ivector('ans')

    # Only perturb the starting embeddings when the penalty is active.
    W_pert = W_init
    if rlambda > 0.:
        W_pert = W_init + lasagne.init.GlorotNormal().sample(W_init.shape)

    built = self.build_network(
        K, vocab_size, doc_var, query_var, cand_var,
        docmask_var, qmask_var, candmask_var, W_pert)
    predicted_probs, predicted_probs_val, self.doc_net, self.q_net, W_emb = built

    cross_entropy = T.nnet.categorical_crossentropy
    accuracy = lasagne.objectives.categorical_accuracy

    loss_fn = cross_entropy(predicted_probs, target_var).mean() + \
        rlambda * norm(W_emb - W_init)
    eval_fn = accuracy(predicted_probs, target_var).mean()
    loss_fn_val = cross_entropy(predicted_probs_val, target_var).mean() + \
        rlambda * norm(W_emb - W_init)
    eval_fn_val = accuracy(predicted_probs_val, target_var).mean()

    params = L.get_all_params(self.doc_net, trainable=True)
    params = params + L.get_all_params(self.q_net, trainable=True)
    updates = lasagne.updates.adam(loss_fn, params, learning_rate=LEARNING_RATE)

    fn_inputs = [doc_var, query_var, cand_var, target_var,
                 docmask_var, qmask_var, candmask_var]
    self.train_fn = theano.function(
        fn_inputs, [loss_fn, eval_fn, predicted_probs], updates=updates)
    self.validate_fn = theano.function(
        fn_inputs, [loss_fn_val, eval_fn_val, predicted_probs_val])
def __init__(self, model):
    """Initialize a graph model with a fixed connection probability.

    Every entry of the N x N probability matrix ``rho`` is set to
    ``prms['rho']``; if ``prms['rho_refractory']`` is present it overrides
    the diagonal. The log likelihood of the symbolic adjacency matrix ``A``
    is the sum of independent Bernoulli log probabilities, multiplied by the
    shared scalar ``lkhd_scale``.

    :param model: nested dict providing ``model['N']`` and the graph
        parameters under ``model['network']['graph']``.
    """
    self.model = model
    self.prms = model['network']['graph']
    N = model['N']

    self.rho = self.prms['rho'] * np.ones((N, N))
    if 'rho_refractory' in self.prms:
        self.rho[np.diag_indices(N)] = self.prms['rho_refractory']

    self.pA = theano.shared(value=self.rho, name='pA')

    # Define complete adjacency matrix
    self.A = T.bmatrix('A')

    # Allow for scaling the log likelihood of the graph so that we can do
    # annealed importance sampling
    self.lkhd_scale = theano.shared(value=1.0, name='lkhd_scale')

    # Clamp rho on BOTH sides before taking the log of the "edge present"
    # term. Previously only the upper bound was applied, so rho == 0 (e.g.
    # rho_refractory = 0) produced log(0) = -inf and 0 * -inf = NaN for
    # every absent edge, poisoning the whole sum.
    eps = 1e-8
    log_rho = np.log(np.clip(self.rho, eps, 1.0 - eps))
    log_one_minus_rho = np.log(np.maximum(eps, 1.0 - self.rho))

    # Define log probability
    self.lkhd = T.sum(self.A * log_rho + (1 - self.A) * log_one_minus_rho)
    self.log_p = self.lkhd_scale * self.lkhd
def test_local_gpu_elemwise_0():
    """
    Test local_gpu_elemwise_0 when there is a dtype upcastable to float32.

    The same check runs twice: once on a plain ``a + b + c`` expression and
    once on a scalar Composite that is built before compilation.
    """
    a = tensor.bmatrix()
    b = tensor.fmatrix()
    c = tensor.fmatrix()

    a_v = (numpy.random.rand(4, 5) * 10).astype("int8")
    b_v = (numpy.random.rand(4, 5) * 10).astype("float32")
    c_v = (numpy.random.rand(4, 5) * 10).astype("float32")

    def count_ops(topo, op_class):
        # Number of nodes in the compiled graph whose op is an op_class.
        return sum(isinstance(node.op, op_class) for node in topo)

    def compile_and_check(expression):
        # Compile in GPU mode, expect exactly one GpuElemwise and one
        # Elemwise node, then execute once on concrete data.
        f = theano.function([a, b, c], [expression], mode=mode_with_gpu)
        topo = f.maker.fgraph.toposort()
        assert count_ops(topo, cuda.GpuElemwise) == 1
        assert count_ops(topo, tensor.Elemwise) == 1
        f(a_v, b_v, c_v)

    # Due to optimization order, this composite is created when all
    # the ops are on the gpu.
    compile_and_check(a + b + c)

    # Now test with the composite already on the cpu before we move it
    # to the gpu.
    a_s = theano.scalar.int8()
    b_s = theano.scalar.float32()
    c_s = theano.scalar.float32()
    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s + c_s])
    out_op = tensor.Elemwise(out_s)
    compile_and_check(out_op(a, b, c))
def BuildModel(modelSpecs, forTrain=True):
    """Create the symbolic inputs and the property-prediction network.

    Returns ``(predictor, x, xmask)`` when no label variables are created
    (``forTrain`` is false or ``modelSpecs['responses']`` is empty), and
    ``(predictor, x, xmask, labelList, weightList)`` otherwise.
    """
    rng = np.random.RandomState()

    ## x is for sequential features, xmask is its mask
    x = T.tensor3('x')
    xmask = T.bmatrix('xmask')

    propertyPredictor = ResNet4Properties(
        rng, seqInput=x, mask_seq=xmask, modelSpecs=modelSpecs)

    if not forTrain:
        return propertyPredictor, x, xmask

    ## one label variable per response; discrete responses get an int32
    ## tensor, all others a floatX tensor
    labelList = []
    for res in modelSpecs['responses']:
        if Response2LabelType(res).startswith('Discrete'):
            label = T.itensor3('label4' + res)
        else:
            label = T.tensor3('label4' + res)
        labelList.append(label)

    if not labelList:
        return propertyPredictor, x, xmask

    ## one weight tensor per response
    ## we always use weight to deal with residues without 3D coordinates
    weightList = [T.tensor3('weight4' + res) for res in modelSpecs['responses']]

    return propertyPredictor, x, xmask, labelList, weightList
def __init__(self, nodes_per_layer, act_funcs, err_func, backprop_func, backprop_params,
             l_rate=.001, batch_size=100):
    """
    nodes_per_layer - number of nodes per layer, including input and output layers
    act_funcs - list of activation functions, one per pair of adjacent layers
    err_func - cost/error function, called as err_func(output, labels)
    backprop_func - called as backprop_func(cost, weights, l_rate, **backprop_params);
                    its result is used as the training updates
    backprop_params - extra keyword arguments for backprop_func
    l_rate - learning rate
    batch_size - forwarded to the base-class constructor
    """
    assert len(nodes_per_layer)-1 == len(act_funcs), \
        ("Invalid number of activation functions compared to the number of hidden layers",
         len(nodes_per_layer), len(act_funcs))

    super(FFNet, self).__init__('FFNet', l_rate, batch_size)
    logging.info('\tConstructing FFNet with nodes per layer: %s, learning rate: %s ',
                 nodes_per_layer, l_rate)

    input_data = T.fmatrix('X')
    input_labels = T.bmatrix('Y')

    # One randomly initialised weight matrix between each pair of adjacent layers.
    weights = []
    for idx, (n_in, n_out) in enumerate(zip(nodes_per_layer, nodes_per_layer[1:])):
        weight = init_rand_weights((n_in, n_out))
        weight.name = 'w' + str(idx)
        weights.append(weight)

    # Chain the layers: each one is built from the previous layer's output.
    layers = [input_data]
    for idx, weight in enumerate(weights):
        layers.append(self.model(layers[idx], weight, act_funcs[idx]))
    for idx, layer in enumerate(layers):
        if idx >= 1:
            layer.name = 'l' + str(idx)

    output_layer = layers[-1]
    cost = err_func(output_layer, input_labels)
    updates = backprop_func(cost, weights, self.l_rate, **backprop_params)

    prediction = T.argmax(output_layer, axis=1)
    prediction_value = T.max(output_layer, axis=1)

    self.trainer = theano.function(
        inputs=[input_data, input_labels],
        outputs=cost,
        updates=updates,
        name='Trainer',
        # Allows float64 to be cast to float32, which is necessary in order to use GPU
        allow_input_downcast=True
    )

    predictor_outputs = {
        'char_as_int': prediction,
        'char_probability': prediction_value,
        'output_layer': output_layer,
    }
    self.predictor = theano.function(
        inputs=[input_data],
        outputs=predictor_outputs,
        name='Predictor',
        allow_input_downcast=True
    )
def __init__(self, model, latent):
    """
    Initialize the stochastic block model for the adjacency matrix.

    The number of types ``R`` and the per-node type vector ``Y`` are taken
    from ``latent[prms['types']]``. The log probability is a prior term on
    the block matrix ``B`` (using hyperparameters ``b0`` and ``b1``) plus
    the Bernoulli log likelihood of ``A`` under ``pA``.
    """
    self.model = model
    self.latent = latent
    self.prms = model['network']['graph']
    self.N = model['N']

    # Latent types: the number of types (R) and the type of each node (Y)
    self.type_name = self.prms['types']
    type_model = self.latent[self.type_name]
    self.R = type_model.R
    self.Y = type_model.Y

    # An RxR matrix of connection probabilities per pair of clusters
    self.B = T.dmatrix('B')

    # Y as a column vector, then tiled to NxN so that B can be indexed by
    # the (type of row node, type of column node) pair
    self.Yv = T.reshape(self.Y, [self.N, 1])
    self.Ym = T.tile(self.Yv, [1, self.N])
    row_types = self.Ym
    col_types = T.transpose(self.Ym)
    self.pA = self.B[row_types, col_types]

    # Hyperparameters governing B
    self.b0 = self.prms['b0']
    self.b1 = self.prms['b1']

    # Define complete adjacency matrix
    self.A = T.bmatrix('A')

    # Define log probability
    log_p_B = T.sum((self.b0 - 1) * T.log(self.B) +
                    (self.b1 - 1) * T.log(1 - self.B))
    log_p_A = T.sum(self.A * T.log(self.pA) +
                    (1 - self.A) * T.log(1 - self.pA))
    self.log_p = log_p_B + log_p_A
def __init__(self, model):
    """
    Initialize the stochastic block model for the adjacency matrix.

    Declares symbolic variables for the block matrix ``B``, the per-node
    cluster assignment ``Y`` and the cluster probabilities ``alpha``. The
    log probability sums a prior term on ``B`` (hyperparameters ``b0``,
    ``b1``), a prior term on ``alpha`` (hyperparameter ``alpha0``) and the
    Bernoulli log likelihood of ``A`` under ``pA``.
    """
    self.model = model
    self.prms = model['network']['graph']
    self.N = model['N']

    # SBM has R latent clusters
    self.R = self.prms['R']

    # An RxR matrix of connection probabilities per pair of clusters
    self.B = T.dmatrix('B')

    # SBM has a latent block or cluster assignment for each node
    self.Y = T.lvector('Y')

    # Y as a column vector, then tiled to NxN so that B can be indexed by
    # the (cluster of row node, cluster of column node) pair
    self.Yv = T.reshape(self.Y, [self.N, 1])
    self.Ym = T.tile(self.Yv, [1, self.N])
    row_clusters = self.Ym
    col_clusters = T.transpose(self.Ym)
    self.pA = self.B[row_clusters, col_clusters]

    # A probability of each cluster
    self.alpha = T.dvector('alpha')

    # Hyperparameters governing B and alpha
    self.b0 = self.prms['b0']
    self.b1 = self.prms['b1']
    self.alpha0 = self.prms['alpha0']

    # Define complete adjacency matrix
    self.A = T.bmatrix('A')

    # Define log probability
    log_p_B = T.sum((self.b0 - 1) * T.log(self.B) +
                    (self.b1 - 1) * T.log(1 - self.B))
    log_p_alpha = T.sum((self.alpha0 - 1) * T.log(self.alpha))
    log_p_A = T.sum(self.A * T.log(self.pA) +
                    (1 - self.A) * T.log(1 - self.pA))
    self.log_p = log_p_B + log_p_alpha + log_p_A
def test_illegal_things(self):
    """A matrix used as a slice stop, step or start must raise TypeError."""
    i0 = TT.iscalar()
    i1 = TT.lvector()
    i2 = TT.bmatrix()

    # assertRaises replaces failUnlessRaises, a deprecated unittest alias
    # that was removed in Python 3.12; assertRaises works on every version.
    self.assertRaises(TypeError, FAS, [i1, slice(None, i2, -1), i0])
    self.assertRaises(TypeError, FAS, [i1, slice(None, None, i2), i0])
    self.assertRaises(TypeError, FAS, [i1, slice(i2, None, -1), i0])
def __init__(self, model):
    """Initialize a graph model with a fixed connection probability.

    Every entry of the N x N probability matrix ``rho`` is set to
    ``prms['rho']``; if ``prms['rho_refractory']`` is present it overrides
    the diagonal. The log likelihood of the symbolic adjacency matrix ``A``
    is the sum of independent Bernoulli log probabilities, multiplied by the
    shared scalar ``lkhd_scale``.

    :param model: nested dict providing ``model['N']`` and the graph
        parameters under ``model['network']['graph']``.
    """
    self.model = model
    self.prms = model['network']['graph']
    N = model['N']

    self.rho = self.prms['rho'] * np.ones((N, N))
    if 'rho_refractory' in self.prms:
        self.rho[np.diag_indices(N)] = self.prms['rho_refractory']

    self.pA = theano.shared(value=self.rho, name='pA')

    # Define complete adjacency matrix
    self.A = T.bmatrix('A')

    # Allow for scaling the log likelihood of the graph so that we can do
    # annealed importance sampling
    self.lkhd_scale = theano.shared(value=1.0, name='lkhd_scale')

    # Clamp rho on BOTH sides before taking the log of the "edge present"
    # term. Previously only the upper bound was applied, so rho == 0 (e.g.
    # rho_refractory = 0) produced log(0) = -inf and 0 * -inf = NaN for
    # every absent edge, poisoning the whole sum.
    eps = 1e-8
    log_rho = np.log(np.clip(self.rho, eps, 1.0 - eps))
    log_one_minus_rho = np.log(np.maximum(eps, 1.0 - self.rho))

    # Define log probability
    self.lkhd = T.sum(self.A * log_rho + (1 - self.A) * log_one_minus_rho)
    self.log_p = self.lkhd_scale * self.lkhd
def ndim_btensor(ndim, name=None):
    """Return a symbolic int8 tensor variable for the requested rank.

    :param ndim: 2, 3 or 4 select ``bmatrix``, ``btensor3`` or ``btensor4``.
        Any other value falls back to an int8 matrix.
    :param name: optional name given to the variable.
    """
    if ndim == 3:
        return T.btensor3(name)
    if ndim == 4:
        return T.btensor4(name)
    # ndim == 2 and the fallback share the same constructor. The fallback
    # used to return T.imatrix (int32), which contradicted the int8 dtype of
    # every explicit branch of this factory.
    # NOTE(review): confirm no caller relied on the int32 fallback.
    return T.bmatrix(name)
def __init__(self, model, latent):
    """
    Initialize the latent distance model for the adjacency matrix.

    Node locations are read from ``latent[prms['locations']].Lm``. The edge
    probability decays with the squared Euclidean distance ``D`` between
    two nodes' locations: ``pA = exp(-0.5 * D / delta**2)``. If
    ``prms['rho_refractory']`` is present it replaces the diagonal of
    ``pA``. ``log_p`` is the Bernoulli log likelihood of ``A`` under ``pA``
    multiplied by the shared scalar ``lkhd_scale``.
    """
    self.model = model
    self.prms = model['network']['graph']
    self.N = model['N']
    self.N_dims = self.prms['N_dims']

    # Get the latent location
    self.location = latent[self.prms['locations']]
    self.Lm = self.location.Lm

    # Compute the distance between each pair of locations by inserting a
    # broadcastable axis into two copies of Lm and subtracting them.
    # dimshuffle's 'x' axes are already broadcastable, so no addbroadcast
    # call is needed (the previous calls discarded their return value and
    # therefore had no effect).
    L1 = self.Lm.dimshuffle(0, 'x', 1)  # Nx1xD
    L2 = self.Lm.dimshuffle('x', 0, 1)  # 1xNxD

    # Squared Euclidean distance. The square root is deliberately not
    # taken: the original author reported NaN gradients from Theano when
    # using the L2 norm itself.
    self.D = T.pow(L1 - L2, 2).sum(axis=2)

    # There is a distance scale, \delta
    self.delta = T.dscalar(name='delta')

    # Define complete adjacency matrix
    self.A = T.bmatrix('A')

    # The probability of an edge decreases with squared distance, on the
    # scale set by delta
    self.pA = T.exp(-0.5 * self.D / self.delta**2)

    if 'rho_refractory' in self.prms:
        # Overwrite the diagonal: pA + I * (rho_refractory - pA)
        self.pA += T.eye(self.N) * (self.prms['rho_refractory'] - self.pA)

    # Allow for scaling the log likelihood of the graph so that we can do
    # annealed importance sampling
    self.lkhd_scale = theano.shared(value=1.0, name='lkhd_scale')

    # Define log probability
    self.lkhd = T.sum(self.A * T.log(self.pA) +
                      (1 - self.A) * T.log(1 - self.pA))
    self.log_p = self.lkhd_scale * self.lkhd
def __init__(self, model, latent):
    """
    Initialize the latent distance model for the adjacency matrix.

    Node locations are read from ``latent[prms['locations']].Lm``. The edge
    probability decays with the squared Euclidean distance ``D`` between
    two nodes' locations: ``pA = exp(-0.5 * D / delta**2)``. If
    ``prms['rho_refractory']`` is present it replaces the diagonal of
    ``pA``. ``log_p`` is the Bernoulli log likelihood of ``A`` under ``pA``
    multiplied by the shared scalar ``lkhd_scale``.
    """
    self.model = model
    self.prms = model['network']['graph']
    self.N = model['N']
    self.N_dims = self.prms['N_dims']

    # Get the latent location
    self.location = latent[self.prms['locations']]
    self.Lm = self.location.Lm

    # Compute the distance between each pair of locations by inserting a
    # broadcastable axis into two copies of Lm and subtracting them.
    # dimshuffle's 'x' axes are already broadcastable, so no addbroadcast
    # call is needed (the previous calls discarded their return value and
    # therefore had no effect).
    L1 = self.Lm.dimshuffle(0, 'x', 1)  # Nx1xD
    L2 = self.Lm.dimshuffle('x', 0, 1)  # 1xNxD

    # Squared Euclidean distance. The square root is deliberately not
    # taken: the original author reported NaN gradients from Theano when
    # using the L2 norm itself.
    self.D = T.pow(L1 - L2, 2).sum(axis=2)

    # There is a distance scale, \delta
    self.delta = T.dscalar(name='delta')

    # Define complete adjacency matrix
    self.A = T.bmatrix('A')

    # The probability of an edge decreases with squared distance, on the
    # scale set by delta
    self.pA = T.exp(-0.5 * self.D / self.delta**2)

    if 'rho_refractory' in self.prms:
        # Overwrite the diagonal: pA + I * (rho_refractory - pA)
        self.pA += T.eye(self.N) * (self.prms['rho_refractory'] - self.pA)

    # Allow for scaling the log likelihood of the graph so that we can do
    # annealed importance sampling
    self.lkhd_scale = theano.shared(value=1.0, name='lkhd_scale')

    # Define log probability
    self.lkhd = T.sum(self.A * T.log(self.pA) +
                      (1 - self.A) * T.log(1 - self.pA))
    self.log_p = self.lkhd_scale * self.lkhd
def make_node(self, state, time):
    """
    Build the Apply node that connects this op to its inputs.

    :param state: first input; converted with ``T.as_tensor_variable``
    :param time: second input; converted with ``T.as_tensor_variable``
    :return: a ``theano.Apply`` with inputs ``[state, time]`` and a single
        int8 matrix output
    """
    inputs = [T.as_tensor_variable(state), T.as_tensor_variable(time)]
    outputs = [T.bmatrix()]
    return theano.Apply(self, inputs, outputs)
def test3_ndarray(self):
    """Index with a vector, a slice with a symbolic stop, and a matrix."""
    i0 = TT.iscalar()
    i1 = TT.lvector()
    i2 = TT.bmatrix()

    index_list = [i1, slice(None, i0, -1), i2]
    f = FAS(index_list)

    # Four inputs are expected for these three index entries.
    assert f.n_in == 4

    # Variables are stored as their types; the open slice start is stored as 0.
    expected_idx = (i1.type, slice(0, i0.type, -1), i2.type,)
    assert f.idx_tuple == expected_idx

    assert f.view_map == {}
def test_any_grad(self):
    """The gradient of ``x.any()`` w.r.t. an int8 matrix is all zeros."""
    x = tensor.bmatrix("x")
    grad_fn = theano.function([x], theano.grad(x.any(), x))

    # Check a random 0/1 matrix plus the all-zeros and all-ones extremes.
    sample = self.rng.binomial(n=1, p=0.5, size=(5, 7)).astype("int8")
    cases = (sample, numpy.zeros_like(sample), numpy.ones_like(sample))

    for value in cases:
        grad_value = grad_fn(value)
        assert grad_value.shape == value.shape
        assert numpy.all(grad_value == 0)
def use_target(self, target, dtype):
    """Register the symbolic target ``self.y[target]`` and index ``self.j[target]``.

    Does nothing if the target is already registered or is ``"null"``.
    With a base network, registration is delegated to it and its entries
    are copied over. Targets ending in ``"[sparse:coo]"`` get one int32
    matrix per coordinate axis plus one ``dtype`` matrix for the data.
    All other targets get a single ``dtype`` tensor whose rank is
    ``self.n_out[target][1] + 1`` (one more because of the batch dim).

    :param target: key of the target
    :param dtype: dtype of the target data variable
    """
    if target in self.y or target == "null":
        return

    if target == 'sizes' and 'sizes' not in self.n_out:
        # TODO(voigtlaender): fix data please
        self.n_out['sizes'] = [2, 1]

    base = self.base_network
    if base:
        base.use_target(target=target, dtype=dtype)
        # Copy entries over only when the dicts are not shared with the base.
        if self.y is not base.y:
            self.y[target] = base.y[target]
        if self.j is not base.j:
            self.j[target] = base.j[target]
        if target not in self.n_out:
            self.n_out[target] = base.n_out[target]
        return

    if target.endswith("[sparse:coo]"):
        tprefix = target[:target.index("[")]
        # expected (without batch), e.g. 2 if like (time,feature)
        ndim = self.n_out[target][1]

        def coo_key(axis):
            return "%s[sparse:coo:%i:%i]" % (tprefix, ndim, axis)

        # For each coordinate axis. Also with batch-dim.
        for axis in range(ndim):
            index_type = T.TensorType("int32", (False,) * 2)
            self.y[coo_key(axis)] = index_type('y_' + coo_key(axis))

        # And the data itself. Also with batch-dim.
        data_type = T.TensorType(dtype, (False,) * 2)
        self.y[coo_key(ndim)] = data_type("y_%s[%i]" % (tprefix, ndim))

        # self.j will be used to get the list of keys we need to get from the dataset.
        for axis in range(ndim + 1):
            self.j.setdefault(coo_key(axis), T.bmatrix('j_' + coo_key(axis)))

        # self.y[target] will be given to the OutputLayer.
        self.y[target] = tuple(self.y[coo_key(axis)] for axis in range(ndim + 1))
        self.j[target] = self.j["data"]  # Not sure if this is the best we can do...
        return

    assert target in self.n_out
    ndim = self.n_out[target][1] + 1  # one more because of batch-dim
    y_var = T.TensorType(dtype, (False,) * ndim)('y_%s' % target)
    self.y[target] = y_var
    y_var.n_out = self.n_out[target][0]
    self.j.setdefault(target, T.bmatrix('j_%s' % target))

    # Provide default test values where none have been set yet.
    if getattr(y_var.tag, "test_value", None) is None:
        if ndim == 2:
            y_var.tag.test_value = numpy.zeros((3, 2), dtype='int32')
        elif ndim == 3:
            y_var.tag.test_value = numpy.random.rand(
                3, 2, self.n_out[target][0]).astype('float32')

    j_var = self.j[target]
    if getattr(j_var.tag, "test_value", None) is None:
        j_var.tag.test_value = numpy.ones((3, 2), dtype="int8")
def __init__(self, layers, err_func, backprop_func, backprop_params, l_rate, batch_size=10):
    """
    Wire the given layers together and compile the trainer and predictor.

    :param layers: ordered layers; each exposes ``params``, ``activate(input, batch_size)``
        and ``output()``. The last one also exposes ``output_values``.
    :param err_func: cost/error function, called as err_func(output, labels)
    :param backprop_func: called as backprop_func(cost, params, l_rate, **backprop_params);
        its result is used as the training updates
    :param backprop_params: parameters to pass to backprop function
    :param l_rate: learning rate
    :param batch_size: (mini-) batch size
    :return:
    """
    super(ConvNet, self).__init__("ConvNet", l_rate, batch_size)
    logging.info('\tConstructing ConvNet with %s layers. Learning rate: %s. Batch size: %s ',
                 len(layers), l_rate, batch_size)

    input_data = T.fmatrix('X')
    input_labels = T.bmatrix('Y')

    # Regular weights and bias weights; e.g. everything to be adjusted during training
    params = [param for layer in layers for param in layer.params]
    n_trainable = sum(
        param.get_value(borrow=True, return_internal_type=True).size for param in params)
    logging.info('\tNumber of parameters to train: %s', n_trainable)

    # Feed the input into the first layer, then chain every following layer
    # onto the output of its predecessor.
    layers[0].activate(input_data, self.batch_size)
    for prev_layer, current_layer in zip(layers, layers[1:]):
        current_layer.activate(prev_layer.output(), self.batch_size)

    output_layer = layers[-1].output_values
    cost = err_func(output_layer, input_labels)
    updates = backprop_func(cost, params, l_rate, **backprop_params)

    prediction = T.argmax(output_layer, axis=1)
    prediction_value = T.max(output_layer, axis=1)

    logging.debug('\tConstructing functions ...')
    self.trainer = theano.function(
        inputs=[input_data, input_labels],
        outputs=cost,
        updates=updates,
        name='Trainer',
        # Allows float64 to be cast to float32, which is necessary in order to use GPU
        allow_input_downcast=True
    )

    predictor_outputs = {
        'char_as_int': prediction,
        'char_probability': prediction_value,
        'output_layer': output_layer,
    }
    self.predictor = theano.function(
        inputs=[input_data],
        outputs=predictor_outputs,
        name='Predictor',
        allow_input_downcast=True
    )
def __init__(self, nin, nout, nhid, numpy_rng, scale=1.0):
    """Build the symbolic graph of a recurrent net with a squared-error cost.

    ``inputs`` and ``targets`` are float32 matrices holding all frames of a
    sequence side by side in each row. They are reshaped to
    ``(batch, frames, nin)`` / ``(batch, frames, nout)`` and transposed to
    frame-major order for ``theano.scan``. ``masks`` is an int8 matrix with
    one entry per frame; frames whose mask is not positive contribute zero
    cost.

    :param nin: size of one input frame
    :param nout: size of one output frame
    :param nhid: number of hidden units
    :param numpy_rng: numpy random generator used for test values and
        weight initialisation
    :param scale: factor applied to the identity initialisation of ``wrnn``
    """
    self.nin = nin
    self.nout = nout
    self.nhid = nhid
    self.numpy_rng = numpy_rng
    self.scale = np.float32(scale)

    # Symbolic inputs, each with a small test value (16 rows, 5 frames).
    self.inputs = T.fmatrix('inputs')
    self.inputs.tag.test_value = numpy_rng.uniform(
        low=-1., high=1., size=(16, 5 * self.nin)
    ).astype(np.float32)
    self.targets = T.fmatrix('targets')
    self.targets.tag.test_value = np.ones(
        (16, 5 * nout), dtype=np.float32)
    self.masks = T.bmatrix('masks')
    self.masks.tag.test_value = np.ones(
        (16, 5), dtype=np.int8)

    self.batchsize = self.inputs.shape[0]

    # Floor division keeps the frame count an integer. With "/" the shape
    # entry becomes true division under Python 3, which is not a valid
    # integer reshape dimension. On Python 2 "//" matches the old result.
    self.inputs_frames = self.inputs.reshape((
        self.batchsize, self.inputs.shape[1] // nin,
        nin)).dimshuffle(1, 0, 2)
    self.targets_frames = self.targets.reshape((
        self.batchsize, self.targets.shape[1] // nout,
        nout)).dimshuffle(1, 0, 2)
    self.masks_frames = self.masks.T

    # Parameters and the initial hidden state.
    self.h0 = theano.shared(value=np.ones(
        nhid, dtype=theano.config.floatX) * np.float32(.5), name='h0')
    self.win = theano.shared(value=self.numpy_rng.normal(
        loc=0, scale=0.001, size=(nin, nhid)
    ).astype(theano.config.floatX), name='win')
    self.wrnn = theano.shared(value=self.scale * np.eye(
        nhid, dtype=theano.config.floatX), name='wrnn')
    self.wout = theano.shared(value=self.numpy_rng.uniform(
        low=-0.01, high=0.01, size=(nhid, nout)
    ).astype(theano.config.floatX), name='wout')
    self.bout = theano.shared(value=np.zeros(
        nout, dtype=theano.config.floatX), name='bout')
    # h0 is not part of the trained parameters.
    self.params = [self.win, self.wrnn, self.wout, self.bout]

    # Run self.step over the frames, starting from h0 repeated per row.
    (self.hiddens, self.outputs), self.updates = theano.scan(
        fn=self.step, sequences=self.inputs_frames,
        outputs_info=[T.alloc(
            self.h0, self.batchsize, self.nhid), None])

    # Squared error per frame; masked-out frames count as zero in the mean.
    self._stepcosts = T.sum((self.targets_frames - self.outputs)**2, axis=2)
    self._cost = T.switch(self.masks_frames > 0, self._stepcosts, 0).mean()
    self._grads = T.grad(self._cost, self.params)

    self.getoutputs = theano.function(
        [self.inputs], self.outputs)
def test_any_grad(self):
    """The gradient of ``x.any()`` w.r.t. an int8 matrix is all zeros."""
    x = tensor.bmatrix('x')
    grad_fn = theano.function([x], theano.grad(x.any(), x))

    # Check a random 0/1 matrix plus the all-zeros and all-ones extremes.
    sample = self.rng.binomial(n=1, p=0.5, size=(5, 7)).astype('int8')
    cases = (sample, numpy.zeros_like(sample), numpy.ones_like(sample))

    for value in cases:
        grad_value = grad_fn(value)
        assert grad_value.shape == value.shape
        assert numpy.all(grad_value == 0)
def run(gates, num_registers, max_int, num_timesteps, num_layers, reg_lambda, params,
        clip_gradients=None):
    """Unroll the machine for ``num_timesteps`` steps and build its cost.

    Returns ``(debug, initial_mem, desired_mem, cost_mask, mem, final_cost,
    entropy_weight)``: the dict of named intermediate values, the symbolic
    inputs, the final memory, and the total cost (regularization plus the
    summed step cost).
    """
    params = make_broadcastable(params, clip_gradients=clip_gradients)

    # Symbolic variables for the input to the machine and for the desired
    # output of the machine.
    initial_mem = dtensor3("InMem")
    desired_mem = imatrix("OutMem")
    cost_mask = bmatrix("CostMask")
    entropy_weight = dscalar("EntropyWeight")

    # Every register starts with all of its mass on index 0. Instead of
    # using to_one_hot, the shape is created directly.
    register_shape = (initial_mem.shape[0], num_registers, max_int)
    initial_registers = zeros(register_shape, dtype='float64')
    initial_registers = set_subtensor(initial_registers[:, :, 0], 1.0)

    # State threaded through the steps: registers, memory, cost, cumulative
    # probability complete, and probability incomplete. The last three
    # start at zero, zero and one respectively.
    v0 = as_tensor(0)
    v1 = as_tensor(1)
    output = (initial_registers, initial_mem, v0, v0, v1)

    debug = {}
    for timestep in range(num_timesteps):
        debug_local, output = step_cost(
            gates, max_int, desired_mem, cost_mask, num_timesteps,
            num_registers, num_layers, entropy_weight, timestep + 1,
            *output, params)
        # Prefix each per-step debug key with its timestep.
        for key, value in debug_local.items():
            debug["%d:%s" % (timestep, key)] = value

    # Add in regularization, to avoid overfitting simple examples.
    reg_cost = reg_lambda * sum((p * p).sum() for p in params)
    debug['cost-regularization'] = reg_cost

    # Final cost: regularization plus loss.
    final_cost = reg_cost + output[2].sum()
    debug['cost-final'] = final_cost

    mem = output[1]
    return debug, initial_mem, desired_mem, cost_mask, mem, final_cost, entropy_weight
def BuildModel(modelSpecs, forTrain=True):
    """Create the symbolic inputs and the distance-matrix prediction network.

    Returns ``(predictor, x, y, xmask, ymask, xem, labelList, weightList)``.
    ``xem`` is None unless ``config.EmbeddingUsed(modelSpecs)`` is true.
    ``labelList`` is empty unless ``forTrain`` is true; ``weightList`` is
    empty unless labels exist and ``modelSpecs['UseSampleWeight']`` is set.
    """
    rng = np.random.RandomState()

    ## x is for sequential features and y for matrix (or pairwise) features
    x = T.tensor3('x')
    y = T.tensor4('y')

    ## mask for x and y, respectively
    xmask = T.bmatrix('xmask')
    ymask = T.btensor3('ymask')

    ## the embedding input is only created, and only passed on, when used
    xem = None
    predictor_kwargs = dict(seqInput=x, matrixInput=y, mask_seq=xmask,
                            mask_matrix=ymask, modelSpecs=modelSpecs)
    if config.EmbeddingUsed(modelSpecs):
        xem = T.tensor3('xem')
        predictor_kwargs['embedInput'] = xem
    distancePredictor = ResNet4DistMatrix(rng, **predictor_kwargs)

    ## labelList holds one label tensor per response (training only)
    labelList = []
    if forTrain:
        for response in modelSpecs['responses']:
            labelType = Response2LabelType(response)
            rValDims = config.responseValueDims[labelType]
            ## if one response is a vector, then we use a 4-d tensor
            is_vector = rValDims > 1
            if labelType.startswith('Discrete'):
                ## wtensor is for 16bit integer
                make_label = T.wtensor4 if is_vector else T.wtensor3
            else:
                make_label = T.tensor4 if is_vector else T.tensor3
            labelList.append(make_label('Tlabel4' + response))

    ## weightList holds one label weight tensor per response
    weightList = []
    if labelList and modelSpecs['UseSampleWeight']:
        weightList = [T.tensor3('Tweight4' + response)
                      for response in modelSpecs['responses']]

    ## for prediction, both labelList and weightList are empty
    return distancePredictor, x, y, xmask, ymask, xem, labelList, weightList
def __init__(self, nin, nout, nhid, numpy_rng, scale=1.0):
    """Build the symbolic graph of a recurrent classifier.

    ``inputs`` is a float32 matrix holding all frames of a sequence side by
    side in each row; it is reshaped to ``(batch, frames, nin)`` and
    transposed to frame-major order for ``theano.scan``. ``targets`` is an
    int32 matrix and ``masks`` an int8 matrix; both are transposed to
    frame-major order. Frames whose mask is not positive contribute zero
    cost.

    :param nin: size of one input frame
    :param nout: number of classes
    :param nhid: number of hidden units
    :param numpy_rng: numpy random generator used for weight initialisation
    :param scale: factor applied to the identity initialisation of ``wrnn``
    """
    self.nin = nin
    self.nout = nout
    self.nhid = nhid
    self.numpy_rng = numpy_rng
    self.theano_rng = RandomStreams(1)
    self.scale = np.float32(scale)

    self.inputs = T.fmatrix('inputs')
    self.targets = T.imatrix('targets')
    self.masks = T.bmatrix('masks')

    self.batchsize = self.inputs.shape[0]

    # Floor division keeps the frame count an integer. With "/" the shape
    # entry becomes true division under Python 3, which is not a valid
    # integer reshape dimension. On Python 2 "//" matches the old result.
    self.inputs_frames = self.inputs.reshape((
        self.batchsize, self.inputs.shape[1] // nin,
        nin)).dimshuffle(1, 0, 2)
    self.targets_frames = self.targets.T
    self.masks_frames = self.masks.T

    # Parameters.
    self.win = theano.shared(value=self.numpy_rng.normal(
        loc=0, scale=0.001, size=(nin, nhid)
    ).astype(theano.config.floatX), name='win')
    self.wrnn = theano.shared(value=self.scale * np.eye(
        nhid, dtype=theano.config.floatX), name='wrnn')
    self.wout = theano.shared(value=self.numpy_rng.uniform(
        low=-0.01, high=0.01, size=(nhid, nout)
    ).astype(theano.config.floatX), name='wout')
    self.bout = theano.shared(value=np.zeros(
        nout, dtype=theano.config.floatX), name='bout')
    self.params = [self.win, self.wrnn, self.wout, self.bout]

    # Run self.step over the frames; the initial hidden state is drawn
    # uniformly from [0, 1).
    (self.hiddens, self.outputs), self.updates = theano.scan(
        fn=self.step, sequences=self.inputs_frames,
        outputs_info=[self.theano_rng.uniform(low=0, high=1, size=(
            self.batchsize, nhid), dtype=theano.config.floatX), None])

    # Softmax over classes with the frame and batch axes flattened together;
    # clipped away from 0 and 1 before the cross-entropy.
    self.probabilities = T.nnet.softmax(self.outputs.reshape((
        self.outputs.shape[0] * self.outputs.shape[1], self.nout)))
    self.probabilities = T.clip(self.probabilities, 1e-6, 1-1e-6)

    # Cross-entropy per frame; masked-out frames count as zero in the mean.
    self._stepcosts = T.nnet.categorical_crossentropy(
        self.probabilities, self.targets_frames.flatten()).reshape(
            self.targets_frames.shape)
    self._cost = T.switch(T.gt(self.masks_frames, 0), self._stepcosts, 0).mean()
    self._grads = T.grad(self._cost, self.params)

    # Most probable class per frame, transposed back so rows are sequences.
    self.get_classifications = theano.function(
        [self.inputs],
        T.argmax(self.probabilities.reshape(self.outputs.shape), axis=2).T)
def main(config, tr_stream):
    """Rebuild the NMT model, reload it, and run ``embedding`` on the source side.

    The encoder/decoder graph is constructed only so that the ``LoadNMT``
    extension can restore the saved model from ``config['saveto']``. No
    training algorithm or data stream is attached to the main loop.

    :param config: dict of model hyper-parameters and file paths
    :param tr_stream: training stream; only ``trg_bos`` and
        ``space_idx['target']`` are read from it here
    """
    # Create Theano variables
    logger.info('Creating theano variables')
    source_char_seq = tensor.lmatrix('source_char_seq')
    source_sample_matrix = tensor.btensor3('source_sample_matrix')
    source_char_aux = tensor.bmatrix('source_char_aux')
    source_word_mask = tensor.bmatrix('source_word_mask')
    target_char_seq = tensor.lmatrix('target_char_seq')
    target_char_aux = tensor.bmatrix('target_char_aux')
    target_char_mask = tensor.bmatrix('target_char_mask')
    target_sample_matrix = tensor.btensor3('target_sample_matrix')
    target_word_mask = tensor.bmatrix('target_word_mask')
    target_resample_matrix = tensor.btensor3('target_resample_matrix')
    target_prev_char_seq = tensor.lmatrix('target_prev_char_seq')
    target_prev_char_aux = tensor.bmatrix('target_prev_char_aux')
    target_bos_idx = tr_stream.trg_bos
    target_space_idx = tr_stream.space_idx['target']

    # Close the vocabulary file deterministically instead of leaking the
    # handle returned by a bare open().
    # NOTE(review): pickle executes arbitrary code on load; make sure
    # config['src_vocab'] always points at a trusted file.
    with open(config['src_vocab'], 'rb') as vocab_file:
        src_vocab = pickle.load(vocab_file)

    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'],
                                   config['src_dgru_nhids'], config['enc_nhids'],
                                   config['src_dgru_depth'],
                                   config['bidir_encoder_depth'])
    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['trg_dgru_nhids'], config['trg_igru_nhids'],
                      config['dec_nhids'], config['enc_nhids'] * 2,
                      config['transition_depth'], config['trg_igru_depth'],
                      config['trg_dgru_depth'], target_space_idx, target_bos_idx)

    representation = encoder.apply(source_char_seq, source_sample_matrix,
                                   source_char_aux, source_word_mask)
    cost = decoder.cost(representation, source_word_mask, target_char_seq,
                        target_sample_matrix, target_resample_matrix,
                        target_char_aux, target_char_mask, target_word_mask,
                        target_prev_char_seq, target_prev_char_aux)

    # Set up model
    logger.info("Building model")
    training_model = Model(cost)

    # Set extensions
    logger.info("Initializing extensions")
    # Reload model if necessary
    extensions = [LoadNMT(config['saveto'])]

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=training_model,
        algorithm=None,
        data_stream=None,
        extensions=extensions
    )

    # Trigger the 'before_training' callbacks by hand, because the loop
    # itself is never run.
    for extension in main_loop.extensions:
        extension.main_loop = main_loop
    main_loop._run_extensions('before_training')

    char_embedding = encoder.decimator.apply(source_char_seq.T,
                                             source_sample_matrix,
                                             source_char_aux.T)
    embedding(Model(char_embedding), src_vocab)
def __init__(self, K, vocab_size, W_init=lasagne.init.GlorotNormal()):
    """Create the symbolic inputs, build the network and compile it.

    ``train_fn`` and ``validate_fn`` take, in order: doc, query, target,
    doc mask, query mask, candidate mask and feature matrix. Each returns
    ``[loss, accuracy, predicted_probs]``; only ``train_fn`` applies the
    Adam updates.
    """
    # Symbolic inputs.
    doc_var = T.itensor3('doc')
    query_var = T.itensor3('quer')
    docmask_var = T.bmatrix('doc_mask')
    qmask_var = T.bmatrix('q_mask')
    candmask_var = T.bmatrix('c_mask')
    target_var = T.ivector('ans')
    feat_var = T.bmatrix('feat')

    built = self.build_network(
        K, vocab_size, doc_var, query_var, docmask_var, qmask_var,
        candmask_var, feat_var, W_init)
    predicted_probs, predicted_probs_val, self.doc_net, self.q_net = built

    cross_entropy = T.nnet.categorical_crossentropy
    accuracy = lasagne.objectives.categorical_accuracy

    # Training-time and validation-time objectives.
    loss_fn = cross_entropy(predicted_probs, target_var).mean()
    eval_fn = accuracy(predicted_probs, target_var).mean()
    loss_fn_val = cross_entropy(predicted_probs_val, target_var).mean()
    eval_fn_val = accuracy(predicted_probs_val, target_var).mean()

    params = L.get_all_params(self.doc_net, trainable=True)
    params = params + L.get_all_params(self.q_net, trainable=True)
    updates = lasagne.updates.adam(loss_fn, params, learning_rate=LEARNING_RATE)

    fn_inputs = [doc_var, query_var, target_var, docmask_var, qmask_var,
                 candmask_var, feat_var]
    self.train_fn = theano.function(
        fn_inputs, [loss_fn, eval_fn, predicted_probs], updates=updates)
    self.validate_fn = theano.function(
        fn_inputs, [loss_fn_val, eval_fn_val, predicted_probs_val])
def __init__(self, vocab_size, num_classes, W_init=lasagne.init.GlorotNormal()):
    """Declare symbolic inputs and compile the train/validate functions.

    ``self.build_network`` supplies the loss, the parameter list and the
    test-time output; an L2 penalty scaled by ``REGULARIZATION`` is added to
    the loss before the RMSProp updates are derived.
    """
    doc_var = T.imatrix('doc')
    query_var = T.imatrix('quer')
    docmask_var = T.bmatrix('doc_mask')
    qmask_var = T.bmatrix('q_mask')
    target_var = T.imatrix('ans')
    feat_var = T.bmatrix('feat')

    # Argument order of both compiled functions.
    self.inps = [doc_var, query_var, target_var,
                 docmask_var, qmask_var, feat_var]

    loss, self.params, test_out = self.build_network(
        vocab_size, num_classes, W_init, *self.inps)

    l2_penalty = lasagne.regularization.apply_penalty(
        self.params, lasagne.regularization.l2)
    loss = loss + REGULARIZATION * l2_penalty

    updates = lasagne.updates.rmsprop(
        loss, self.params, learning_rate=LEARNING_RATE,
        rho=0.95, epsilon=0.0001)

    self.train_fn = theano.function(self.inps, loss, updates=updates)
    self.validate_fn = theano.function(self.inps, test_out,
                                       on_unused_input='warn')
def __init__(self, name, config):
    """Register embeddings, output layers, encoders and decoder from *config*.

    Also compiles ``predict_fun`` (softmax over the emission of a decoder
    state) and ``encode_fun`` (``self.encode`` on an input/mask pair).
    """
    super().__init__(name)
    self.config = config

    # Embedding tables, one row per entry of the respective encoder vocabulary.
    src_dims = config['src_embedding_dims']
    self.param('src_embeddings',
               (len(config['src_encoder']), src_dims),
               init_f=Gaussian(fan_in=src_dims))
    trg_dims = config['trg_embedding_dims']
    self.param('trg_embeddings',
               (len(config['trg_encoder']), trg_dims),
               init_f=Gaussian(fan_in=trg_dims))

    self.add(Linear('hidden', config['decoder_state_dims'], trg_dims))
    # The emission layer is handed the transposed target embeddings as weights.
    self.add(Linear('emission', trg_dims, len(config['trg_encoder']),
                    w=self._trg_embeddings.T))

    enc_dims = config['encoder_state_dims']
    for prefix, backwards in (('fwd', False), ('back', True)):
        # The backward encoder's input is wider by encoder_state_dims.
        input_dims = src_dims
        if backwards:
            input_dims = src_dims + enc_dims
        self.add(Sequence(
            prefix + '_encoder', LSTM, backwards,
            input_dims, enc_dims,
            layernorm=config['encoder_layernorm'],
            dropout=config['encoder_dropout'],
            trainable_initial=True,
            offset=0))

    self.add(Sequence(
        'decoder', LSTM, False,
        trg_dims, config['decoder_state_dims'],
        layernorm=config['decoder_layernorm'],
        dropout=config['decoder_dropout'],
        attention_dims=config['attention_dims'],
        attended_dims=2 * enc_dims,
        trainable_initial=False,
        offset=-1))

    h_t = T.matrix('h_t')
    emitted = self.emission(T.tanh(self.hidden(h_t)))
    self.predict_fun = function([h_t], T.nnet.softmax(emitted))

    inputs = T.lmatrix('inputs')
    inputs_mask = T.bmatrix('inputs_mask')
    encoded = self.encode(inputs, inputs_mask)
    self.encode_fun = function([inputs, inputs_mask], encoded)
def build_network(input_size, hidden_size, constraint_adj=False):
    """Build a tied-weight, single-hidden-layer network over a byte matrix.

    The hidden layer is ``sigmoid(X . W + b_hidden)`` and the output is
    ``softmax(hidden . W^T + b_output)``; the same weight matrix is used in
    both directions.

    Parameters
    ----------
    input_size, hidden_size
        Passed to ``U.initial_weights`` to size the weight matrix and biases.
    constraint_adj : bool
        Accepted for interface compatibility.  NOTE: currently a no-op; the
        adjacency-constraint term it used to enable is disabled.

    Returns
    -------
    tuple
        ``(X, output, cost, P)``: the symbolic input, the softmax output, the
        cost from ``build_error`` and the ``Parameters`` container.
    """
    P = Parameters()
    X = T.bmatrix('X')

    P.W_input_hidden = U.initial_weights(input_size, hidden_size)
    P.b_hidden = U.initial_weights(hidden_size)
    P.b_output = U.initial_weights(input_size)

    hidden_lin = T.dot(X, P.W_input_hidden) + P.b_hidden
    hidden = T.nnet.sigmoid(hidden_lin)
    output = T.nnet.softmax(T.dot(hidden, P.W_input_hidden.T) + P.b_output)

    cost = build_error(X, output, P)
    return X, output, cost, P
def build_network(input_size, hidden_size, constraint_adj=False):
    """Build a tied-weight, single-hidden-layer network over a byte matrix.

    The hidden layer is ``sigmoid(X . W + b_hidden)`` and the output is
    ``softmax(hidden . W^T + b_output)``; the same weight matrix is used in
    both directions.

    Parameters
    ----------
    input_size, hidden_size
        Passed to ``U.initial_weights`` to size the weight matrix and biases.
    constraint_adj : bool
        Accepted for interface compatibility.  NOTE: currently a no-op; the
        adjacency-constraint term it used to enable is disabled.

    Returns
    -------
    tuple
        ``(X, output, cost, P)``: the symbolic input, the softmax output, the
        cost from ``build_error`` and the ``Parameters`` container.
    """
    P = Parameters()
    X = T.bmatrix('X')

    P.W_input_hidden = U.initial_weights(input_size, hidden_size)
    P.b_hidden = U.initial_weights(hidden_size)
    P.b_output = U.initial_weights(input_size)

    hidden_lin = T.dot(X, P.W_input_hidden) + P.b_hidden
    hidden = T.nnet.sigmoid(hidden_lin)
    output = T.nnet.softmax(T.dot(hidden, P.W_input_hidden.T) + P.b_output)

    cost = build_error(X, output, P)
    return X, output, cost, P
def run(gates, num_registers, max_int, num_timesteps, num_layers, reg_lambda,
        params, clip_gradients=None):
    """Unroll the machine for ``num_timesteps`` steps and build its cost.

    Returns
    -------
    tuple
        ``(debug, initial_mem, desired_mem, cost_mask, mem, final_cost,
        entropy_weight)`` where ``debug`` maps names to intermediate
        expressions, ``mem`` is the memory after the last step and
        ``final_cost`` is the summed step cost plus the regularization term.
    """
    params = make_broadcastable(params, clip_gradients=clip_gradients)

    # Symbolic inputs: the machine's starting memory, the memory we want it
    # to produce, a mask over the cost, and the entropy weight.
    initial_mem = dtensor3("InMem")
    desired_mem = imatrix("OutMem")
    cost_mask = bmatrix("CostMask")
    entropy_weight = dscalar("EntropyWeight")

    # Every register starts as a one-hot encoding of zero; the tensor is
    # built directly rather than through to_one_hot.
    register_shape = (initial_mem.shape[0], num_registers, max_int)
    initial_registers = zeros(register_shape, dtype='float64')
    initial_registers = set_subtensor(initial_registers[:, :, 0], 1.0)

    # State threaded through the steps: registers, memory, cost, cumulative
    # probability complete (starts at zero) and probability incomplete
    # (starts at one).
    v0 = as_tensor(0)
    v1 = as_tensor(1)
    state = (initial_registers, initial_mem, v0, v0, v1)

    debug = {}
    for timestep in range(num_timesteps):
        debug_local, state = step_cost(
            gates, max_int, desired_mem, cost_mask, num_timesteps,
            num_registers, num_layers, entropy_weight, timestep + 1,
            *state, params)
        for key, value in debug_local.items():
            debug["%d:%s" % (timestep, key)] = value

    # Regularization: squared magnitude of all parameters, to avoid
    # overfitting simple examples.
    reg_cost = reg_lambda * sum((p * p).sum() for p in params)
    debug['cost-regularization'] = reg_cost

    # Final cost is regularization plus the accumulated loss.
    final_cost = reg_cost + state[2].sum()
    debug['cost-final'] = final_cost

    mem = state[1]
    return (debug, initial_mem, desired_mem, cost_mask, mem, final_cost,
            entropy_weight)
def __init__(self, model):
    """Initialize the latent distance model for the adjacency matrix.

    Each of the ``N`` nodes has a location with ``N_dims`` coordinates; the
    probability of an edge decays exponentially with the L1 distance between
    the two locations, scaled by ``delta``.

    Parameters
    ----------
    model : dict
        Must provide ``model['N']`` and ``model['network']['graph']`` with
        the keys ``N_dims`` and ``location_prior`` (and, optionally,
        ``rho_refractory`` for the diagonal of the edge probabilities).
    """
    self.model = model
    self.prms = model['network']['graph']
    self.N = model['N']
    self.N_dims = self.prms['N_dims']

    # Create a location prior
    self.location_prior = create_prior(self.prms['location_prior'])

    # Locations are supplied as a flat vector and viewed as an N x N_dims
    # matrix.
    self.L = T.dvector('L')
    self.Lm = T.reshape(self.L, (self.N, self.N_dims))

    # Compute the distance between each pair of locations by subtracting an
    # Nx1xD view from a 1xNxD view.  dimshuffle's 'x' axes are already
    # broadcastable; the former T.addbroadcast(...) calls discarded their
    # return value and therefore had no effect, so they were removed.
    L1 = self.Lm.dimshuffle(0, 'x', 1)  # Nx1xD
    L2 = self.Lm.dimshuffle('x', 0, 1)  # 1xNxD

    # The L1 norm is used because gradients did not work with the
    # Euclidean / squared-Euclidean forms tried previously:
    #   T.sqrt(T.sum((L1-L2)**2, axis=2)) and T.sum((L1-L2)**2, axis=2)
    self.D = (L1 - L2).norm(1, axis=2)

    # There is a distance scale, \delta
    self.delta = T.dscalar(name='delta')

    # Define complete adjacency matrix
    self.A = T.bmatrix('A')

    # The probability of A is exponentially decreasing in delta
    self.pA = T.exp(-1.0 * self.D / self.delta)
    if 'rho_refractory' in self.prms:
        # Overwrite the diagonal with rho_refractory: adding
        # eye * (rho - pA) leaves off-diagonal entries untouched.
        self.pA = self.pA + T.eye(self.N) * (self.prms['rho_refractory'] - self.pA)

    # Define log probability: Bernoulli likelihood of A plus the prior on L.
    self.log_p = T.sum(self.A * T.log(self.pA) +
                       (1 - self.A) * T.log(1 - self.pA)) + \
        self.location_prior.log_p(self.L)
def make_node(self, state, time):
    """Build the Apply node that applies this op to ``state`` and ``time``.

    Parameters
    ----------
    state : array_like
        The state to transform into feature space
    time : int
        The current time being processed

    Returns
    -------
    theano.Apply
        Node with the two converted inputs and one ``bmatrix`` output.
    """
    # Accept plain Python/NumPy values by converting them to symbolic tensors.
    node_inputs = [T.as_tensor_variable(state), T.as_tensor_variable(time)]
    node_outputs = [T.bmatrix()]
    return theano.Apply(self, node_inputs, node_outputs)
def declare_theano_variables(output_layer, model, verbose=True):
    """
    Define target, network output, cost and learning rate.

    Parameters
    ----------
    output_layer: Lasagne layer
        Output layer.
    model: model specification file
        Contains the model config; ``learning_rate`` and ``momentum`` are
        read here.
    verbose: bool
        Print info if True.

    Returns
    -------
    target: Theano tensor
        Prediction target.
    stochastic_out:
        Result of ``define_cost`` with ``determ=False``.
    deterministic_out:
        Result of ``define_cost`` with ``determ=True``.
    learning_rate: Theano shared variable
        Learning rate for the optimizers.
    """
    if verbose:
        print('\tDeclaring theano variables...')

    # Shrink the base rate by 0.9 * momentum of itself, to counteract the
    # larger update steps that momentum yields.
    base_rate = model.learning_rate
    lr = base_rate - 0.9 * base_rate * model.momentum
    lr_value = np.asarray(lr, dtype=theano.config.floatX)
    learning_rate = theano.shared(lr_value)

    # Placeholder for the prediction target used by both cost expressions.
    target = T.bmatrix('target')

    stochastic_out = define_cost(output_layer, target, model, determ=False)
    deterministic_out = define_cost(output_layer, target, model, determ=True)

    return target, stochastic_out, deterministic_out, learning_rate
def _initialize_predict_function(self):
    """Compile ``self.predict`` from nested time-step / note-step scans.

    The outer scan advances the time model; for every time step an inner
    scan runs the note model over the time model's last-layer output and
    samples whether each note is played and articulated.
    """

    def predicted_note_step(time_model_output, *states):
        # The last state is the previous prediction; the rest are hiddens.
        hiddens = list(states[:-1])
        last_prediction = states[-1]
        note_input = T.concatenate([time_model_output, last_prediction])
        note_model_output = self.note_model.forward(
            note_input, prev_hiddens=hiddens)
        probabilities = note_model_output[-1]

        generator = T.shared_randomstreams.RandomStreams(
            np.random.randint(0, 1024))
        is_note_played = probabilities[0] > generator.uniform()
        # A note can only be articulated if it is played.
        articulation_draw = probabilities[1] > generator.uniform()
        is_note_articulated = articulation_draw * is_note_played
        prediction = T.cast(
            T.stack(is_note_played, is_note_articulated), 'int8')
        return note_model_output + [prediction]

    def predicted_time_step(*states):
        # States end with [..., model input, time]; the rest are hiddens.
        hiddens = list(states[:-2])
        time_model_input = states[-2]
        time_model_output = self.time_model.forward(
            time_model_input, prev_hiddens=hiddens)
        last_layer = time_model_output[-1]

        initial_note = T.alloc(0, output_size)
        note_outputs_info = self.get_time_prediction_outputs_info(
            initial_note)
        notes_model_output, updates = theano.scan(
            fn=predicted_note_step,
            sequences=[last_layer],
            outputs_info=note_outputs_info)

        output = notes_model_output[-1]
        time = states[-1]
        next_input = OutputTransformer()(output, time + 1)
        step_outputs = time_model_output + [next_input, time + 1, output]
        return step_outputs, updates

    length = T.iscalar()
    initial_note = T.bmatrix()
    num_notes = initial_note.shape[0]
    time_outputs_info = self.get_prediction_outputs_info(
        num_notes, initial_note)

    time_model_output, updates = theano.scan(
        fn=predicted_time_step,
        outputs_info=time_outputs_info,
        n_steps=length)
    prediction = time_model_output[-1]

    self.predict = theano.function(
        [length, initial_note],
        outputs=prediction,
        updates=updates,
        allow_input_downcast=True)
def build_model(self):
    """Allocate the macro-batch buffers and compile train/test functions.

    Four functions are compiled, all taking a single integer ``i``:
    ``train_fn`` / ``test_fn`` operate on the ``i``-th micro batch of the
    shared macro batch, while ``train_rest_fn`` / ``test_rest_fn`` operate
    on its first ``i`` rows.
    """
    print("Building model and compiling functions...")

    def macro_buffer(sample, dtype):
        # Uninitialised shared storage: macro_batch_size rows, trailing
        # dimensions taken from the sample.
        shape = (self.macro_batch_size,) + sample.shape[1:]
        return theano.shared(np.empty(shape, dtype=dtype), borrow=True)

    self.sentences_macro_batch = macro_buffer(self.sentences[0], np.int32)
    self.masks_macro_batch = macro_buffer(self.masks[0], np.int8)
    self.labels_macro_batch = macro_buffer(self.labels[0],
                                           theano.config.floatX)

    sentences_in = T.imatrix('sentences')
    masks_in = T.bmatrix('masks')
    labels_in = T.fvector('labels')
    i = T.iscalar()

    flattened = self.define_layers(sentences_in, masks_in)
    self.model = flattened

    # Clip predictions away from exactly 0 and 1.
    prediction = T.clip(lasagne.layers.get_output(flattened),
                        1.0e-7, 1.0 - 1.0e-7)
    test_prediction = T.clip(
        lasagne.layers.get_output(flattened, deterministic=True),
        1.0e-7, 1.0 - 1.0e-7)
    loss, test_loss = self.define_losses(prediction, test_prediction,
                                         labels_in)

    params = lasagne.layers.get_all_params(flattened, trainable=True)
    updates = lasagne.updates.adadelta(loss, params)

    def micro_batch_givens():
        # Substitute the i-th micro batch of each shared buffer.
        start = i * self.micro_batch_size
        stop = (i + 1) * self.micro_batch_size
        return {
            sentences_in: self.sentences_macro_batch[start:stop],
            masks_in: self.masks_macro_batch[start:stop],
            labels_in: self.labels_macro_batch[start:stop],
        }

    def leading_rows_givens():
        # Substitute the first i rows of each shared buffer.
        return {
            sentences_in: self.sentences_macro_batch[:i],
            masks_in: self.masks_macro_batch[:i],
            labels_in: self.labels_macro_batch[:i],
        }

    train_outputs = [loss, prediction]
    test_outputs = [test_loss, test_prediction]

    self.train_fn = theano.function(
        [i], train_outputs, updates=updates, givens=micro_batch_givens())
    self.train_rest_fn = theano.function(
        [i], train_outputs, updates=updates, givens=leading_rows_givens())
    self.test_fn = theano.function(
        [i], test_outputs, givens=micro_batch_givens())
    self.test_rest_fn = theano.function(
        [i], test_outputs, givens=leading_rows_givens())
def __init__(self, model):
    """Set up a graph model with fixed edge probabilities.

    Every entry of the N x N probability matrix is ``rho``; when the graph
    parameters contain ``rho_refractory`` it replaces the diagonal.
    """
    self.model = model
    graph_prms = model['network']['graph']
    self.prms = graph_prms
    N = model['N']

    self.rho = graph_prms['rho'] * np.ones((N, N))
    if 'rho_refractory' in graph_prms:
        self.rho[np.diag_indices(N)] = graph_prms['rho_refractory']
    self.pA = theano.shared(value=self.rho, name='pA')

    # Complete adjacency matrix
    self.A = T.bmatrix('A')

    # Log probability.  The probabilities are clamped to
    # [1e-8, 1 - 1e-8] before the log so that neither term is log(0).
    log_edge = np.log(np.minimum(1.0-1e-8, self.rho))
    log_no_edge = np.log(np.maximum(1e-8, 1.0 - self.rho))
    self.log_p = T.sum(self.A * log_edge + (1 - self.A) * log_no_edge)
def test1_ndarray(self):
    """Check ``FAS`` metadata for a scalar, a vector and a matrix index."""
    i0 = TT.iscalar()
    i1 = TT.lvector()
    i2 = TT.bmatrix()

    # Only the scalar index is expected to produce a view of input 0.
    cases = (
        (i0, {0: [0]}),
        (i1, {}),
        (i2, {}),
    )
    for index_var, expected_view_map in cases:
        f = FAS([index_var])
        assert f.idx_tuple == (index_var.type,)
        assert f.view_map == expected_view_map
        assert f.n_in == 2
def simpleRecurrentModel(
    word2vec,
    inputVocabSize,
    batch_size,
    maxGrad,
    hiddenSize,
):
    """Assemble an embedding -> GRU -> dense -> sigmoid Lasagne network.

    Returns
    -------
    tuple
        ``(l_in, l_mask, l_out)``: the token input layer, the mask input
        layer and the single-unit sigmoid output layer.
    """
    print("Building Model ...")
    layers = lasagne.layers
    sequence_shape = (batch_size, None)

    # Inputs: token ids and the matching mask.
    l_in = layers.InputLayer(sequence_shape, T.imatrix())
    l_mask = layers.InputLayer(sequence_shape, T.bmatrix())

    # Embedding table initialised from word2vec.
    l_embedding = layers.EmbeddingLayer(
        incoming=l_in,
        input_size=inputVocabSize,
        output_size=word2vecDimension,
        W=word2vec)

    # Encode the sentence; only the final GRU state is kept.
    l_encoding = layers.GRULayer(
        l_embedding,
        hiddenSize,
        mask_input=l_mask,
        grad_clipping=maxGrad,
        only_return_final=True)

    # Intermediate rectified dense layer.
    l_classify = layers.DenseLayer(
        l_encoding,
        num_units=hiddenSize,
        nonlinearity=lasagne.nonlinearities.rectify)

    # Single sigmoid unit for the sentiment prediction.
    l_out = layers.DenseLayer(
        l_classify,
        num_units=1,
        nonlinearity=lasagne.nonlinearities.sigmoid)

    return l_in, l_mask, l_out
def GetProbFunctions(num_features, learning_rate=1e-4, ret_updates=True):
    """Build the graph network and compile its action/update functions.

    Returns
    -------
    tuple
        ``(net, action_fn, updates_fn)`` when ``ret_updates`` is true,
        otherwise ``(net, action_fn)``.  ``action_fn`` maps (adjustment,
        features, mask) to probabilities; ``updates_fn`` additionally takes
        a reward and applies momentum updates.
    """
    adjustment_var = T.bmatrix(name='Adjustment matrix')
    features_var = T.fmatrix(name='Features')
    mask_var = T.bvector(name='Filter mask')
    reward_var = T.scalar(name='Reward')

    net = BuildGraphNetwork(adjustment_var, features_var, mask_var,
                            num_features)
    desc = lasagne.layers.get_output(net['desc'])
    clipped_desc = theano.gradient.grad_clip(desc, -1, 1)
    prob = msoftmax(clipped_desc)

    # The gradient w.r.t. prob is supplied directly as reward / prob
    # instead of being derived from a scalar cost.
    reward_grad = reward_var / prob
    params = lasagne.layers.get_all_params(net['desc'], trainable=True)
    grads = theano.grad(None, params, known_grads={prob: reward_grad})
    updates = lasagne.updates.momentum(grads, params,
                                       learning_rate=learning_rate)

    state_inputs = [adjustment_var, features_var, mask_var]
    action_fn = theano.function(state_inputs, prob)
    if not ret_updates:
        return net, action_fn

    updates_fn = theano.function(
        state_inputs + [reward_var],
        [],
        updates=updates,
        allow_input_downcast=True)
    return net, action_fn, updates_fn
def train():
    """Train the network and checkpoint it whenever validation improves.

    Loads (generating first if missing) the pickled training and validation
    sets, compiles Theano train/validation functions over shared copies of
    the data, then runs ``num_epoch`` epochs.  After each epoch the mean
    training loss and mean validation accuracy are printed, and the
    ``(input_var, mask_var, network)`` triple is pickled to ``lstm_path``
    when the accuracy beats the best seen so far.
    """
    if not os.path.exists(train_dataset_path):
        generate_dataset()

    # Pickles are binary data: open them in binary mode and close the
    # handles deterministically.
    # NOTE(review): unpickling executes arbitrary code; the dataset files
    # must be trusted.
    with open(train_dataset_path, 'rb') as train_file:
        train_x, train_x_mask, train_y = cPickle.load(train_file)
    with open(valid_dataset_path, 'rb') as valid_file:
        valid_x, valid_x_mask, valid_y = cPickle.load(valid_file)

    # Floor division: only complete batches are used.
    num_train_batchs = len(train_y) // batch_size
    num_valid_batchs = len(valid_y) // valid_batch_size
    print('t: %d, tb: %d, v: %d, vb: %d' % (
        len(train_y), num_train_batchs, len(valid_y), num_valid_batchs))

    shared_x_train, shared_y_train = shared_dataset(train_x, train_y)
    shared_mask = shared_data(train_x_mask, dtype='int8')
    shared_x_valid, shared_y_valid = shared_dataset(valid_x, valid_y)
    shared_valid_mask = shared_data(valid_x_mask, dtype='int8')

    index = T.lscalar('index')
    input_var = T.lmatrix('input')
    target_var = T.ivector('target')
    mask_var = T.bmatrix('mask')

    network = build_model(max_seq_length, input_var, mask_var)
    prediction = lasagne.layers.get_output(network)
    test_output = lasagne.layers.get_output(network, deterministic=True)
    test_acc = T.mean(
        T.eq(T.argmax(test_output, axis=1), target_var),
        dtype=theano.config.floatX)
    loss = lasagne.objectives.categorical_crossentropy(
        prediction, target_var).mean()
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.adadelta(loss, params, learning_rate)

    # Both functions take a batch index and read the matching slice of the
    # shared datasets.
    train_fn = theano.function(
        [index],
        outputs=loss,
        updates=updates,
        givens={
            input_var: shared_x_train[index * batch_size: (index + 1) * batch_size],
            target_var: shared_y_train[index * batch_size: (index + 1) * batch_size],
            mask_var: shared_mask[index * batch_size: (index + 1) * batch_size],
        }
    )
    valid_fn = theano.function(
        [index],
        outputs=test_acc,
        givens={
            input_var: shared_x_valid[index * valid_batch_size: (index + 1) * valid_batch_size],
            target_var: shared_y_valid[index * valid_batch_size: (index + 1) * valid_batch_size],
            mask_var: shared_valid_mask[index * valid_batch_size: (index + 1) * valid_batch_size],
        }
    )
    print('compile over...')

    best_acc = 0.0
    for epoch in xrange(num_epoch):
        epoch_loss = 0.0
        epoch_acc = 0.0
        # Materialise the indices so they can be shuffled in place.
        indices = list(range(0, num_train_batchs))
        numpy.random.shuffle(indices)
        start_time = time.time()
        for batch in indices:
            epoch_loss += train_fn(batch)
        for batch in range(0, num_valid_batchs):
            epoch_acc += valid_fn(batch)
        epoch_loss /= num_train_batchs
        epoch_acc /= num_valid_batchs
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epoch, time.time() - start_time))
        print(" training loss:\t\t{:.6f}".format(epoch_loss))
        print(" valid accuracy:\t\t{:.2f} %\n".format(epoch_acc * 100))
        if best_acc < epoch_acc:
            best_acc = epoch_acc
            # Binary mode plus 'with' guarantees the checkpoint is flushed
            # and closed before training continues.
            with open(lstm_path, 'wb') as model_file:
                cPickle.dump((input_var, mask_var, network), model_file)
            print('save lstm to %s, best valid accuracy: %.2f%%\n' % (
                lstm_path, best_acc * 100))
def make_node(self, state, time):
    """Build the Apply node that applies this op to ``state`` and ``time``.

    Both arguments are converted to Theano tensor variables; the node has a
    single ``bmatrix`` output.
    """
    node_inputs = [T.as_tensor_variable(state), T.as_tensor_variable(time)]
    node_outputs = [T.bmatrix()]
    return theano.Apply(self, node_inputs, node_outputs)
def __init__(self, K, vocab_size, num_chars, W_init, regularizer, rlambda,
             nhidden, embed_dim, dropout, train_emb, subsample, char_dim,
             use_feat):
    """Store hyper-parameters, build the network and compile its functions.

    When ``rlambda`` is positive the embeddings start from a perturbed copy
    of ``W_init`` and the loss carries an extra ``rlambda * norm(W_emb -
    W_init)`` term, where ``norm`` is Lasagne's L2 penalty if
    ``regularizer == 'l2'`` and its L1 penalty otherwise.
    """
    self.nhidden = nhidden
    self.embed_dim = embed_dim
    self.dropout = dropout
    self.train_emb = train_emb
    self.subsample = subsample
    self.char_dim = char_dim
    self.learning_rate = LEARNING_RATE
    self.num_chars = num_chars
    self.use_feat = use_feat

    if regularizer == 'l2':
        norm = lasagne.regularization.l2
    else:
        norm = lasagne.regularization.l1

    # Character features are in use whenever a character dimension is set.
    self.use_chars = self.char_dim != 0

    if W_init is None:
        W_init = lasagne.init.GlorotNormal().sample(
            (vocab_size, self.embed_dim))

    # Symbolic inputs.
    doc_var = T.itensor3('doc')
    query_var = T.itensor3('quer')
    cand_var = T.wtensor3('cand')
    docmask_var = T.bmatrix('doc_mask')
    qmask_var = T.bmatrix('q_mask')
    candmask_var = T.bmatrix('c_mask')
    target_var = T.ivector('ans')
    feat_var = T.imatrix('feat')
    doc_toks = T.imatrix('dchars')
    qry_toks = T.imatrix('qchars')
    tok_var = T.imatrix('tok')
    tok_mask = T.bmatrix('tok_mask')
    cloze_var = T.ivector('cloze')

    # Argument order of the compiled functions.
    self.inps = [
        doc_var, doc_toks, query_var, qry_toks, cand_var, target_var,
        docmask_var, qmask_var, tok_var, tok_mask, candmask_var, feat_var,
        cloze_var
    ]

    # Perturb the initial embeddings only when the penalty is active.
    W_pert = (W_init + lasagne.init.GlorotNormal().sample(W_init.shape)
              if rlambda > 0. else W_init)

    (self.predicted_probs, predicted_probs_val, self.doc_net, self.q_net,
     W_emb) = self.build_network(K, vocab_size, W_pert)

    # Penalty on how far the embeddings have moved from W_init.
    emb_penalty = rlambda * norm(W_emb - W_init)

    train_xent = T.nnet.categorical_crossentropy(
        self.predicted_probs, target_var).mean()
    self.loss_fn = train_xent + emb_penalty
    self.eval_fn = lasagne.objectives.categorical_accuracy(
        self.predicted_probs, target_var).mean()

    val_xent = T.nnet.categorical_crossentropy(
        predicted_probs_val, target_var).mean()
    loss_fn_val = val_xent + emb_penalty
    eval_fn_val = lasagne.objectives.categorical_accuracy(
        predicted_probs_val, target_var).mean()

    self.params = L.get_all_params([self.doc_net] + self.q_net,
                                   trainable=True)
    updates = lasagne.updates.adam(self.loss_fn, self.params,
                                   learning_rate=self.learning_rate)

    self.train_fn = theano.function(
        self.inps,
        [self.loss_fn, self.eval_fn, self.predicted_probs],
        updates=updates,
        on_unused_input='warn')
    self.validate_fn = theano.function(
        self.inps,
        [loss_fn_val, eval_fn_val, predicted_probs_val],
        on_unused_input='warn')
def setup_predict(self):
    """Compile ``predict_fun`` and ``predict_thought_fun``.

    In prediction mode the note steps are nested inside the time steps:
    an outer scan advances the time model and, for every time step, an
    inner scan runs ``self._predict_step_note`` over the notes.
    """
    self.predict_seed = T.bmatrix()
    self.steps_to_simulate = T.iscalar()

    def step_time(*states):
        # States is [ *hiddens, prev_result, time]
        hiddens = list(states[:-2])
        in_data = states[-2]
        time = states[-1]

        # Correct for dropout: every layer except the first receives the
        # keep probability.
        if self.dropout > 0:
            masks = [1 - self.dropout] * len(self.time_model.layers)
            masks[0] = None
        else:
            masks = []

        new_states = self.time_model.forward(
            in_data, prev_hiddens=hiddens, dropout=masks)
        # new_states is a list of matrix [layer](notes, hidden_states),
        # one per layer.
        time_final = get_last_layer(new_states)

        start_note_values = theano.tensor.alloc(0, 2)

        # During training the time net's activations can be combined with
        # the known choices up front.  When predicting, those choices do
        # not exist yet, so the inner scan iterates over the activations
        # only and each step mixes in the previous output.  Feeding outputs
        # back as inputs requires one more outputs_info entry: the initial
        # "previous" output of zero.
        pitch_states = [initial_state_with_taps(layer)
                        for layer in self.pitch_model.layers]
        note_outputs_info = pitch_states + [
            dict(initial=start_note_values, taps=[-1])
        ]
        notes_result, updates = theano.scan(
            fn=self._predict_step_note,
            sequences=[time_final],
            outputs_info=note_outputs_info)

        # notes_result is a list of matrix [layer/output](notes, onOrArtic)
        output = get_last_layer(notes_result)

        # TODO: Fix time
        next_input = OutputFormToInputFormOp()(output, time + 1)

        step_outputs = ensure_list(new_states) + [next_input, time + 1, output]
        return step_outputs, updates

    num_notes = self.predict_seed.shape[0]

    time_states = [initial_state_with_taps(layer, num_notes)
                   for layer in self.time_model.layers]
    time_outputs_info = time_states + [
        dict(initial=self.predict_seed, taps=[-1]),
        dict(initial=0, taps=[-1]),
        None
    ]

    time_result, updates = theano.scan(
        fn=step_time,
        outputs_info=time_outputs_info,
        n_steps=self.steps_to_simulate
    )

    self.predict_thoughts = time_result
    self.predicted_output = time_result[-1]

    # Both functions share the same inputs.
    fn_inputs = [self.steps_to_simulate, self.conservativity,
                 self.predict_seed]
    self.predict_fun = theano.function(
        inputs=fn_inputs,
        outputs=self.predicted_output,
        updates=updates,
        allow_input_downcast=True)
    self.predict_thought_fun = theano.function(
        inputs=fn_inputs,
        outputs=ensure_list(self.predict_thoughts),
        updates=updates,
        allow_input_downcast=True)
def ready(self):
    """Build the symbolic encoder graph, its predictions and its cost.

    Stacks ``args.depth`` recurrent layers over the embedded input, pools
    (masked mean) or takes the last state of each layer, feeds the result
    through a sigmoid output layer and defines the squared-error loss plus
    an L2 penalty.  All intermediate expressions are stored on ``self``.
    """
    args = self.args
    embedding_layer = self.embedding_layer
    padding_id = embedding_layer.vocab_map["<padding>"]

    dropout = self.dropout = theano.shared(
        np.float64(args.dropout).astype(theano.config.floatX))

    # len*batch
    x = self.x = T.imatrix()
    z = self.z = T.bmatrix()
    z = z.dimshuffle((0, 1, "x"))

    # batch*nclasses
    y = self.y = T.fmatrix()

    n_d = args.hidden_dimension
    n_e = embedding_layer.n_d
    activation = get_activation_by_name(args.activation)

    layers = self.layers = []
    depth = args.depth
    layer_type = args.layer.lower()
    for i in range(depth):
        # Only the first layer consumes embeddings directly.
        n_in = n_e if i == 0 else n_d
        if layer_type == "rcnn":
            layer = ExtRCNN(n_in=n_in, n_out=n_d, activation=activation,
                            order=args.order)
        elif layer_type == "lstm":
            layer = ExtLSTM(n_in=n_in, n_out=n_d, activation=activation)
        layers.append(layer)

    # len * batch * 1
    not_padding = T.neq(x, padding_id).dimshuffle((0, 1, "x"))
    masks = T.cast(not_padding * z, theano.config.floatX)
    # batch * 1 (the epsilon keeps the mean-pooling division finite)
    cnt_non_padding = T.sum(masks, axis=0) + 1e-8

    # (len*batch)*n_e
    embs = embedding_layer.forward(x.ravel())
    # len*batch*n_e
    embs = embs.reshape((x.shape[0], x.shape[1], n_e))
    embs = apply_dropout(embs, dropout)

    pooling = args.pooling
    lst_states = []
    h_prev = embs
    for layer in layers:
        # len*batch*n_d
        h_next = layer.forward_all(h_prev, z)
        if pooling:
            # batch * n_d, mean pooling over non-masked positions
            masked_sum = T.sum(h_next * masks, axis=0)
            lst_states.append(masked_sum / cnt_non_padding)
        else:
            # last state
            lst_states.append(h_next[-1])
        h_prev = apply_dropout(h_next, dropout)

    if args.use_all:
        size = depth * n_d
        # batch * size (i.e. n_d*depth)
        h_final = T.concatenate(lst_states, axis=1)
    else:
        size = n_d
        h_final = lst_states[-1]
    h_final = apply_dropout(h_final, dropout)

    output_layer = self.output_layer = Layer(
        n_in=size, n_out=self.nclasses, activation=sigmoid)

    # batch * nclasses
    preds = self.preds = output_layer.forward(h_final)

    # Squared error per prediction, averaged into the loss.
    loss_mat = self.loss_mat = (preds - y)**2
    loss = self.loss = T.mean(loss_mat)

    pred_diff = self.pred_diff = T.mean(
        T.max(preds, axis=1) - T.min(preds, axis=1))

    params = self.params = []
    for component in layers + [output_layer]:
        for p in component.params:
            params.append(p)

    nparams = sum(len(p.get_value(borrow=True).ravel()) for p in params)
    say("total # parameters: {}\n".format(nparams))

    # Sum of squared parameters, scaled by args.l2_reg.
    l2_cost = None
    for p in params:
        squared = T.sum(p**2)
        l2_cost = squared if l2_cost is None else l2_cost + squared
    l2_cost = l2_cost * args.l2_reg
    self.l2_cost = l2_cost

    cost = self.cost = loss * 10 + l2_cost
def test_local_gpu_elemwise_0():
    """
    Test local_gpu_elemwise_0 when there is a dtype upcastable to float32
    """

    def count_ops(fn, op_type):
        # Number of nodes in the compiled graph whose op is an op_type.
        nodes = fn.maker.fgraph.toposort()
        return sum(isinstance(node.op, op_type) for node in nodes)

    a = tensor.bmatrix()
    b = tensor.fmatrix()
    c = tensor.fmatrix()

    a_v = (numpy.random.rand(4, 5) * 10).astype("int8")
    b_v = (numpy.random.rand(4, 5) * 10).astype("float32")
    c_v = (numpy.random.rand(4, 5) * 10).astype("float32")

    # Due to optimization order, this composite is created when all
    # the op are on the gpu.
    f = theano.function([a, b, c], a + b + c, mode=mode_with_gpu)
    assert count_ops(f, cuda.GpuElemwise) == 1
    assert count_ops(f, tensor.Elemwise) == 1
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)

    # Now test with the composite already on the cpu before we move it
    # to the gpu
    a_s = theano.scalar.int8()
    b_s = theano.scalar.float32()
    c_s = theano.scalar.float32()
    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s + c_s])
    out_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], out_op(a, b, c), mode=mode_with_gpu)
    assert count_ops(f, cuda.GpuElemwise) == 1
    assert count_ops(f, tensor.Elemwise) == 1
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)

    # Test multiple output (first input is float32 from here on)
    a_s = theano.scalar.float32()
    a = tensor.fmatrix()
    from theano.scalar.basic import identity
    out_s = theano.scalar.Composite(
        [a_s, b_s, c_s],
        [identity(a_s), identity(c_s), identity(b_s)])
    outs_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    assert count_ops(f, cuda.GpuElemwise) == 1
    assert count_ops(f, tensor.Elemwise) == 0
    out = f(a_v, b_v, c_v)
    for got, expected in zip(out, (a_v, c_v, b_v)):
        utt.assert_allclose(got, expected)

    # Test multiple output
    out_s = theano.scalar.Composite([a_s, b_s, c_s],
                                    [a_s + b_s, a_s * c_s])
    outs_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    assert count_ops(f, cuda.GpuElemwise) == 1
    assert count_ops(f, tensor.Elemwise) == 0
    out = f(a_v, b_v, c_v)
    utt.assert_allclose(out[0], a_v + b_v)
    utt.assert_allclose(out[1], a_v * c_v)

    # Test non-contiguous input
    c = cuda.shared_constructor(c_v)
    f = theano.function([a, b], outs_op(a[::2], b[::2], c[::2]),
                        mode=mode_with_gpu)
    out = f(a_v, b_v)
    utt.assert_allclose(out[0], a_v[::2] + b_v[::2])
    utt.assert_allclose(out[1], a_v[::2] * c_v[::2])
RESNET_SGDM_LR.set_value(RESNET_SGDM_LR.get_value() * EPOCH_LR_COEFF) RECURR_SGDM_LR.set_value(RECURR_SGDM_LR.get_value() * EPOCH_LR_COEFF) ADAM_EPOCHS = 0 else: for _ in xrange(max_epoch): RESNET_ADAM_LR.set_value(RESNET_ADAM_LR.get_value() * EPOCH_LR_COEFF) RECURR_ADAM_LR.set_value(RECURR_ADAM_LR.get_value() * EPOCH_LR_COEFF) NUM_EPOCHS -= max_epoch param_values_file = 'ln_hs_param_values_{}.pkl'.format(max_epoch) logger.info('Building the network.') im_features = lasagne.layers.get_output(resnet['pool5']) im_features = T.flatten(im_features, outdim=2) # batch size, number of features cap_out_var = T.imatrix('cap_out') # batch size, seq len cap_in_var = T.imatrix('cap_in') # batch size, seq len mask_var = T.bmatrix('mask_var') # batch size, seq len gate = lasagne.layers.Gate(W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(), W_cell=lasagne.init.Normal(), b=lasagne.init.Constant(0.0)) cell_gate = lasagne.layers.Gate(W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(), W_cell=None, b=lasagne.init.Constant(0.0), nonlinearity=lasagne.nonlinearities.tanh) forget_gate = lasagne.layers.Gate(W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(), W_cell=lasagne.init.Normal(), b=lasagne.init.Constant(5.0)) l_in = lasagne.layers.InputLayer((None, None), cap_in_var, name="l_in") l_mask = lasagne.layers.InputLayer((None, None), mask_var, name="l_mask") l_hid = lasagne.layers.InputLayer((None, HIDDEN_SIZE), input_var=im_features, name="l_hid") l_emb = lasagne.layers.EmbeddingLayer(l_in, input_size=WORD_SIZE, output_size=EMBEDDING_SIZE, name="l_emb") l_lstm = LNLSTMLayer(l_emb, HIDDEN_SIZE, ingate=gate, forgetgate=forget_gate, cell=cell_gate, outgate=gate, hid_init=l_hid, peepholes=True, grad_clipping=RNN_GRAD_CLIP, mask_input=l_mask, precompute_input=False, alpha_init=lasagne.init.Constant(0.2), # as suggested by Ryan Kiros on Twitter
def main(num_epochs=DEFAULT_NUM_EPOCHS, batch_size=DEFAULT_BATCH_SIZE):
    """Train the image classifier, validate every epoch, test at the end.

    Each epoch draws 60% / 20% / 20% slices from a cycling data generator
    for training, validation and test.  The model is written to
    ``models/`` whenever the accumulated validation accuracy exceeds the
    best seen so far; the test metrics are reported once training is over.
    """
    input_var = T.tensor4('inputs')
    target_var = T.bmatrix('targets')

    network = build_neural_network(
        input_var, (batch_size, 3, IMAGE_SIZE, IMAGE_SIZE))['prob']

    # Training objective.
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.binary_crossentropy(prediction, target_var)
    loss = loss.mean()
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.adadelta(loss, params)

    # Deterministic objective and accuracy for validation/testing.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.binary_crossentropy(test_prediction,
                                                       target_var)
    test_loss = test_loss.mean()
    accuracy_expr = T.mean(
        T.eq(T.gt(test_prediction, 0.5), T.eq(target_var, 1.0)),
        dtype=theano.config.floatX)

    train_function = theano.function([input_var, target_var], loss,
                                     updates=updates)
    validation_function = theano.function([input_var, target_var],
                                          [test_loss, accuracy_expr])

    number_of_image_files = get_number_of_image_files_in_path()
    print(" ".join(["Number of files found: ", str(number_of_image_files)]))
    print("Starting training...")

    def full_batches(source):
        # Yield minibatches until the source is exhausted or a short
        # (incomplete) batch shows up.
        while True:
            try:
                batch = generate_minibatches(source, batch_size)
            except StopIteration:
                return
            if len(batch) != batch_size:
                return
            yield batch

    # Move this to the epoch loop and remove the cycle for lower memory
    # computers
    data_generator = cycle(get_input_images_and_ouput_labels())

    best_accuracy = 0
    for epoch in range(num_epochs):
        train_generator = get_percentage_of_generator(
            data_generator, number_of_image_files, batch_size, 0.6)
        validation_generator = get_percentage_of_generator(
            data_generator, number_of_image_files, batch_size, 0.2)
        test_generator = get_percentage_of_generator(
            data_generator, number_of_image_files, batch_size, 0.2)

        # Full pass over the training data.
        train_error = 0
        train_batches = 0
        start_time = time.time()
        for batch in full_batches(train_generator):
            X, Y = get_input_and_output_from_batch(batch)
            train_error += train_function(X, Y)
            train_batches += 1
        print("Finished training for epoch {} with a total of {} batches".format(
            epoch + 1, train_batches))

        # Full pass over the validation data.
        val_error = 0
        val_accuracy = 0
        val_batches = 0
        for batch in full_batches(validation_generator):
            X, Y = get_input_and_output_from_batch(batch)
            err, acc = validation_function(X, Y)
            val_error += err
            val_accuracy += acc
            val_batches += 1

        # Results for this epoch.
        mean_train_error = train_error / train_batches
        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print(" training loss:\t\t{:.6f}".format(mean_train_error))
        print(" validation loss:\t\t{:.6f}".format(val_error / val_batches))
        print(" validation accuracy:\t\t{:.2f} %".format(
            val_accuracy / val_batches * 100))
        print(" train / valid:\t\t{:.6f}".format(
            mean_train_error / (val_error / val_batches)))

        # Accumulated (not averaged) accuracy is what gets compared.
        if val_accuracy > best_accuracy:
            print("Accuracy better than previous best, saving model")
            write_model_data(
                network,
                "models/model%s.pkl" % int(val_accuracy / val_batches * 100))
            best_accuracy = val_accuracy

    # After training, compute and print the test error using the test
    # slice drawn in the last epoch.
    test_error = 0
    test_acc_total = 0
    test_batches = 0
    for batch in full_batches(test_generator):
        X, Y = get_input_and_output_from_batch(batch)
        err, acc = validation_function(X, Y)
        test_error += err
        test_acc_total += acc
        test_batches += 1

    print("Final results:")
    print(" test loss:\t\t\t{:.6f}".format(test_error / test_batches))
    print(" test accuracy:\t\t{:.2f} %".format(
        test_acc_total / test_batches * 100))
def __init__(self, args, params=None, attention=False, bidir=False, subset_grad=True, pyramid=False):
    """Build the encoder-decoder graph and compile its Theano functions.

    Args:
        args: configuration object; this method reads ``rnn_dim``,
            ``rlayers``, ``src_vocab_size``, ``tgt_vocab_size``, ``reverse``
            and ``optimizer`` and forwards the object to the encoder and
            decoder constructors.
        params: accepted for interface compatibility; not used here.
        attention: use ``RNNDecoderAttention`` and expose the decoder's
            ``hs`` in the ``encode`` / ``decode_step`` functions.
        bidir: use ``BiRNNEncoder``.
        subset_grad: take gradients w.r.t. the encoder/decoder ``subset``
            variables instead of the full embedding matrices, then write the
            result back with ``set_subtensor``.
        pyramid: use ``BiPyrRNNEncoder`` (only consulted when ``bidir`` is
            false).

    Compiled functions stored on the instance: ``train``, ``test``,
    ``encode`` and ``decode_step``.
    """
    self.rnn_dim = args.rnn_dim
    self.rlayers = args.rlayers
    self.attention = attention

    lr = T.scalar(dtype=floatX)
    pdrop = T.scalar(dtype=floatX)
    max_norm = T.scalar(dtype=floatX)

    # initialize input tensors
    src_sent = T.imatrix('src_sent')
    rev_src_sent = T.imatrix('rev_src_sent')
    src_mask = T.bmatrix('src_mask')
    tgt_sent = T.imatrix('tgt_sent')
    tgt_mask = T.bmatrix('tgt_mask')
    space_mask = T.bmatrix('space_mask')

    # build up model
    # https://groups.google.com/forum/#!topic/torch7/-NBrFw8Q6_s
    # NOTE can't use one-hot here because huge matrix multiply
    self.L_enc = theano.shared(
        uniform_init(args.src_vocab_size, args.rnn_dim, scale=0.1),
        'L_enc', borrow=True)
    self.L_dec = theano.shared(
        uniform_init(args.tgt_vocab_size, args.rnn_dim, scale=0.1),
        'L_dec', borrow=True)

    enc_input = src_sent if not args.reverse else rev_src_sent
    if bidir:
        print('Using bidirectional encoder')
        self.encoder = BiRNNEncoder(src_sent.T, rev_src_sent.T, src_mask.T,
                                    space_mask.T, self.L_enc, pdrop, args)
    elif pyramid:
        print('Using pyramid encoder')
        self.encoder = BiPyrRNNEncoder(src_sent.T, rev_src_sent.T,
                                       src_mask.T, self.L_enc, pdrop, args)
    else:
        self.encoder = RNNEncoder(enc_input.T, src_mask.T, space_mask.T,
                                  self.L_enc, pdrop, args)

    if attention:
        self.decoder = RNNDecoderAttention(self.encoder, tgt_sent.T,
                                           tgt_mask.T, self.L_dec, pdrop,
                                           args)
        hs = self.decoder.hs
    else:
        self.decoder = RNNDecoder(self.encoder.out, tgt_sent.T, tgt_mask.T,
                                  self.L_dec, pdrop, args)

    # cost, parameters, grads, updates
    self.cost = self.decoder.cost
    self.params = (self.encoder.params + self.decoder.params +
                   [self.L_enc, self.L_dec])
    if subset_grad:  # for speed
        self.grad_params = (self.encoder.params + self.decoder.params +
                            [self.encoder.subset, self.decoder.subset])
        self.updates, self.grad_norm, self.param_norm = get_opt_fn(
            args.optimizer)(self.cost, self.grad_params, lr,
                            max_norm=max_norm)
        # instead of updating L_enc and L_dec only want to update the
        # embeddings indexed, so use inc_subtensor/set_subtensor
        # http://deeplearning.net/software/theano/tutorial/faq_tutorial.html
        # The last two update pairs correspond to the two subset variables
        # appended to grad_params above.
        self.updates[-2] = (self.L_enc,
                            T.set_subtensor(self.updates[-2][0],
                                            self.updates[-2][1]))
        self.updates[-1] = (self.L_dec,
                            T.set_subtensor(self.updates[-1][0],
                                            self.updates[-1][1]))
    else:
        self.grad_params = self.params
        self.updates, self.grad_norm, self.param_norm = get_opt_fn(
            args.optimizer)(self.cost, self.grad_params, lr,
                            max_norm=max_norm)

    self.nparams = np.sum([np.prod(p.shape.eval()) for p in self.params])

    # functions
    self.train = theano.function(
        inputs=[src_sent, src_mask, rev_src_sent, tgt_sent, tgt_mask,
                space_mask, pdrop, lr, max_norm],
        outputs=[self.cost, self.grad_norm, self.param_norm],
        updates=self.updates,
        on_unused_input='warn',
        allow_input_downcast=True
    )
    # Dropout defaults to 0.0 for evaluation and encoding.
    self.test = theano.function(
        inputs=[src_sent, src_mask, rev_src_sent, tgt_sent, tgt_mask,
                space_mask, theano.In(pdrop, value=0.0)],
        outputs=self.cost,
        updates=None,
        on_unused_input='warn'
    )

    outputs = self.encoder.out
    if attention:
        outputs = self.encoder.out + [hs]
    self.encode = theano.function(
        inputs=[src_sent, rev_src_sent, src_mask, space_mask,
                theano.In(pdrop, value=0.0)],
        outputs=outputs,
        on_unused_input='warn',
        updates=None
    )

    # function for decoding step by step
    i_t = T.ivector()
    x_t = self.L_dec[i_t, :]
    h_ps = [T.matrix() for _ in range(args.rlayers)]  # previous states
    h_ts = list()
    dmask = T.ones_like(h_ps[0]).astype(floatX)

    if attention and args.rlayers == 1:
        # With a single layer the attention step happens here, so keep the
        # alignment: it is appended to the outputs below. Discarding it left
        # `align` unbound and raised NameError for this configuration.
        h_t, align = self.decoder.rlayers[0]._step(x_t, dmask, h_ps[0], hs)
    else:
        h_t = self.decoder.rlayers[0]._step(x_t, dmask, h_ps[0])
    h_ts.append(h_t)

    # NOTE no more dropout nodes here
    for k in range(1, args.rlayers):
        if attention and args.rlayers == k + 1:
            # Only the top layer attends over the encoder states.
            h_t, align = self.decoder.rlayers[k]._step(h_t, dmask, h_ps[k],
                                                       hs)
        else:
            h_t = self.decoder.rlayers[k]._step(h_t, dmask, h_ps[k])
        h_ts.append(h_t)

    # Softmax over the output layer, shifted by the row max before exp.
    E_t = T.dot(h_t, self.decoder.olayer.W) + self.decoder.olayer.b
    E_t = T.exp(E_t - T.max(E_t, axis=1, keepdims=True))
    p_t = E_t / E_t.sum(axis=1, keepdims=True)

    inputs = [i_t] + h_ps
    outputs = [p_t] + h_ts
    if attention:
        inputs = inputs + [hs]
        outputs = outputs + [align]
    self.decode_step = theano.function(
        inputs=inputs,
        outputs=outputs,
        updates=None
    )
def test_local_gpu_elemwise():
    """
    Check that local_gpu_elemwise moves an elemwise graph to the GPU when one
    of its inputs has a dtype that can be upcast to float32.
    """
    def check_graph_is_single_gpu_elemwise(fn):
        # Exactly one GpuElemwise node, and no node whose op is exactly the
        # CPU Elemwise class.
        nodes = fn.maker.fgraph.toposort()
        gpu_count = sum(isinstance(node.op, GpuElemwise) for node in nodes)
        assert gpu_count == 1
        cpu_count = sum(type(node.op) == tensor.Elemwise for node in nodes)
        assert cpu_count == 0

    a = tensor.bmatrix()
    b = tensor.fmatrix()
    c = tensor.fmatrix()

    a_v = (numpy.random.rand(4, 5) * 10).astype("int8")
    b_v = (numpy.random.rand(4, 5) * 10).astype("float32")
    c_v = (numpy.random.rand(4, 5) * 10).astype("float32")
    expected_sum = a_v + b_v + c_v

    # Due to optimization order, this composite is created when all
    # the op are on the gpu.
    f = theano.function([a, b, c], a + b + c, mode=mode_with_gpu)
    check_graph_is_single_gpu_elemwise(f)
    utt.assert_allclose(f(a_v, b_v, c_v), expected_sum)

    # Now test with the composite already on the cpu before we move it
    # to the gpu
    a_s = theano.scalar.int8()
    b_s = theano.scalar.float32()
    c_s = theano.scalar.float32()
    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s + c_s])
    out_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], out_op(a, b, c), mode=mode_with_gpu)
    check_graph_is_single_gpu_elemwise(f)
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)

    return  # Not yet implemented

    # Everything below is unreachable until multiple-output support exists.

    # Test multiple output
    a_s = theano.scalar.float32()
    a = tensor.fmatrix()
    from theano.scalar.basic import identity
    out_s = theano.scalar.Composite(
        [a_s, b_s, c_s],
        [identity(a_s), identity(c_s), identity(b_s)])
    outs_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    check_graph_is_single_gpu_elemwise(f)
    out = f(a_v, b_v, c_v)
    for position, expected in enumerate((a_v, c_v, b_v)):
        utt.assert_allclose(out[position], expected)

    # Test multiple output
    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * b_s])
    outs_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    check_graph_is_single_gpu_elemwise(f)
    out = f(a_v, b_v, c_v)
    utt.assert_allclose(out[0], a_v + b_v)
    utt.assert_allclose(out[1], a_v * c_v)

    # Test non-contiguous input
    c = gpuarray_shared_constructor(numpy.asarray(c_v, dtype="float32"))
    f = theano.function([a, b], outs_op(a[::2], b[::2], c[::2]),
                        mode=mode_with_gpu)
    out = f(a_v, b_v)
    utt.assert_allclose(out[0], a_v[::2] + b_v[::2])
    utt.assert_allclose(out[1], a_v[::2] * c_v[::2])
def __init__(self, n_actions, replay_memory, build_network, updates, screen_size, initial_weights_file=None):
    """Set the agent's hyper-parameters and compile its Theano functions.

    Args:
        n_actions: number of discrete actions; also the size of the one-hot
            lookup table stored in ``a_lookup``.
        replay_memory: stored as-is on the instance.
        build_network: callable taking ``n_actions``, ``input_var`` and
            ``screen_size`` keyword arguments; called twice, once for the
            live network and once for the stale copy.
        updates: callable ``updates(loss, params)`` producing the Theano
            updates used by ``train_fn``.
        screen_size: ``(width, height)`` pair.
        initial_weights_file: accepted but not used in this method.
    """
    width, height = screen_size
    self.screen_width = width
    self.screen_height = height

    self.mood_q = None
    self.last_q = 0
    self.n_parameter_updates = 0
    self.alpha = 0.00025
    # update frequency ?
    # gradient momentum ? 0.95
    # squared gradient momentum ? 0.95
    # min squared gradient ? 0.01
    self.save_every_n_frames = 100000  # ~ once per hour
    self.final_exploration_frame = 1000000
    self.replay_start_size = 50000
    self.i_action = 0
    self.state = None
    self.initial_epsilon = 1
    self.final_epsilon = 0.1
    self.epsilon = self.initial_epsilon
    self.gamma = 0.99
    self.replay_memory = replay_memory
    self.log_frequency = 1
    self.minibatch_size = 32
    # self.replay_memory_size = 1000000
    self.target_network_update_frequency = 10000

    # Symbolic inputs of one transition minibatch.
    state_now = T.tensor4("s0", dtype=theano.config.floatX)
    action_now = T.bmatrix("a0")
    reward_now = T.wcol("r0")
    state_next = T.tensor4("s1", dtype=theano.config.floatX)
    has_future_reward = T.bcol("future_reward_indicator")

    self.n_actions = n_actions
    self.a_lookup = np.eye(self.n_actions, dtype=np.int8)

    # Both networks receive the screen cast to float32 and divided by 256.
    divisor = np.float32(256)
    net_screen_size = (self.screen_height, self.screen_width)

    self.network = build_network(
        n_actions=self.n_actions,
        input_var=T.cast(state_now, 'float32') / divisor,
        screen_size=net_screen_size)
    print("Compiling forward.")
    live_q_deterministic = lasagne.layers.get_output(self.network,
                                                     deterministic=True)
    self.forward = theano.function([state_now], live_q_deterministic)

    self.network_stale = build_network(
        n_actions=self.n_actions,
        input_var=T.cast(state_next, 'float32') / divisor,
        screen_size=net_screen_size)
    print("Compiling forward_stale.")
    stale_q_deterministic = lasagne.layers.get_output(self.network_stale,
                                                      deterministic=True)
    self.forward_stale = theano.function([state_next], stale_q_deterministic)

    self._update_network_stale()

    live_out = lasagne.layers.get_output(self.network)
    stale_out = lasagne.layers.get_output(self.network_stale)
    loss_parts = build_loss(
        out=live_out,
        out_stale=stale_out,
        a0_var=action_now,
        r0_var=reward_now,
        future_reward_indicator_var=has_future_reward,
        gamma=self.gamma)
    self.loss, self.err, target_y, predicted_q = loss_parts

    trainable = lasagne.layers.get_all_params(self.network, trainable=True)

    print("Compiling train_fn.")
    self.train_fn = theano.function(
        [state_now, action_now, reward_now, state_next, has_future_reward],
        [self.loss, self.err, T.transpose(target_y),
         T.transpose(predicted_q), live_out, stale_out],
        updates=updates(self.loss, trainable))

    print("Compiling loss_fn.")
    self.loss_fn = theano.function(
        [state_now, action_now, reward_now, state_next, has_future_reward],
        self.loss)
def execute(dataset, n_hidden_u, n_hidden_t_enc, n_hidden_t_dec, n_hidden_s,
            embedding_source=histo_GenotypicFrequency_perclass,
            additional_unsup_input=None, num_epochs=500, learning_rate=.001,
            learning_rate_annealing=1.0, alpha=1, beta=1, delta=1, gamma=1,
            lmd=.0001, disc_nonlinearity="sigmoid", encoder_net_init=0.2,
            decoder_net_init=0.2, optimizer="rmsprop", max_patience=100,
            batchnorm=0, input_dropout=1.0, embedding_noise=0.0,
            keep_labels=1.0, prec_recall_cutoff=True, missing_labels_val=-1.0,
            which_fold=0, early_stop_criterion='loss_sup_det',
            input_decoder_mode="regression",
            save_path='/Users/Marie-Elyse/Downloads/embedding2',
            save_copy='/Users/Marie-Elyse/Downloads/embedding2',
            dataset_path='/Users/Marie-Elyse/Downloads/embedding2',
            resume=False, exp_name='', random_proj=0,
            bootstrap_snp_embeddings=0, bootstrap_cutoff=0.9):
    """Train, evaluate and analyse the model on one fold of ``dataset``.

    The function loads the data, builds the feature-embedding,
    reconstruction and supervised networks, trains with early stopping,
    reloads the best parameters and then writes predictions, scores,
    gradients and integrated gradients under ``save_path``.

    Selected arguments:
        embedding_source: ``None`` or ``"raw"`` (use raw input), a path to an
            existing file, or an embedding category name resolved to
            ``<dataset_path>/<name>_fold<which_fold>.npy``.
        additional_unsup_input: ``;``-separated ``.npy`` paths stacked in
            front of the unsupervised data.
        alpha, beta, gamma, delta: weights of the two feature-embedding
            reconstruction losses, the input reconstruction loss and the
            supervised loss. ``beta`` is forced to 0 when ``gamma`` is 0.
        lmd: weight of the L2 penalty.
        optimizer: one of ``"rmsprop"``, ``"adam"``, ``"amsgrad"``.
        early_stop_criterion: a monitored label. Only
            ``'input_reconst_acc'``, ``'accuracy'`` and ``'loss. sup.'`` can
            ever register an improvement; an unknown label raises
            ``ValueError`` after the first epoch.
        input_decoder_mode: ``"regression"`` or ``"classification"``.
        resume: load ``dietnet_best.npz`` from ``save_copy`` before training.
        bootstrap_cutoff: a float threshold, or ``"soft"`` for soft labels.

    NOTE(review): the default for ``embedding_source`` is a bare name that
    must exist at module level when the function is defined - confirm.

    Raises:
        ValueError: if ``early_stop_criterion`` is not a monitored label.
    """
    # Prepare embedding information :
    # - If no embedding is specified, use the transposed input matrix
    # - If a file is specified, use its content as feature embeddings
    # - Else (an embedding category like 'histo3x26' is provided), load a
    #   pregenerated embedding of the specified category
    if embedding_source is None or embedding_source == "raw":
        embedding_source = None
        embedding_input = 'raw'
    elif os.path.exists(embedding_source):
        embedding_input = embedding_source
    else:
        embedding_input = embedding_source
        embedding_source = os.path.join(
            dataset_path,
            embedding_input + '_fold' + str(which_fold) + '.npy')

    # Load the dataset
    print("Loading data")
    (x_train, y_train, exmpl_ids_train,
     x_valid, y_valid, exmpl_ids_valid,
     x_test, y_test, exmpl_ids_test,
     x_unsup, training_labels, feature_names,
     label_names) = mlh.load_data(dataset, dataset_path, embedding_source,
                                  which_fold=which_fold,
                                  keep_labels=keep_labels,
                                  missing_labels_val=missing_labels_val,
                                  embedding_input=embedding_input,
                                  norm=False)

    # Load the additional unsupervised data, if some is specified
    if additional_unsup_input is not None:
        print("Adding additional data to the model's unsupervised inputs")
        paths = additional_unsup_input.split(";")
        additional_unsup_data = [np.load(p) for p in paths]
        print(x_unsup.shape)
        x_unsup = np.hstack(additional_unsup_data + [x_unsup])
        print(x_unsup.shape)

    if x_unsup is not None:
        n_samples_unsup = x_unsup.shape[1]
    else:
        n_samples_unsup = 0

    original_x_train = x_train.copy()
    original_x_valid = x_valid.copy()
    original_x_test = x_test.copy()

    # Change how the missing data values are encoded. Right now they are
    # encoded as being the mean of the corresponding feature so that, after
    # feature normalization, they will be 0s. However, this prevents us from
    # transferring the minibatch data as int8 so we replace those values
    # with -1s.
    for i in range(x_train.shape[1]):
        feature_mean = x_train[:, i].mean()
        x_train[:, i] = mh.replace_arr_value(x_train[:, i], feature_mean, -1)
        x_valid[:, i] = mh.replace_arr_value(x_valid[:, i], feature_mean, -1)
        x_test[:, i] = mh.replace_arr_value(x_test[:, i], feature_mean, -1)
    x_train = x_train.astype("int8")
    x_valid = x_valid.astype("int8")
    x_test = x_test.astype("int8")

    # Normalize the input data. The mlh.load_data() function already offers
    # this feature but we need to do it here so that we will have access to
    # both the normalized and unnormalized input data.
    norm_mus = original_x_train.mean(axis=0)
    norm_sigmas = original_x_train.std(axis=0) + 1e-6

    # Setup variables to build the right type of decoder based on the value
    # of `input_decoder_mode`
    assert input_decoder_mode in ["regression", "classification"]
    if input_decoder_mode == "regression":
        # The size of the input reconstruction will be the same as the number
        # of inputs
        decoder_encoder_unit_ratio = 1
    elif input_decoder_mode == "classification":
        # The size of the input reconstruction will be N times larger than
        # the number of inputs, where N is the number of distinct discrete
        # values that each input can take. For SNP input data with an
        # additive coding scheme, N=3 because the 3 possible values are :
        # {0, 1, 2}.
        nb_discrete_vals_by_input = int(original_x_train.max() + 1)
        decoder_encoder_unit_ratio = nb_discrete_vals_by_input

        # Print baseline accuracy for the imputation of genes.
        # The "valid" line reads the validation split; it previously reported
        # the training distribution under the "valid" label.
        # NOTE(review): these prints are assumed to belong to the
        # classification branch; the original nesting was lost - confirm.
        print("Distribution of input values in valid: %f %f %f" %
              ((original_x_valid == 0).mean(),
               (original_x_valid == 1).mean(),
               (original_x_valid == 2).mean()))
        print("Distribution of input values in test: %f %f %f" %
              ((original_x_test == 0).mean(),
               (original_x_test == 1).mean(),
               (original_x_test == 2).mean()))

    # Extract required information from data
    n_samples, n_feats = x_train.shape
    print("Number of features : ", n_feats)
    print("Glorot init : ", 2.0 / (n_feats + n_hidden_t_enc[-1]))
    n_targets = y_train.shape[1] if y_train.ndim == 2 else y_train.max() + 1

    # Set some variables
    batch_size = 138
    beta = gamma if (gamma == 0) else beta

    # Generate a name for the experiment based on the hyperparameters used
    # NOTE(review): the embedding prefix is assumed to be added only when an
    # embedding file is used; the original nesting was lost - confirm.
    if embedding_source is None:
        embedding_name = embedding_input
    else:
        embedding_name = embedding_source.replace("_", "").split(".")[0]
        exp_name += embedding_name.rsplit('/', 1)[::-1][0] + '_'

    exp_name += mlh.define_exp_name(
        keep_labels, alpha, beta, gamma, lmd, n_hidden_u, n_hidden_t_enc,
        n_hidden_t_dec, n_hidden_s, which_fold, learning_rate,
        decoder_net_init, encoder_net_init, batchnorm, input_dropout,
        embedding_noise, early_stop_criterion, learning_rate_annealing,
        input_decoder_mode)
    print("Experiment: " + exp_name)

    # Ensure that the folders where the results of the experiment will be
    # saved do exist. Create them if they don't.
    save_path = os.path.join(save_path, dataset, exp_name)
    save_copy = os.path.join(save_copy, dataset, exp_name)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if not os.path.exists(save_copy):
        os.makedirs(save_copy)

    # Prepare Theano variables for inputs and targets
    input_var_sup = T.bmatrix('input_sup')
    input_var_unsup = theano.shared(x_unsup, 'input_unsup')  # x_unsup TBD
    target_var_sup = T.matrix('target_sup')
    lr = theano.shared(np.float32(learning_rate), 'learning_rate')

    # Use the provided mus and sigmas to process the missing values and
    # normalize the inputs: -1 entries become the feature mean, which maps
    # to 0 after normalization.
    b_input_var_sup = input_var_sup.astype("float32")
    normed_input_sup = (T.eq(b_input_var_sup, -1) * norm_mus +
                        T.neq(b_input_var_sup, -1) * b_input_var_sup)
    normed_input_sup = (normed_input_sup - norm_mus) / norm_sigmas

    reconst_target_sup = T.cast(input_var_sup, "int32")

    # Build model
    print("Building model")

    # Some checkings
    # assert len(n_hidden_u) > 0
    assert len(n_hidden_t_enc) > 0
    assert len(n_hidden_t_dec) > 0
    assert n_hidden_t_dec[-1] == n_hidden_t_enc[-1]

    # Build feature embedding networks (encoding and decoding if gamma > 0)
    nets, embeddings, pred_feat_emb = mh.build_feat_emb_nets(
        embedding_source, n_feats, n_samples_unsup, input_var_unsup,
        n_hidden_u, n_hidden_t_enc, n_hidden_t_dec, gamma, encoder_net_init,
        decoder_net_init, save_path, random_proj, decoder_encoder_unit_ratio,
        embedding_noise)

    # Build feature embedding reconstruction networks (if alpha > 0,
    # beta > 0)
    nets += mh.build_feat_emb_reconst_nets(
        [alpha, beta], n_samples_unsup, n_hidden_u,
        [n_hidden_t_enc, n_hidden_t_dec], nets,
        [encoder_net_init, decoder_net_init])

    # Supervised network
    discrim_net, hidden_rep = mh.build_discrim_net(
        batch_size, n_feats, normed_input_sup, n_hidden_t_enc, n_hidden_s,
        embeddings[0], disc_nonlinearity, n_targets, batchnorm, input_dropout)

    # Reconstruct network
    nets += [
        mh.build_reconst_net(hidden_rep,
                             embeddings[1] if len(embeddings) > 1 else None,
                             n_feats * decoder_encoder_unit_ratio, gamma,
                             decoder_encoder_unit_ratio)
    ]

    # `nets` is final from here on. Entries that are falsy (e.g. None) are
    # dropped; the comprehension matches Python 2's filter(None, nets) and
    # also yields a list on Python 3.
    active_nets = [net for net in nets if net]
    saved_nets = active_nets + [discrim_net]

    def _restore_params(npz_path):
        """Load parameter values from ``npz_path`` into the networks."""
        with np.load(npz_path) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        net_params = lasagne.layers.get_all_params(saved_nets)
        nlayers = len(net_params)
        for p, v in zip(net_params, param_values[:nlayers]):
            # Do not overwrite embedding value with old embedding. Removing
            # the following condition will prevent a trained model from being
            # tested on a different dataset
            if p.name != "feat_emb":
                p.set_value(v)

    # Load weights if we are resuming job
    if resume:
        # Load best model
        _restore_params(os.path.join(save_copy, 'dietnet_best.npz'))

    print("Building and compiling training functions")

    # Build and compile training functions
    predictions, predictions_det = mh.define_predictions(nets, start=2)
    prediction_sup, prediction_sup_det = mh.define_predictions([discrim_net])
    prediction_sup = prediction_sup[0]
    prediction_sup_det = prediction_sup_det[0]

    # Define losses
    # reconstruction losses
    if input_decoder_mode == "regression":
        reconst_losses, reconst_losses_det = mh.define_reconst_losses(
            predictions, predictions_det,
            [input_var_unsup, input_var_unsup, normed_input_sup])
    elif input_decoder_mode == "classification":
        # Obtain regular reconstruction losses for every reconstruction
        # but the reconstruction of the supervised input data
        reconst_losses1, reconst_losses_det1 = mh.define_reconst_losses(
            predictions[:-1], predictions_det[:-1],
            [input_var_unsup, input_var_unsup])

        # Obtain a "classification" reconstruction loss for the
        # reconstruction of the supervised input data. This classification
        # loss will be performed on the input data without normalization
        reconst_losses2, reconst_losses_det2 = \
            mh.define_classif_reconst_losses(
                predictions[-1:], predictions_det[-1:],
                [reconst_target_sup], [decoder_encoder_unit_ratio])

        reconst_losses = reconst_losses1 + reconst_losses2
        reconst_losses_det = reconst_losses_det1 + reconst_losses_det2

    # supervised loss
    sup_loss, sup_loss_det = mh.define_sup_loss(
        disc_nonlinearity, prediction_sup, prediction_sup_det, keep_labels,
        target_var_sup, missing_labels_val)

    # Define inputs
    inputs = [input_var_sup, target_var_sup]

    # Define parameters
    params = lasagne.layers.get_all_params([discrim_net] + active_nets,
                                           trainable=True,
                                           unwrap_shared=False)
    params_to_freeze = lasagne.layers.get_all_params(active_nets,
                                                     trainable=False,
                                                     unwrap_shared=False)

    # Remove unshared variables from params and params_to_freeze
    params = [
        p for p in params
        if isinstance(p, theano.compile.sharedvalue.SharedVariable)
    ]
    params_to_freeze = [
        p for p in params_to_freeze
        if isinstance(p, theano.compile.sharedvalue.SharedVariable)
    ]
    print("Params : ", params)

    # Rescale the feature embedding by its L2 norm along axis 0.
    feat_emb_var = next(
        p for p in lasagne.layers.get_all_params([discrim_net])
        if p.name == 'input_unsup' or p.name == 'feat_emb')
    print(feat_emb_var)
    feat_emb_val = feat_emb_var.get_value()
    feat_emb_norms = (feat_emb_val**2).sum(0)**0.5
    feat_emb_var.set_value(feat_emb_val / feat_emb_norms)

    print('Number of params discrim: ' + str(len(params)))
    print('Number of params to freeze: ' + str(len(params_to_freeze)))

    for p in params_to_freeze:
        new_params = [el for el in params if el != p]
        params = new_params

    print('Number of params to update: ' + str(len(params)))

    # Combine losses
    loss = delta*sup_loss + alpha*reconst_losses[0] + \
        beta*reconst_losses[1] + gamma*reconst_losses[2]
    loss_det = delta*sup_loss_det + alpha*reconst_losses_det[0] + \
        beta*reconst_losses_det[1] + gamma*reconst_losses_det[2]

    l2_penalty = apply_penalty(params, l2)
    loss = loss + lmd * l2_penalty
    loss_det = loss_det + lmd * l2_penalty

    # Compute network updates
    assert optimizer in ["rmsprop", "adam", "amsgrad"]
    if optimizer == "rmsprop":
        updates = lasagne.updates.rmsprop(loss, params, learning_rate=lr)
    elif optimizer == "adam":
        updates = lasagne.updates.adam(loss, params, learning_rate=lr)
    elif optimizer == "amsgrad":
        updates = lasagne.updates.amsgrad(loss, params, learning_rate=lr)

    # Apply norm constraints on the weights (2-D updates only). Iterate over
    # a snapshot of the keys because values are replaced inside the loop.
    for k in list(updates.keys()):
        if updates[k].ndim == 2:
            updates[k] = lasagne.updates.norm_constraint(updates[k], 1.0)

    # Compile training function
    train_fn = theano.function(inputs, loss, updates=updates,
                               on_unused_input='ignore')

    # Monitoring Labels: a reconstruction label is kept only if its loss is
    # not the constant 0.
    monitor_labels = [
        "reconst. feat. W_enc", "reconst. feat. W_dec", "reconst. loss"
    ]
    monitor_labels = [
        i for i, j in zip(monitor_labels, reconst_losses) if j != 0
    ]
    monitor_labels += ["feat. W_enc. mean", "feat. W_enc var"]
    monitor_labels += ["feat. W_dec. mean", "feat. W_dec var"] if \
        (embeddings[1] is not None) else []
    monitor_labels += ["loss. sup.", "total loss"]

    # Build and compile test function
    val_outputs = reconst_losses_det
    val_outputs = [i for i, j in zip(val_outputs, reconst_losses) if j != 0]
    val_outputs += [embeddings[0].mean(), embeddings[0].var()]
    val_outputs += [embeddings[1].mean(), embeddings[1].var()] if \
        (embeddings[1] is not None) else []
    val_outputs += [sup_loss_det, loss_det]

    # Compute supervised accuracy and add it to monitoring list
    test_acc, test_pred = mh.define_test_functions(
        disc_nonlinearity, prediction_sup, prediction_sup_det, target_var_sup)
    monitor_labels.append("accuracy")
    val_outputs.append(test_acc)

    # If appropriate, compute the input reconstruction accuracy and add it to
    # the monitoring list
    if input_decoder_mode == "classification":
        input_reconst_acc = mh.define_classif_reconst_acc(
            predictions_det[-1], reconst_target_sup,
            decoder_encoder_unit_ratio)
        monitor_labels.append("input_reconst_acc")
        val_outputs.append(input_reconst_acc)

    # Compile prediction functions
    predict = theano.function([input_var_sup], test_pred)
    predict_from_normed_inps = theano.function([normed_input_sup], test_pred)

    predict_scores = theano.function([input_var_sup], prediction_sup_det)
    # Takes the normalized inputs, as its name says; it was previously
    # compiled with the raw int8 input and so duplicated `predict_scores`.
    predict_scores_from_normed_inps = theano.function([normed_input_sup],
                                                      prediction_sup_det)

    # Compile validation function
    val_fn = theano.function(inputs, [prediction_sup_det] + val_outputs,
                             on_unused_input='ignore')

    # Finally, launch the training loop.
    print("Starting training...")

    # Some variables
    patience = 0
    train_monitored = []
    valid_monitored = []
    train_loss = []

    # Pre-training monitoring
    print("Epoch 0 of {}".format(num_epochs))

    train_minibatches = mlh.iterate_minibatches(x_train, y_train, batch_size,
                                                shuffle=False)
    mlh.monitoring(train_minibatches, "train", val_fn, monitor_labels,
                   prec_recall_cutoff)

    valid_minibatches = mlh.iterate_minibatches(x_valid, y_valid, batch_size,
                                                shuffle=False)
    mlh.monitoring(valid_minibatches, "valid", val_fn, monitor_labels,
                   prec_recall_cutoff)

    # Before starting training, save a copy of the model in case
    np.savez(os.path.join(save_path, 'dietnet_best.npz'),
             *lasagne.layers.get_all_param_values(saved_nets))

    sup_loss_idx = monitor_labels.index("loss. sup.")

    # Training loop
    start_training = time.time()
    for epoch in range(num_epochs):
        start_time = time.time()
        print("Epoch {} of {}".format(epoch + 1, num_epochs))
        nb_minibatches = 0
        loss_epoch = 0

        # Train pass
        for batch in mlh.iterate_minibatches(x_train, training_labels,
                                             batch_size, shuffle=True):
            loss_epoch += train_fn(*batch)
            nb_minibatches += 1

        loss_epoch /= nb_minibatches
        train_loss += [loss_epoch]

        # Monitoring on the training set
        train_minibatches = mlh.iterate_minibatches(
            x_train, y_train, batch_size, shuffle=False)
        train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                   monitor_labels, prec_recall_cutoff)
        train_monitored += [train_err]

        # Monitoring on the validation set
        valid_minibatches = mlh.iterate_minibatches(
            x_valid, y_valid, batch_size, shuffle=False)
        valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                   monitor_labels, prec_recall_cutoff)
        valid_monitored += [valid_err]

        # Only an unknown label is translated into the ValueError below; any
        # other failure propagates unchanged instead of being hidden by a
        # bare except.
        try:
            early_stop_idx = monitor_labels.index(early_stop_criterion)
        except ValueError:
            raise ValueError("There is no monitored value by the name of %s" %
                             early_stop_criterion)
        early_stop_val = valid_err[early_stop_idx]

        valid_loss_sup_hist = [v[sup_loss_idx] for v in valid_monitored]
        valid_loss_sup = valid_loss_sup_hist[-1]

        # Early stopping
        if epoch == 0:
            best_valid = early_stop_val
        elif ((early_stop_val > best_valid
               and early_stop_criterion == 'input_reconst_acc')
              or (early_stop_val > best_valid
                  and early_stop_criterion == 'accuracy')
              or (early_stop_val >= best_valid
                  and early_stop_criterion == 'accuracy'
                  and valid_loss_sup == min(valid_loss_sup_hist))
              or (early_stop_val < best_valid
                  and early_stop_criterion == 'loss. sup.')):
            best_valid = early_stop_val
            patience = 0

            # Save stuff
            np.savez(os.path.join(save_path, 'dietnet_best.npz'),
                     *lasagne.layers.get_all_param_values(saved_nets))
            np.savez(save_path + "/errors_supervised_best.npz",
                     list(zip(*train_monitored)),
                     list(zip(*valid_monitored)))

            # Monitor on the test set now because sometimes the saving
            # doesn't go well and there isn't a model to load at the end of
            # training
            if y_test is not None:
                test_minibatches = mlh.iterate_minibatches(
                    x_test, y_test, batch_size, shuffle=False)
                mlh.monitoring(test_minibatches, "test", val_fn,
                               monitor_labels, prec_recall_cutoff)
        else:
            patience += 1
            # Save stuff
            np.savez(os.path.join(save_path, 'dietnet_last.npz'),
                     *lasagne.layers.get_all_param_values(saved_nets))
            np.savez(save_path + "/errors_supervised_last.npz",
                     list(zip(*train_monitored)),
                     list(zip(*valid_monitored)))

        print(" epoch time:\t\t\t{:.3f}s \n".format(time.time() - start_time))

        # End training if needed
        if patience == max_patience or epoch == num_epochs - 1:
            break

        # Anneal the learning rate
        lr.set_value(
            np.array(lr.get_value() * learning_rate_annealing,
                     dtype="float32"))

    # End training with a final monitoring step on the best model
    print("Ending training")

    # Load best model
    _restore_params(os.path.join(save_path, 'dietnet_best.npz'))

    if embedding_source is None:
        # Save embedding
        pred = pred_feat_emb()
        np.savez(os.path.join(save_path, 'feature_embedding.npz'), pred)

    # Training set results
    train_minibatches = mlh.iterate_minibatches(x_train, y_train, batch_size,
                                                shuffle=False)
    mlh.monitoring(train_minibatches, "train", val_fn, monitor_labels,
                   prec_recall_cutoff)

    # Validation set results
    valid_minibatches = mlh.iterate_minibatches(x_valid, y_valid, batch_size,
                                                shuffle=False)
    mlh.monitoring(valid_minibatches, "valid", val_fn, monitor_labels,
                   prec_recall_cutoff)

    # Test set results
    # NOTE(review): both test evaluations are assumed to be guarded by the
    # y_test check; the original nesting was lost - confirm.
    if y_test is not None:
        test_minibatches = mlh.iterate_minibatches(x_test, y_test, batch_size,
                                                   shuffle=False)
        mlh.monitoring(test_minibatches, "test", val_fn, monitor_labels,
                       prec_recall_cutoff)

        # Test the model's accuracy with varying levels of provided SNPs
        test_minibatches = mlh.iterate_minibatches(x_test, y_test, batch_size,
                                                   shuffle=False)
        mlh.eval_prediction(test_minibatches, "test (rescaled)",
                            predict_from_normed_inps, norm_mus, norm_sigmas,
                            nb_evals=1, rescale_inputs=True)

    # Save the model's test predictions to file (one sample per minibatch)
    print(x_test.shape)
    test_predictions = []
    for minibatch in mlh.iterate_testbatches(x_test, 1, shuffle=False):
        test_predictions += [predict(minibatch)]
    print(len(test_predictions))
    print(sum([t.shape[0] for t in test_predictions]))
    np.savez(os.path.join(save_path, 'test_predictions.npz'),
             test_predictions)

    # Get the scores assigned by the model to each class for each test sample
    test_scores = []
    for minibatch in mlh.iterate_testbatches(x_test, 1, shuffle=False):
        test_scores += [predict_scores(minibatch)]
    np.savez(os.path.join(save_path, 'test_scores.npz'), test_scores)

    # Generate new SNP embeddings using test examples labeled according
    # to the model's predictions
    if bootstrap_snp_embeddings:
        if bootstrap_cutoff == "soft":
            bootstrap_snp_data = np.hstack(
                (x_train.transpose(), x_valid.transpose(),
                 x_test.transpose()))
            bootstrap_labels = np.vstack(
                (y_train, y_valid, np.array(test_scores)[:, 0, :]))

            filename_genotypic = 'bootstrap_gen_snp_embeddings_softlabels.npy'
            filename_allelic = 'bootstrap_all_snp_embeddings_softlabels.npy'
        else:
            # Hard cutoff: keep test samples with at least one class score
            # above the cutoff and label them with their argmax class.
            sure_test_idxs = np.argwhere(
                (np.array(test_scores)[:, 0, :] >
                 bootstrap_cutoff).sum(1)).flatten()
            sure_test_inputs = x_test[sure_test_idxs]
            sure_test_preds = np.array(test_scores)[sure_test_idxs,
                                                    0].argmax(1)

            bootstrap_snp_data = np.hstack(
                (x_train.transpose(), x_valid.transpose(),
                 sure_test_inputs.transpose()))
            bootstrap_labels = np.hstack(
                (y_train.argmax(1), y_valid.argmax(1), sure_test_preds))

            filename_genotypic = \
                'bootstrap_gen_snp_embeddings_cutoff%f.npy' % bootstrap_cutoff
            filename_allelic = \
                'bootstrap_all_snp_embeddings_cutoff%f.npy' % bootstrap_cutoff

        utils_helpers.generate_snp_hist(
            bootstrap_snp_data, bootstrap_labels, label_names=label_names,
            perclass=True, sum_to_one=True,
            filename_genotypic=os.path.join(save_path, filename_genotypic),
            filename_allelic=os.path.join(save_path, filename_allelic))

    # Print all final errors for train, validation and test
    print("Training time:\t\t\t{:.3f}s".format(time.time() - start_training))

    # Analyse the model gradients to determine the influence of each SNP on
    # each of the model's prediction
    print(label_names)
    class_idx = T.iscalar("class index")
    grad_fn = theano.function(
        [input_var_sup, class_idx],
        T.grad(prediction_sup_det[:, class_idx].mean(),
               input_var_sup).mean(0))
    grads_wrt_inputs = mlh.get_grads_wrt_inputs(x_test, grad_fn,
                                                feature_names, label_names)

    # Obtain function that takes as inputs normed inputs and returns the
    # gradient of a class score wrt the normed inputs themselves (this is
    # required because computing the integrated gradients requires to be able
    # to interpolate between an example where all features are missing and an
    # example where any number of features are provided)
    grad_from_normed_fn = theano.function(
        [normed_input_sup, class_idx],
        T.grad(prediction_sup_det[:, class_idx].sum(),
               normed_input_sup).mean(0))

    # Collect integrated gradients over the whole test set. Obtain, for each
    # SNP, for each possible value (0, 1 or 2), the average contribution of
    # that value for that SNP to the score of each class.
    avg_int_grads = np.zeros((x_test.shape[1], 3, len(label_names)),
                             dtype="float32")
    counts_int_grads = np.zeros((x_test.shape[1], 3), dtype="int32")
    for test_idx in range(x_test.shape[0]):
        int_grads = mlh.get_integrated_gradients(
            x_test[test_idx], grad_from_normed_fn, feature_names,
            label_names, norm_mus, norm_sigmas, m=100)

        # Mask marking, for every SNP, which of the values 0/1/2 it takes.
        snp_value_mask = np.arange(3) == x_test[test_idx][:, None]
        avg_int_grads += (snp_value_mask[:, :, None] *
                          int_grads.transpose()[:, None, :])
        counts_int_grads += snp_value_mask
    avg_int_grads = avg_int_grads / counts_int_grads[:, :, None]

    # Save all the additional information required for model analysis :
    # - Test predictions
    # - SNP IDs
    # - Subject IDs
    # - Normalization parameters for the input minibatches
    np.savez(os.path.join(save_path, 'additional_data.npz'),
             test_labels=y_test,
             test_scores=np.array(test_scores)[:, 0],
             test_predictions=np.array(test_predictions)[:, 0],
             norm_mus=norm_mus,
             norm_sigmas=norm_sigmas,
             grads_wrt_inputs=grads_wrt_inputs,
             exmpl_ids_train=exmpl_ids_train,
             exmpl_ids_valid=exmpl_ids_valid,
             exmpl_ids_test=exmpl_ids_test,
             feature_names=feature_names,
             label_names=label_names,
             avg_int_grads=avg_int_grads)

    # Copy files to loadpath (only if some training has been done so there
    # is a local saved version)
    if save_path != save_copy and num_epochs > 0:
        print('Copying model and other training files to {}'.format(save_copy))
        copy_tree(save_path, save_copy)
def model_setup(self, mfile=None, num_units1=128, num_units2=128,
                lrate=2e-3, drate=0.95, eps=1e-8, bptt_maxdepth=50,
                l1=0, l2=0, char_dim=None):
    """Initialize the 2-layer LSTM model for learning or for the
    generation of sequences.

    Builds the symbolic graph and compiles the Theano functions. The
    following attributes are set on ``self``:

    * ``p`` -- dict of shared variables holding the model parameters
      (``U1, W1, b1, U2, W2, b2, V, c``);
    * ``lp`` -- dict of learning hyper-parameters;
    * ``dyn_lrate`` -- shared scalar learning rate (can be changed
      during the learning process);
    * ``theano_predict`` -- ``x -> [o, argmax(o)]``;
    * ``theano_check`` -- ``(x, y) -> [cost, prediction]`` without any
      parameter update (only when ``mfile`` is None);
    * ``theano_train_rmsprop`` / ``theano_train_sgd`` --
      ``(x, y) -> [cost, prediction, 8 update/parameter norm ratios]``
      with the corresponding parameter updates (only when ``mfile`` is
      None).

    The method reads ``self.batch_size`` and, when neither ``mfile`` nor
    ``char_dim`` is given, ``self.uchar``.

    Parameters
    ----------
    mfile : optional
        Passed to ``self.load_params``. When given, the parameters come
        from that file, the layer sizes are taken from the shapes of
        ``b1`` and ``b2``, and the method returns right after compiling
        ``theano_predict`` (no training functions are built).
    num_units1, num_units2 : int
        Number of units of the first and second LSTM layers (ignored
        when ``mfile`` is given).
    lrate : float
        Learning rate.
    drate : float
        Decay rate for rmsprop.
    eps : float
        Epsilon parameter for rmsprop.
    bptt_maxdepth : int
        Backpropagation cutoff (``truncate_gradient`` of ``theano.scan``).
    l1, l2 : float
        L1 / L2 regularization parameters; a term is added to the cost
        only when the corresponding value is > 0.
    char_dim : int, optional
        Input/output dimension. Defaults to ``len(self.uchar)``.

    Raises
    ------
    Exception
        If ``mfile`` and ``char_dim`` are both None and ``self.uchar`` is
        empty, i.e. ``prepare_input()`` has not been run.
    """
    # the default parameters are identical to Andrej Karpathy's
    # (see https://github.com/karpathy/char-rnn)

    # 2-layer LSTM parameters
    self.p = {'U1': None, 'W1': None, 'b1': None,
              'U2': None, 'W2': None, 'b2': None,
              'V': None, 'c': None}

    # learning parameters
    self.lp = {'lrate': lrate,  # learning rate
               'drate': drate,  # decay rate for rmsprop
               'eps': eps,  # epsilon parameter for rmsprop
               'bptt_maxdepth': bptt_maxdepth,  # backpropagation cutoff
               'l1': l1,  # L1 regularization parameter
               'l2': l2  # L2 regularization parameter
               }

    # Small initialization helpers. They are defined before the
    # mfile / fresh-init branch so that both paths can use them.
    def uniform(rng, shape):
        return np.random.uniform(-rng, rng,
                                 shape).astype(theano.config.floatX)

    def bias_hack(num_units):
        b = np.zeros((4, num_units))
        b[0] = 1.  # forget gate hack
        # helps the network remember information
        return b.astype(theano.config.floatX)

    def zeros(shape):
        return np.zeros(shape).astype(theano.config.floatX)

    if mfile is not None:
        # loading parameters from an npz file
        np_init = self.load_params(mfile)
        num_units1 = np_init['b1'].shape[1]
        num_units2 = np_init['b2'].shape[1]
    else:
        if char_dim is None:
            if self.uchar:
                char_dim = len(self.uchar)
            else:
                raise Exception('prepare_input() should be run before ' +
                                'model_setup() unless mfile is provided')

        # initialize small random weights
        r_char_dim = np.sqrt(1./(char_dim))
        r_units1 = np.sqrt(1./(num_units1))
        r_units2 = np.sqrt(1./(num_units2))

        # parameters for the gates
        # [0]: forget
        # [1]: input
        # [2]: output
        # [3]: cell state update
        # NOTE: the order of the uniform() calls below fixes the order in
        # which random numbers are drawn; keep it for reproducibility.
        np_init = {}

        # first layer
        np_init['U1'] = uniform(r_char_dim, (4, num_units1, char_dim))
        np_init['W1'] = uniform(r_units1, (4, num_units1, num_units1))
        np_init['b1'] = bias_hack(num_units1)

        # second layer
        np_init['U2'] = uniform(r_units1, (4, num_units2, num_units1))
        np_init['W2'] = uniform(r_units2, (4, num_units2, num_units2))
        np_init['b2'] = bias_hack(num_units2)

        # parameters for the last layer (cell output -> network output)
        np_init['V'] = uniform(r_units2, (char_dim, num_units2))
        np_init['c'] = zeros(char_dim)

    # dynamical learning rate (in case the user wants to modify it
    # during the learning process)
    if theano.config.floatX == 'float32':
        dyn_lrate_init = np.float32(self.lp['lrate'])
    else:
        dyn_lrate_init = np.float64(self.lp['lrate'])
    self.dyn_lrate = theano.shared(dyn_lrate_init, name='dyn_lrate')

    for param in self.p:
        self.p[param] = theano.shared(np_init[param], name=param)

    # symbolic inputs: x holds integer indices, y the matching targets
    # (one extra trailing axis compared to x)
    if self.batch_size > 1:
        x = T.imatrix('x')
        y = T.btensor3('y')
    else:
        x = T.ivector('x')
        y = T.bmatrix('y')

    def forward_prop(x_t, ht1m1, Ct1m1, ht2m1, Ct2m1,
                     U1, W1, b1, U2, W2, b2, V, c):
        # defines each time step of the RNN model
        if self.batch_size > 1:
            # transform into column vectors
            col_b1 = b1.dimshuffle((0, 1, 'x'))
            col_b2 = b2.dimshuffle((0, 1, 'x'))
            col_c = c.dimshuffle((0, 'x'))
        else:
            col_b1 = b1
            col_b2 = b2
            col_c = c

        # layer 1
        # forget, input and output gates; U1[i][:, x_t] selects the
        # columns of U1[i] indexed by the current input
        gates1 = [T.nnet.sigmoid(U1[i][:, x_t] + W1[i].dot(ht1m1) +
                                 col_b1[i])
                  for i in range(3)]
        tentative_Ct1 = T.tanh(U1[3][:, x_t] + W1[3].dot(ht1m1) +
                               col_b1[3])
        Ct1 = Ct1m1 * gates1[0] + tentative_Ct1 * gates1[1]
        ht1 = gates1[2] * T.tanh(Ct1)

        # layer 2
        # forget, input and output gates
        gates2 = [T.nnet.sigmoid(U2[i].dot(ht1) + W2[i].dot(ht2m1) +
                                 col_b2[i])
                  for i in range(3)]
        tentative_Ct2 = T.tanh(U2[3].dot(ht1) + W2[3].dot(ht2m1) +
                               col_b2[3])
        Ct2 = Ct2m1 * gates2[0] + tentative_Ct2 * gates2[1]
        ht2 = gates2[2] * T.tanh(Ct2)

        # final layer
        o = T.nnet.softmax((V.dot(ht2) + col_c).T)

        return [o, ht1, Ct1, ht2, Ct2]

    # shapes of the initial hidden and cell states
    if self.batch_size > 1:
        ht1_Ct1_size = (num_units1, self.batch_size)
        ht2_Ct2_size = (num_units2, self.batch_size)
    else:
        ht1_Ct1_size = num_units1
        ht2_Ct2_size = num_units2

    # The update dictionary returned by scan is not used by any of the
    # compiled functions below.
    [o, ht1, Ct1, ht2, Ct2], _scan_updates = theano.scan(
        fn=forward_prop,
        sequences=x,
        outputs_info=[None,
                      T.zeros(ht1_Ct1_size), T.zeros(ht1_Ct1_size),
                      T.zeros(ht2_Ct2_size), T.zeros(ht2_Ct2_size)
                      ],
        non_sequences=[self.p['U1'], self.p['W1'], self.p['b1'],
                       self.p['U2'], self.p['W2'], self.p['b2'],
                       self.p['V'], self.p['c']],
        truncate_gradient=self.lp['bptt_maxdepth'],
        strict=True)
    # o is a (seq_len, batch_size, char_dim) tensor---even if batch_size=1

    prediction = T.argmax(o, axis=2)

    self.theano_predict = theano.function(
        inputs=[x],
        outputs=[o, prediction],
    )

    if mfile is not None:
        # not here for learning; we can stop here
        return

    # compute the cross-entropy loss
    xent = (-y*T.log(o)).sum(axis=2)  # (string_len, batch_size) matrix
    cost = T.mean(xent)

    # regularization using L1 and/or L2 norms
    reg_cost = cost
    # cast into theano.config.floatX is a trick to avoid float64 below
    tot_shape = (xent.shape[0] *
                 xent.shape[1]).astype(theano.config.floatX)
    for param in self.p:
        if l1 > 0:  # L1 regularization
            reg_cost += l1 * T.sum(abs(self.p[param])) / tot_shape
        if l2 > 0:  # L2 regularization
            reg_cost += l2 * T.sum(self.p[param] ** 2) / tot_shape

    g = {}
    for param in self.p:
        g[param] = T.grad(reg_cost, self.p[param])

    # parameters for rmsprop (running average of squared gradients);
    # only needed for training, hence allocated after the early return
    msq_g = {}
    for param in self.p:
        msq_g[param] = theano.shared(zeros(np_init[param].shape),
                                     name='msq_g'+param)

    # for rmsprop
    new_msq_g = {}
    steps = {}
    rmsprop_updates = []
    sgd_updates = []
    ratios = {}
    for param in self.p:
        new_msq_g[param] = (self.lp['drate'] * msq_g[param] +
                            (1. - self.lp['drate']) * g[param]**2)
        steps[param] = (self.dyn_lrate * g[param] /
                        (T.sqrt(new_msq_g[param]) + self.lp['eps']))
        # update to parameter scale ratio
        # NOTE(review): 'c' is initialized to zeros, so this ratio
        # divides by a zero norm until 'c' has been updated -- confirm
        # that callers tolerate inf/nan in that output.
        ratios[param] = (T.flatten(steps[param]).norm(2) /
                         T.flatten(self.p[param]).norm(2))

        sgd_updates.append((self.p[param],
                            self.p[param] - self.dyn_lrate * g[param]))
        rmsprop_updates.append((self.p[param],
                                self.p[param] - steps[param]))
        rmsprop_updates.append((msq_g[param], new_msq_g[param]))

    # todo: add possibility to clip gradients to some value

    f_out = [cost, prediction]

    # compute cost and prediction but do not update the weights
    self.theano_check = theano.function(
        inputs=[x, y],
        outputs=f_out,
    )

    # the training functions additionally return the ratios
    f_out.extend([ratios['U1'], ratios['W1'], ratios['b1'],
                  ratios['U2'], ratios['W2'], ratios['b2'],
                  ratios['V'], ratios['c']])

    # mini-batch training with rmsprop
    self.theano_train_rmsprop = theano.function(
        inputs=[x, y],
        outputs=f_out,
        updates=rmsprop_updates
    )

    # mini-batch training with stochastic gradient descent
    self.theano_train_sgd = theano.function(
        inputs=[x, y],
        outputs=f_out,
        updates=sgd_updates
    )