def _get_input_tensor_variables(self):
    # x_w: 1D: batch, 2D: n_prds, 3D: n_words, 4D: 5 + window; elem=word id
    # x_p: 1D: batch, 2D: n_prds, 3D: n_words; elem=posit id
    # y:   1D: batch, 2D: n_prds, 3D: n_words; elem=label id
    if self.argv.mark_phi:
        return [T.itensor4('x_w'), T.itensor3('x_p'), T.itensor3('y')]
    return [T.itensor4('x_w'), T.itensor3('y')]
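# A minimal usage sketch (not part of the original module), assuming only theano and
# numpy: it shows the shapes the itensor4/itensor3 variables above are meant to carry.
# The batch / n_prds / n_words / window sizes are illustrative placeholders.
import numpy as np
import theano
import theano.tensor as T

x_w = T.itensor4('x_w')   # (batch, n_prds, n_words, 5 + window); elem = word id
x_p = T.itensor3('x_p')   # (batch, n_prds, n_words); elem = position id
f_shapes = theano.function([x_w, x_p], [x_w.shape, x_p.shape])

batch, n_prds, n_words, window = 2, 3, 10, 5
dummy_w = np.zeros((batch, n_prds, n_words, 5 + window), dtype='int32')
dummy_p = np.zeros((batch, n_prds, n_words), dtype='int32')
print(f_shapes(dummy_w, dummy_p))   # [(2, 3, 10, 10), (2, 3, 10)]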
def build(word_embeddings, len_voc, word_emb_dim, args, freeze=False):
    # input theano vars
    posts = T.imatrix()
    post_masks = T.fmatrix()
    ques_list = T.itensor4()
    ques_masks_list = T.ftensor4()
    ans_list = T.itensor4()
    ans_masks_list = T.ftensor4()
    labels = T.imatrix()
    N = args.no_of_candidates

    post_out, post_lstm_params = build_lstm(posts, post_masks, args.post_max_len,
                                            word_embeddings, word_emb_dim, args.hidden_dim, len_voc, args.batch_size)
    ques_out, ques_lstm_params = build_list_lstm_multiqa(ques_list, ques_masks_list, N, args.ques_max_len,
                                                         word_embeddings, word_emb_dim, args.hidden_dim, len_voc, args.batch_size)
    ans_out, ans_lstm_params = build_list_lstm_multiqa(ans_list, ans_masks_list, N, args.ans_max_len,
                                                       word_embeddings, word_emb_dim, args.hidden_dim, len_voc, args.batch_size)

    pq_out, pq_a_loss, ques_squared_errors, pq_a_squared_errors, post_ques_dense_params = \
        answer_model(post_out, ques_out, ans_out, labels, args)
    # pa_preds, pa_loss, post_ans_dense_params = utility_calculator(post_out, ans_out, labels, args)
    pa_preds, pa_loss, post_ans_dense_params = utility_calculator(post_out, ques_out, ans_out, labels, args)

    all_params = post_lstm_params + ques_lstm_params + ans_lstm_params + post_ques_dense_params + post_ans_dense_params
    loss = pq_a_loss + pa_loss
    loss += args.rho * sum(T.sum(l ** 2) for l in all_params)

    updates = lasagne.updates.adam(loss, all_params, learning_rate=args.learning_rate)

    train_fn = theano.function([posts, post_masks, ques_list, ques_masks_list, ans_list, ans_masks_list, labels],
                               [loss, pq_a_loss, pa_loss] + pq_out + pq_a_squared_errors + ques_squared_errors + pa_preds,
                               updates=updates)
    test_fn = theano.function([posts, post_masks, ques_list, ques_masks_list, ans_list, ans_masks_list, labels],
                              [loss, pq_a_loss, pa_loss] + pq_out + pq_a_squared_errors + ques_squared_errors + pa_preds)
    return train_fn, test_fn
def build_pqa_model(word_embeddings, len_voc, word_emb_dim, N, args, freeze=False):
    # input theano vars
    posts = T.imatrix()
    post_masks = T.fmatrix()
    ques_list = T.itensor4()
    ques_masks_list = T.ftensor4()
    ans_list = T.itensor4()
    ans_masks_list = T.ftensor4()
    labels = T.imatrix()

    post_out, post_lstm_params = build_lstm_posts(posts, post_masks, args.post_max_len,
                                                  word_embeddings, word_emb_dim, args.hidden_dim, len_voc, args.batch_size)
    ques_out, ques_lstm_params = build_lstm(ques_list, ques_masks_list, N, args.ques_max_len,
                                            word_embeddings, word_emb_dim, args.hidden_dim, len_voc, args.batch_size)
    ans_out, ans_lstm_params = build_lstm(ans_list, ans_masks_list, N, args.ans_max_len,
                                          word_embeddings, word_emb_dim, args.hidden_dim, len_voc, args.batch_size)

    pqa_preds, post_ques_ans_dense_params = get_pqa_preds(post_out, ques_out, ans_out, N, args)

    loss = 0.0
    for i in range(N):
        loss += T.sum(lasagne.objectives.binary_crossentropy(pqa_preds[i * N + i], labels[:, i]))

    # squared_errors = [None] * (N * N)
    # for i in range(N):
    #     for j in range(N):
    #         squared_errors[i * N + j] = lasagne.objectives.squared_error(ans_out[i][0], ans_out[i][j])

    all_params = post_lstm_params + ques_lstm_params + ans_lstm_params + post_ques_ans_dense_params
    loss += args.rho * sum(T.sum(l ** 2) for l in all_params)

    updates = lasagne.updates.adam(loss, all_params, learning_rate=args.learning_rate)

    train_fn = theano.function([posts, post_masks, ques_list, ques_masks_list, ans_list, ans_masks_list, labels],
                               [loss] + pqa_preds, updates=updates)
    dev_fn = theano.function([posts, post_masks, ques_list, ques_masks_list, ans_list, ans_masks_list, labels],
                             [loss] + pqa_preds)
    return train_fn, dev_fn
def make_node(self, x, x2, x3, x4, x5):
    # Check that the theano version has support for __props__.
    # The next line looks like it has a typo, but it's actually a way to detect
    # whether the theano version is recent enough to support the use of __props__.
    assert hasattr(self, '_props'), "Your version of theano is too old to support __props__."
    x = tensor.as_tensor_variable(x)
    x2 = tensor.as_tensor_variable(x2)
    x3 = tensor.as_tensor_variable(x3)
    x4 = tensor.as_tensor_variable(x4)
    x5 = tensor.as_tensor_variable(x5)

    if prm.att_doc:
        if prm.compute_emb:
            td = tensor.itensor4().type()
        else:
            td = tensor.ftensor4().type()
        tm = tensor.ftensor3().type()
    else:
        if prm.compute_emb:
            td = tensor.itensor3().type()
        else:
            td = tensor.ftensor3().type()
        tm = tensor.fmatrix().type()

    return theano.Apply(self, [x, x2, x3, x4, x5],
                        [td, tm, tensor.fmatrix().type(), tensor.ivector().type()])
def create_theano_function(word_embed, char_embed, values=None):
    char_x = T.itensor4('char_x')
    word_x = T.itensor3('word_x')
    word_mask = T.tensor3('word_mask')
    sent_mask = T.matrix('sent_mask')
    doc_linguistic_x = T.matrix('doc_linguistic')
    label_y = T.ivector('label_y')

    att_out, network_output, loss = fn.build_fn(word_x=word_x, char_x=char_x,
                                                word_mask=word_mask, sent_mask=sent_mask,
                                                label_y=label_y, word_embed=word_embed,
                                                char_embed=char_embed, args=args,
                                                doc_ling=doc_linguistic_x)
    if values is not None:
        lasagne.layers.set_all_param_values(network_output, values, trainable=True)

    params = lasagne.layers.get_all_params(network_output, trainable=True)
    if args.optimizer == 'sgd':
        updates = lasagne.updates.sgd(loss, params, args.learning_rate)
    elif args.optimizer == 'momentum':
        updates = lasagne.updates.momentum(loss, params, args.learning_rate)

    train_fn = theano.function([word_x, char_x, word_mask, sent_mask, doc_linguistic_x, label_y],
                               loss, updates=updates)
    prediction = lasagne.layers.get_output(network_output, deterministic=True)
    eval_fn = theano.function([word_x, char_x, word_mask, sent_mask, doc_linguistic_x], prediction)
    fn_check_attention = theano.function([word_x, char_x, word_mask, sent_mask], att_out)
    return fn_check_attention, eval_fn, train_fn, params
def ndim_itensor(ndim, name=None):
    if ndim == 2:
        return T.imatrix(name)
    elif ndim == 3:
        return T.itensor3(name)
    elif ndim == 4:
        return T.itensor4(name)
    return T.imatrix(name=name)
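# Hedged usage sketch for ndim_itensor: it simply picks an int32 tensor variable by
# rank, falling back to imatrix. The variable names below are illustrative only.
labels = ndim_itensor(2, 'labels')   # -> T.imatrix('labels')
tokens = ndim_itensor(3, 'tokens')   # -> T.itensor3('tokens')
chars = ndim_itensor(4, 'chars')     # -> T.itensor4('chars')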
def build_image_only_network(d_word, d_hidden, lr, eps=1e-6):
    # input theano vars
    in_context_fc7 = T.tensor3(name='context_images')
    in_context_bb = T.tensor4(name='context_bb')
    in_bbmask = T.tensor3(name='bounding_box_mask')
    in_context = T.itensor4(name='context')
    in_cmask = T.tensor4(name='context_mask')
    in_answer_fc7 = T.matrix(name='answer_images')
    in_answer_bb = T.matrix(name='answer_bb')
    in_answers = T.itensor3(name='answers')
    in_amask = T.tensor3(name='answer_mask')
    in_labels = T.imatrix(name='labels')

    # define network
    l_context_fc7 = lasagne.layers.InputLayer(shape=(None, 3, 4096), input_var=in_context_fc7)
    l_answers = lasagne.layers.InputLayer(shape=(None, 3, max_words), input_var=in_answers)
    l_amask = lasagne.layers.InputLayer(shape=l_answers.shape, input_var=in_amask)

    # contexts and answers should share embeddings
    l_answer_emb = lasagne.layers.EmbeddingLayer(l_answers, len_voc, d_word)
    l_context_proj = lasagne.layers.DenseLayer(l_context_fc7, num_units=d_hidden,
                                               nonlinearity=lasagne.nonlinearities.rectify,
                                               num_leading_axes=2)
    l_context_final_reps = lasagne.layers.LSTMLayer(l_context_proj, num_units=d_hidden,
                                                    only_return_final=True)
    l_ans_reps = SumAverageLayer([l_answer_emb, l_amask], compute_sum=True, num_dims=3)
    l_scores = InnerProductLayer([l_context_final_reps, l_ans_reps])

    preds = lasagne.layers.get_output(l_scores)
    loss = T.mean(lasagne.objectives.categorical_crossentropy(preds, in_labels))

    all_params = lasagne.layers.get_all_params(l_scores, trainable=True)
    updates = lasagne.updates.adam(loss, all_params, learning_rate=lr)

    train_fn = theano.function([in_context_fc7, in_context_bb, in_bbmask, in_context, in_cmask,
                                in_answer_fc7, in_answer_bb, in_answers, in_amask, in_labels],
                               loss, updates=updates, on_unused_input='warn')
    pred_fn = theano.function([in_context_fc7, in_context_bb, in_bbmask, in_context, in_cmask,
                               in_answer_fc7, in_answer_bb, in_answers, in_amask],
                              preds, on_unused_input='warn')
    return train_fn, pred_fn, l_scores
def train_model_res_V1(results_path, fine_tune=False, batch_size=5, base_lr=0.001, n_epochs=30): ftensor5 = T.TensorType('float32', (False,)*5) x = ftensor5() y = T.itensor4('y') network, params, l2_penalty = build_res_V1(x, batch_size) train_cost = [] if fine_tune is True: # Fine tune the model if this flag is True with np.load(os.path.join(results_path,'params.npz')) as f: param_values = [f['arr_%d' % i] for i in range(len(f.files))] set_all_param_values(network['output'], param_values[0]) print 'initialization done!' prediction = get_output(network['output']) loss_layer = LogisticRegression(prediction) cost_output = loss_layer.negative_log_likelihood(y) lamda=0.0001 cost = cost_output + lamda * l2_penalty updates = lasagne.updates.adadelta(cost, params) train = theano.function([x, y], [cost, cost_output], updates=updates) print 'function graph done!' itr = 0 test_min = np.inf train_cost = [] data_folder = '/DATA/PATH' file_name = results_path + "/log_loss.txt" fw = codecs.open(file_name, "w", "utf-8-sig") for train_x, train_y in load_train_negative(batch_size=batch_size, n_epochs=n_epochs, patchSize=[48,48,16]): print 'train_x shape: {}, positive percentage: {}'.format(train_x.shape, np.mean(train_y)) n_train_batches = train_x.shape[0] / batch_size for minibatch_index in xrange(n_train_batches): train_x_itr = train_x[minibatch_index*batch_size:(minibatch_index+1)*batch_size,:,:,:] train_y_itr = train_y[minibatch_index*batch_size:(minibatch_index+1)*batch_size,:,:,:] train_cost_itr, train_cost_itr_classify = train(train_x_itr, train_y_itr) train_cost.append([train_cost_itr,train_cost_itr_classify]) print 'model: {}, itr: {}, train loss overall: {}, train loss classify: {}'.format('resV1', itr, train_cost_itr, train_cost_itr_classify) print >> fw, 'model: {}, itr: {}, train loss overall: {}, train loss classify: {}'.format('resV1', itr, train_cost_itr, train_cost_itr_classify) itr = itr + 1 if itr % 200 == 0: np.savez(os.path.join(results_path, 'params_'+str(itr)+'.npz'), get_all_param_values(network['output'])) print 'save model done ...' fw.close()
def attention(): # q = T.fmatrix('q') # C = T.ftensor4('C') q = T.imatrix('q') C = T.itensor4('C') d = 2 W1_c = theano.shared(np.random.randint(-3, 3, (d, d))) # W1_c = theano.shared(np.ones((d, d), dtype='int32')) W1_h = theano.shared(np.random.randint(-3, 3, (d, d))) # W1_h = theano.shared(np.ones((d, d), dtype='int32')) w = theano.shared(np.ones((d,), dtype='float32')) W2_r = theano.shared(np.random.randint(-1, 1, (d, d))) W2_h = theano.shared(np.random.randint(-1, 1, (d, d))) # W2_r = theano.shared(np.ones((d, d), dtype='float32')) # W2_h = theano.shared(np.ones((d, d), dtype='float32')) # q_in = np.asarray([[1, 2], [3, 4], [-1, -2], [-3, -4]], dtype='int32') q_in = np.asarray([[1, 2]], dtype='int32') # q_in = np.asarray([[1, 2], [3, 4], [5, 6], [7, 8]], dtype='float32') C_in = np.ones((1, 3, 3, 2), dtype='int32') # C_in = np.ones((4, 3, 3, 2), dtype='int32') # C_in = np.asarray(np.random.randint(-2, 2, (1, 3, 3, 2)), dtype='int32') def forward(h_before, _C, eps=1e-8): # C: n_queries * n_cands * n_words * dim_h # h: n_queries * dim_h # M = T.tanh(T.dot(_C, W1_c) + T.dot(h_before, W1_h).dimshuffle(0, 'x', 'x', 1)) M = T.dot(_C, W1_c) + T.dot(h_before, W1_h).dimshuffle(0, 'x', 'x', 1) # 4 * 3 * 3 * 2 # M = T.dot(h_before, W1_h).dimshuffle(0, 'x', 'x', 1) # batch * len * 1 alpha = T.exp(T.dot(M, w)) # 4 * 3 * 3 # alpha = T.nnet.softmax(T.dot(M, w)) # 4 * 3 * 3 alpha /= T.sum(alpha, axis=2, keepdims=True) + eps # alpha = alpha.reshape((alpha.shape[0], alpha.shape[1], 1)) alpha = alpha.reshape((alpha.shape[0], alpha.shape[1], alpha.shape[2], 1)) # batch * d # r = T.sum(_C * alpha, axis=1) r_in = _C * alpha r = T.sum(r_in, axis=1) # 4 * 3 * 2 # batch * d h_after = T.dot(r, W2_r) + T.dot(h_before, W2_h).dimshuffle((0, 'x', 1)) # 4 * 3 * 2 # return h_after return h_after, r, alpha, M y, a, b, m = forward(q, C) f = theano.function(inputs=[q, C], outputs=[y, a, b, m], on_unused_input='ignore') print f(q_in, C_in)
def main(data_path, model_path, save_path):
    print("Preparing Data...")

    # Load data and dictionary
    X = []
    with io.open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            X.append(line.rstrip('\n'))
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    n_char = len(chardict.keys()) + 1

    # Prepare data for encoding
    batches = Batch(X)

    # Load model
    print("Loading model params...")
    params = load_params('%s/model.npz' % model_path)

    # Build encoder
    print("Building encoder...")

    # Theano variables
    tweet = T.itensor4()
    t_mask = T.ftensor3()

    # Embeddings
    emb_t = char2word2vec(tweet, t_mask, params, n_char)[0]

    # Theano function
    f_enc = theano.function([tweet, t_mask], emb_t)

    # Encode
    print("Encoding data...")
    print("Input data {} samples".format(len(X)))
    features = np.zeros((len(X), SDIM), dtype='float32')
    it = 0
    for x, i in batches:
        if it % 100 == 0:
            print("Minibatch {}".format(it))
        it += 1

        xp, x_mask = prepare_data_c2w2s(x, chardict)
        ff = f_enc(xp, x_mask)
        for ind, idx in enumerate(i):
            features[idx] = ff[ind]

    # Save
    with open(save_path, 'w') as o:
        np.save(o, features)
def __init__(self, name="CIFAR10.pixelCNN", input_dim=3, dims=32, q_levels=256, layers=3, grad_clip=1):
    # self.model = Model(name = model_name)
    self.name = name
    self.grad_clip = grad_clip
    self.is_train = T.scalar()
    self.X = T.tensor4('X')        # shape: (batchsize, channels, height, width)
    self.X_r = T.itensor4('X_r')

    self.X_transformed = self.X_r.dimshuffle(0, 2, 3, 1)
    self.input_layer = WrapperLayer(self.X.dimshuffle(0, 2, 3, 1))  # input reshaped to (batchsize, height, width, 3)
    self.q_levels = q_levels
    self.pixel_CNN = pixelConv(
        self.input_layer,
        input_dim,
        dims,
        Q_LEVELS=q_levels,
        name=self.name + ".pxCNN",
        num_layers=layers,
    )

    self.params = self.pixel_CNN.get_params()
    self.output_probab = Softmax(self.pixel_CNN).output()

    self.cost = T.nnet.categorical_crossentropy(
        self.output_probab.reshape((-1, self.output_probab.shape[self.output_probab.ndim - 1])),
        self.X_r.flatten()
    ).mean()

    self.output_image = sample_from_softmax(self.output_probab)

    grads = T.grad(self.cost, wrt=self.params, disconnected_inputs='warn')
    self.grads = [T.clip(g, floatX(-grad_clip), floatX(grad_clip)) for g in grads]

    # learning_rate = T.scalar('learning_rate')
    self.updates = lasagne.updates.adam(self.grads, self.pixel_CNN.get_params(), learning_rate=1e-3)

    self.train_fn = theano.function([self.X, self.X_r], self.cost, updates=self.updates)
    self.valid_fn = theano.function([self.X, self.X_r], self.cost)
    self.generate_routine = theano.function([self.X], self.output_image)

    self.errors = {'training': [], 'validation': []}
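# Hedged numpy sketch (not project code) of the cost above: the softmax output is
# reshaped to (-1, q_levels) and scored with categorical cross-entropy against the
# flattened integer targets X_r. The values below are toy placeholders.
import numpy as np

def categorical_xent(probs, targets, eps=1e-8):
    # probs: (N, q_levels) rows summing to 1; targets: (N,) int class ids
    return -np.mean(np.log(probs[np.arange(len(targets)), targets] + eps))

q_levels = 256
probs = np.full((12, q_levels), 1.0 / q_levels)   # uniform predictions
targets = np.random.randint(0, q_levels, size=12)
print(categorical_xent(probs, targets))           # ~= log(256) ~= 5.545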
def add_datasets_to_graph(list_of_datasets, list_of_names, graph, strict=True,
                          list_of_test_values=None):
    assert type(graph) is OrderedDict
    datasets_added = []
    for n, (dataset, name) in enumerate(safe_zip(list_of_datasets, list_of_names)):
        if dataset.dtype != "int32":
            if len(dataset.shape) == 1:
                sym = tensor.vector()
            elif len(dataset.shape) == 2:
                sym = tensor.matrix()
            elif len(dataset.shape) == 3:
                sym = tensor.tensor3()
            elif len(dataset.shape) == 4:
                sym = tensor.tensor4()
            else:
                raise ValueError("dataset %s has unsupported shape" % name)
        elif dataset.dtype == "int32":
            if len(dataset.shape) == 1:
                sym = tensor.ivector()
            elif len(dataset.shape) == 2:
                sym = tensor.imatrix()
            elif len(dataset.shape) == 3:
                sym = tensor.itensor3()
            elif len(dataset.shape) == 4:
                sym = tensor.itensor4()
            else:
                raise ValueError("dataset %s has unsupported shape" % name)
        else:
            raise ValueError("dataset %s has unsupported dtype %s" % (name, dataset.dtype))
        if list_of_test_values is not None:
            sym.tag.test_value = list_of_test_values[n]
        tag_expression(sym, name, dataset.shape)
        datasets_added.append(sym)

    if DATASETS_ID not in graph.keys():
        graph[DATASETS_ID] = []
    graph[DATASETS_ID] += datasets_added
    if len(list_of_datasets) == 1:
        # Make it easier if you only added a single dataset
        datasets_added = datasets_added[0]
    return datasets_added
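# Standalone sketch of the dtype/rank dispatch used in add_datasets_to_graph, with the
# project-specific helpers (safe_zip, tag_expression, DATASETS_ID) omitted. The helper
# name symbol_for is invented for illustration and is not part of the original code.
import numpy as np
from theano import tensor

def symbol_for(dataset, name):
    int_syms = {1: tensor.ivector, 2: tensor.imatrix, 3: tensor.itensor3, 4: tensor.itensor4}
    flt_syms = {1: tensor.vector, 2: tensor.matrix, 3: tensor.tensor3, 4: tensor.tensor4}
    table = int_syms if dataset.dtype == "int32" else flt_syms
    if dataset.ndim not in table:
        raise ValueError("dataset %s has unsupported shape" % name)
    return table[dataset.ndim]()

sym = symbol_for(np.zeros((5, 3, 7), dtype='int32'), 'tokens')   # -> an itensor3 variable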
def char_hierarchical_doc_fn(args, word_embed, char_embed, values=None): char_x = T.itensor4('char_x') word_x = T.itensor3('word_x') word_mask = T.tensor3('word_mask') sent_mask = T.matrix('sent_mask') doc_linguistic_x = T.matrix('doc_linguistic') label_y = T.ivector('label_y') char_input_layer = lasagne.layers.InputLayer(shape=(None, args.max_sent, args.max_word, args.max_char), input_var=char_x) word_input_layer = lasagne.layers.InputLayer(shape=(None, args.max_sent, args.max_word), input_var=word_x) word_mask_layer = lasagne.layers.InputLayer(shape=(None, args.max_sent, args.max_word), input_var=word_mask) word_mask_layer = lasagne.layers.reshape(word_mask_layer, (-1, [2])) sent_mask_layer = lasagne.layers.InputLayer(shape=(None, args.max_sent), input_var=sent_mask) doc_linguistic_layer = lasagne.layers.InputLayer( shape=(None, args.max_ling), input_var=doc_linguistic_x) char_cnn = networks.char_cnn(char_input_layer, args.num_filter, args.conv_window, char_embed, args) word_rnn = networks.word_rnn(word_input_layer, word_mask_layer, word_embed, args, char_cnn) if args.dropout_rate > 0: word_rnn = lasagne.layers.dropout(word_rnn, p=args.dropout_rate) if args.word_att == 'avg': word_output = networks.AveragePooling(word_rnn, mask=word_mask_layer) elif args.word_att == 'last': word_output = word_rnn elif args.word_att == 'dot': word_att = lasagne.layers.DenseLayer( word_rnn, num_units=2 * args.hidden_size, num_leading_axes=-1, nonlinearity=lasagne.nonlinearities.tanh) word_att = networks.Attention(word_att, num_units=2 * args.hidden_size, mask=word_mask_layer) word_output = networks.AttOutput([word_rnn, word_att]) word_output = lasagne.layers.reshape(word_output, (-1, args.max_sent, [1])) sent_rnn = networks.sent_rnn(word_output, sent_mask_layer, args) if args.dropout_rate > 0: sent_rnn = lasagne.layers.dropout(sent_rnn, p=args.dropout_rate) sent_input = lasagne.layers.DenseLayer( sent_rnn, 2 * args.hidden_size, num_leading_axes=-1, nonlinearity=lasagne.nonlinearities.tanh) sent_att = networks.Attention(sent_input, num_units=2 * args.hidden_size, mask=sent_mask_layer) att_out = lasagne.layers.get_output(sent_att, deterministic=True) fn_check_attention = theano.function( [char_x, word_x, word_mask, sent_mask], att_out) sent_output = networks.AttOutput([sent_rnn, sent_att]) if args.doc_ling_nonlinear: doc_linguistic_layer = lasagne.layers.DenseLayer( doc_linguistic_layer, 60, num_leading_axes=-1, nonlinearity=lasagne.nonlinearities.rectify) if args.dropout_rate > 0: doc_linguistic_layer = lasagne.layers.dropout(doc_linguistic_layer, p=args.dropout_rate) sent_output = lasagne.layers.ConcatLayer( [sent_output, doc_linguistic_layer], axis=-1) network_output = lasagne.layers.DenseLayer( sent_output, num_units=1, nonlinearity=lasagne.nonlinearities.sigmoid) regularization = lasagne.regularization.regularize_layer_params( network_output, penalty=lasagne.regularization.l2) train_pred = lasagne.layers.get_output(network_output) loss = lasagne.objectives.binary_crossentropy( train_pred, label_y).mean() + regularization * 0.0001 if values is not None: lasagne.layers.set_all_param_values(network_output, values, trainable=True) params = lasagne.layers.get_all_params(network_output, trainable=True) if args.optimizer == 'sgd': updates = lasagne.updates.sgd(loss, params, args.learning_rate) elif args.optimizer == 'momentum': updates = lasagne.updates.momentum(loss, params, args.learning_rate) train_fn = theano.function( [word_x, char_x, word_mask, sent_mask, doc_linguistic_x, label_y], loss, 
updates=updates) prediction = lasagne.layers.get_output(network_output, deterministic=True) eval_fn = theano.function( [word_x, char_x, word_mask, sent_mask, doc_linguistic_x], prediction) return fn_check_attention, eval_fn, train_fn, params
def config_theano(self): ################################################################## ########################### NOT USING NOW ######################## ################################################################## # snapshot and change snapshot = T.itensor3('snapshot') change_label = T.fmatrix('change_label') ################################################################## ################################################################## ################################################################## # trade-off hyperparameters _lambda = 0.1 _alpha = 0.1 _avgLen = 20. # regularization and learning rate lr = T.scalar('lr') reg = T.scalar('reg') beta = T.scalar('beta') # semantics inf_trk_labels = T.fmatrix('inf_trk_labels') req_trk_labels = T.fmatrix('req_trk_labels') # DB matching degree db_degrees = T.fmatrix('db_degrees') # source and target utts source = T.imatrix('source') target = T.imatrix('target') source_len = T.ivector('source_len') target_len = T.ivector('target_len') utt_group = T.ivector('utt_group') # masked source and target utts masked_source = T.imatrix('masked_source') masked_target = T.imatrix('masked_target') masked_source_len = T.ivector('masked_source_len') masked_target_len = T.ivector('masked_target_len') # tracker features, either n-grams or delexicalised position srcfeat = T.itensor4('srcfeat') tarfeat = T.itensor4('tarfeat') # external samples success_rewards = T.fvector('success_reward') samples = T.ivector('samples') # for numerical stability epsln = 1e-10 # dialog level recurrence def dialog_recur(source_t, target_t, source_len_t, target_len_t, masked_source_t, masked_target_t, masked_source_len_t, masked_target_len_t, utt_group_t, snapshot_t, success_reward_t, sample_t, change_label_t, db_degree_t, inf_label_t, req_label_t, source_feat_t, target_feat_t, belief_tm1, masked_target_tm1, masked_target_len_tm1, target_feat_tm1, posterior_tm1): ############################################################## ##################### Intent encoder ######################### ############################################################## # Intent encoder if self.enc == 'lstm': masked_intent_t = bidirectional_encode(self.fEncoder, self.bEncoder, masked_source_t, masked_source_len_t) ############################################################## ########## Belief tracker, informable + requestable ########## ############################################################## # cost placeholder for accumulation print '\tloss function' loss_t = theano.shared(np.zeros((1), dtype=theano.config.floatX))[0] companion_loss_t = theano.shared( np.zeros((1), dtype=theano.config.floatX))[0] prior_loss_t = theano.shared( np.zeros((1), dtype=theano.config.floatX))[0] posterior_loss_t = theano.shared( np.zeros((1), dtype=theano.config.floatX))[0] base_loss_t = theano.shared( np.zeros((1), dtype=theano.config.floatX))[0] # other information to store dtmp = 1 #if self.vae_train=='sample' else self.dl reward_t = theano.shared( np.zeros((dtmp), dtype=theano.config.floatX)) baseline_t = theano.shared( np.zeros((1), dtype=theano.config.floatX))[0] posterior_t = theano.shared( np.zeros((self.dl), dtype=theano.config.floatX))[0] # Informable slot belief tracker # belief vector belief_t = [] if self.trk == 'rnn' and self.inf == True: for i in range(len(self.infotrackers)): # slice the current belief tracker output cur_belief_tm1 = belief_tm1[self.iseg[i]:self.iseg[i + 1]] if self.trkenc == 'cnn': # cnn, position features ssrcpos_js = source_feat_t[ 0, self.iseg[i]:self.iseg[i + 1], :] 
vsrcpos_js = source_feat_t[ 1, self.iseg[i]:self.iseg[i + 1], :] starpos_jm1s = target_feat_tm1[ 0, self.iseg[i]:self.iseg[i + 1], :] vtarpos_jm1s = target_feat_tm1[ 1, self.iseg[i]:self.iseg[i + 1], :] # tracking cur_belief_t = self.infotrackers[i].recur( cur_belief_tm1, masked_source_t, masked_target_tm1, masked_source_len_t, masked_target_len_tm1, ssrcpos_js, vsrcpos_js, starpos_jm1s, vtarpos_jm1s) # semi label cur_label_t = inf_label_t[self.iseg[i]:self.iseg[i + 1]] # include cost if training tracker if self.learn_mode == 'all' or self.learn_mode == 'trk': print '\t\tincluding informable tracker loss ...' loss_t += -T.sum( cur_label_t * T.log10(cur_belief_t + epsln)) # accumulate belief vector if self.bef == 'full': belief_t.append(cur_label_t) else: # summary belief tmp = [T.sum( cur_label_t[:-2],axis=0).dimshuffle('x'),\ cur_label_t[-2].dimshuffle('x')] tmp = tmp + [cur_label_t[-1].dimshuffle('x')] if\ self.bef=='summary' else tmp cur_sum_belief_t = T.concatenate(tmp, axis=0) belief_t.append(cur_sum_belief_t) inf_belief_t = inf_label_t # Requestable slot belief tracker if self.trk == 'rnn' and self.req == True: for i in range(len(self.rseg) - 1): # current feature index bn = self.iseg[-1] + 2 * i if self.trkenc == 'cnn': # cnn, position features ssrcpos_js = source_feat_t[0, bn, :] vsrcpos_js = source_feat_t[1, bn, :] starpos_jm1s = target_feat_tm1[0, bn, :] vtarpos_jm1s = target_feat_tm1[1, bn, :] # tracking cur_belief_t = self.reqtrackers[i].recur( masked_source_t, masked_target_tm1, masked_source_len_t, masked_target_len_tm1, ssrcpos_js, vsrcpos_js, starpos_jm1s, vtarpos_jm1s) # semi label cur_label_t = req_label_t[2 * i:2 * (i + 1)] # include cost if training tracker if self.learn_mode == 'all' or self.learn_mode == 'trk': print '\t\tincluding requestable tracker loss ...' loss_t += -T.sum( cur_label_t * T.log10(cur_belief_t + epsln)) # accumulate belief vector if self.bef == 'full': belief_t.append(cur_label_t) else: tmp = cur_label_t if self.bef == 'summary' else cur_label_t[: 1] belief_t.append(tmp) # offer-change tracker minus1 = -T.ones((1), dtype='int32') cur_belief_t = self.changeTracker.recur( masked_source_t, masked_target_tm1, masked_source_len_t, masked_target_len_tm1, minus1, minus1, minus1, minus1) # cost function if self.learn_mode == 'trk' or self.learn_mode == 'all': print '\t\tincluding OfferChange tracker loss ...' loss_t += -T.sum( change_label_t * T.log10(cur_belief_t + epsln)) # accumulate belief vector if self.bef == 'full': belief_t.append(change_label_t) else: tmp = change_label_t[:1] if self.bef=='simplified' \ else change_label_t belief_t.append(tmp) ############################################################## ######################## LSTM decoder ######################## ############################################################## bef_t = T.concatenate(belief_t, axis=0) # LSTM decoder if self.dec == 'lstm' and self.learn_mode != 'trk': prob_t, snapCost_t, prior_t, posterior_t, z_t, base_t, debugX = \ self.decoder.decode( masked_source_t, masked_source_len_t, masked_target_t, masked_target_len_t, masked_intent_t, belief_t, db_degree_t[-6:], utt_group_t, snapshot_t, sample_t) debug_t = prior_t # decoder loss if self.ply != 'latent': # deterministic policy print '\t\tincluding decoder loss ...' 
loss_t += -T.sum(T.log10(prob_t + epsln)) else: # variational policy # disconnet gradient flow P = G.disconnected_grad(prior_t) Q = G.disconnected_grad(posterior_t) Qtm1 = G.disconnected_grad(posterior_tm1) # prior network loss if self.learn_mode == 'rl': # rl fine-tuning print '\t\tincluding RL success reward for fine-tine policy ...' prior_loss_t = -success_reward_t * T.log10(prior_t + epsln)[z_t] else: # neural variational inference # encoder loss, minimising KL(Q|P) and self-supervised action print '\t\tinclding KL(Q|Pi) to train policy network Pi ...' prior_loss_t = -T.switch( T.lt(utt_group_t, self.dl - 1), T.log10(prior_t + epsln)[z_t], _alpha * T.sum(Q * (T.log10(prior_t + epsln) - T.log10(Q + epsln)))) # decoder loss for current sample/ground truth print '\t\tincluding decoder loss ...' loss_t = -T.sum(T.log10(prob_t + epsln)) # define reward function for Q print '\t\tincluding reinforce loss to train inference network Q ...' r_t = G.disconnected_grad( _avgLen * T.mean(T.log10(prob_t + epsln)) + # decoder loglikelihood -_lambda * T.sum(Q * (T.log10(Q + epsln) - T.log10(P + epsln))) + # KL(P|Q) -_lambda * T.sum(Qtm1 * (T.log10(Qtm1 + epsln) - T.log10(Q + epsln))) # KL(Qt|Qtm1) ) # actual reward after deducting baseline reward_t = G.disconnected_grad(r_t - base_t) baseline_t = base_t #debug_t = r_t-base_t # Q network loss: reinforce objective posterior_loss_t = -T.switch( T.lt(utt_group_t, self.dl - 1), T.log10(posterior_t + epsln)[z_t], # self-sup _alpha * reward_t * T.log10(posterior_t + epsln)[z_t] # reinforce ) # baseline loss print '\t\tincluding baseline loss ...' base_loss_t = T.switch(T.lt(utt_group_t, self.dl - 1), 0., (r_t - baseline_t)**2) # snapshot objective if self.use_snap: print '\t\tincluding decoder snapshot loss ...' companion_loss_t += -T.sum( snapCost_t[:masked_target_len_t - 1]) # dummy, TODO: change it if self.ply != 'latent': posterior_t = posterior_tm1 z_t = posterior_tm1 reward_t = posterior_tm1 prior_t = posterior_tm1 debug_t = posterior_tm1 # take the semi label for next input - like LM return inf_belief_t, masked_target_t, masked_target_len_t, \ target_feat_t, posterior_t, z_t,\ loss_t, companion_loss_t, prior_loss_t, posterior_loss_t, base_loss_t,\ reward_t, baseline_t, debug_t # initial belief state belief_0 = T.zeros((self.iseg[-1]), dtype=theano.config.floatX) belief_0 = T.set_subtensor(belief_0[[x - 1 for x in self.iseg[1:]]], 1.0) # initial target jm1 masked_target_tm1 = T.ones_like(masked_target[0]) masked_target_len_tm1 = T.ones_like(masked_target_len[0]) # initial target jm1 position features tarfeat_tm1 = -T.ones_like(tarfeat[0]) # initial posterior p0 = np.ones((self.dl)) / float(self.dl) posterior_0 = theano.shared(p0.astype(theano.config.floatX)) # Dialogue level forward propagation [_,_,_,_,posterior,sample,loss,companion_loss,prior_loss,posterior_loss,base_loss, reward,baseline,debug], updates= \ theano.scan( fn=dialog_recur, sequences=[source,target,source_len,target_len, masked_source,masked_target, masked_source_len,masked_target_len, utt_group, snapshot, success_rewards, samples, change_label, db_degrees, inf_trk_labels, req_trk_labels, srcfeat, tarfeat],\ outputs_info=[belief_0,masked_target_tm1,masked_target_len_tm1,tarfeat_tm1, posterior_0,None,None,None,None,None,None,None,None,None]) # Theano validation function self.valid = theano.function( inputs=[source, target, source_len, target_len, masked_source, masked_target, masked_source_len, masked_target_len, utt_group, snapshot, success_rewards, samples, change_label, inf_trk_labels, 
req_trk_labels, db_degrees, srcfeat, tarfeat],\ outputs=[loss,prior_loss,posterior],\ updates=updates,\ on_unused_input='warn') # RL validation function self.validRL = theano.function( inputs=[source, target, source_len, target_len, masked_source, masked_target, masked_source_len, masked_target_len, utt_group, snapshot, success_rewards, samples, change_label, inf_trk_labels, req_trk_labels, db_degrees, srcfeat, tarfeat],\ outputs=[prior_loss, debug],\ updates=updates,\ on_unused_input='warn') # for deterministic case, just loglikelihood if self.ply == 'attention' or self.ply == 'normal': # flatten parameters self.flatten_params = [] for k in ['inftrk', 'reqtrk', 'dec', 'ply', 'enc']: ws = self.params[k] if self.learn_mode == 'all': # train whole model print '\tgradient w.r.t %s' % (k) self.flatten_params += ws elif self.learn_mode == 'trk' and 'trk' in k: # pretrain tracker print '\tgradient w.r.t %s' % (k) self.flatten_params += ws elif self.learn_mode == 'encdec': # train * apart from tracker if 'trk' in k: continue # tracker else: print '\tgradient w.r.t %s' % (k) self.flatten_params += ws # loss function self.cost = T.sum(loss) + 0.1 * T.sum(companion_loss) # gradients and updates updates = adam(self.cost, self.flatten_params, lr=lr, reg=reg) # default value for function output prior_loss = posterior_loss = baseline_loss = self.cost # for NVI elif self.ply == 'latent': # flatten parameters self.flatten_params = [] for k in ['ply', 'enc', 'dec']: # train encoder decoder if self.learn_mode == 'encdec': print '\tgradient w.r.t %s' % (k) self.flatten_params += self.params[k] # fine-tune policy network by RL elif self.learn_mode == 'rl': if k == 'ply': print '\tgradient w.r.t %s prior network' % (k) self.flatten_params += self.params[k][7:10] # loss function if self.learn_mode == 'rl': self.cost = T.sum(prior_loss) elif self.learn_mode == 'encdec': self.cost = T.sum(loss) + 0.1*T.sum(companion_loss) +\ T.sum(prior_loss) + T.sum(posterior_loss) # gradients and updates for p, q in adam(self.cost, self.flatten_params, lr=lr, reg=reg): updates.update({p: q}) if self.learn_mode == 'encdec': # baseline objective for p, q in adam(T.sum(base_loss), self.policy.baseline.params, lr=lr * 10., reg=0.): updates.update({p: q}) self.flatten_params.extend(self.policy.baseline.params) # theano training function self.train = theano.function( inputs= [source, target, source_len, target_len, masked_source, masked_target, masked_source_len, masked_target_len, utt_group, snapshot, success_rewards, samples, change_label, inf_trk_labels, req_trk_labels, db_degrees, srcfeat, tarfeat, lr, reg],\ outputs=[loss,prior_loss,posterior_loss,base_loss, posterior,sample,reward,baseline,debug],\ updates=updates,\ on_unused_input='warn') # RL training function self.trainRL = theano.function( inputs= [source, target, source_len, target_len, masked_source, masked_target, masked_source_len, masked_target_len, utt_group, snapshot, success_rewards, samples, change_label, inf_trk_labels, req_trk_labels, db_degrees, srcfeat, tarfeat, lr, reg],\ outputs=[prior_loss,sample, debug],\ updates=updates,\ on_unused_input='warn')
def build_text_only_network(d_word, d_hidden, lr, eps=1e-6): # input theano vars in_context_fc7 = T.tensor3( name='context_images' ) # bsz x 3 x 4096 (because 3 context panels, fc7 features each of dim 4096) in_context_bb = T.tensor4( name='context_bb' ) # bsz x 3 x 3 x 4 (because 3 context panels, each contains a max of 3 speech boxes, each box described by 4 coordinates) in_bbmask = T.tensor3( name='bounding_box_mask' ) # bsz x 3 x 3 (because 3 context panels, each contains a max of 3 speech boxes, the mask has an entry of 1 in the ith position if the panel contains the ith speech box) in_context = T.itensor4( name='context' ) # bsz x 3 x 3 x 30 (because 3 context panels, each contains a max of 3 speech boxes, each box contains speech with a max of 30 words) in_cmask = T.tensor4( name='context_mask' ) # bsz x 3 x 3 x 30 (because 3 context panels, each contains a max of 3 speech boxes, each box contains speech with a max of 30 words, where the mask has an entry of 1 in the ith position if the ith word exists in the speech) in_answer_fc7 = T.matrix( name='answer_images' ) # bsz x 4096 (fc7 feature for the panel for which we want to guess the speech) in_answer_bb = T.matrix( name='answer_bb' ) # bsz x 4 (the answer panel has one speech box described by 4 coordinates) in_answers = T.itensor3( name='answers' ) # bsz x 3 x 30 (3 candidate answers each of max 30 words ) in_amask = T.tensor3( name='answer_mask' ) # bsz x 3 x 30 (mask for 3 candidates answers, ie, an entry of 1 in the ith position if the ith word exists in the candidate) in_labels = T.imatrix( name='labels' ) # bsz x 3 (out of 3 candidate answers, the correct answer will have a 1) # define network l_context_fc7 = lasagne.layers.InputLayer(shape=(None, 3, 4096), input_var=in_context_fc7) l_answer_fc7 = lasagne.layers.InputLayer(shape=(None, 4096), input_var=in_answer_fc7) l_context = lasagne.layers.InputLayer(shape=(None, max_panels, max_boxes, max_words), input_var=in_context) l_answers = lasagne.layers.InputLayer(shape=(None, 3, max_words), input_var=in_answers) l_cmask = lasagne.layers.InputLayer(shape=l_context.shape, input_var=in_cmask) l_amask = lasagne.layers.InputLayer(shape=l_answers.shape, input_var=in_amask) l_bbmask = lasagne.layers.InputLayer(shape=(None, 3, max_boxes), input_var=in_bbmask) # contexts and answers should share embeddings l_context_emb = lasagne.layers.EmbeddingLayer(l_context, len_voc, d_word, name='word_emb') l_answer_emb = lasagne.layers.EmbeddingLayer(l_answers, len_voc, d_word, W=l_context_emb.W) l_context_box_reps = SumAverageLayer([l_context_emb, l_cmask], compute_sum=True, num_dims=4) l_box_reshape = lasagne.layers.ReshapeLayer(l_context_box_reps, (-1, max_boxes, d_word)) l_bbmask_reshape = lasagne.layers.ReshapeLayer(l_bbmask, (-1, max_boxes)) l_box_lstm = lasagne.layers.LSTMLayer(l_box_reshape, num_units=d_word, mask_input=l_bbmask_reshape, only_return_final=True) l_context_panel_reps = lasagne.layers.ReshapeLayer(l_box_lstm, (-1, 3, d_word)) l_context_final_reps = lasagne.layers.LSTMLayer(l_context_panel_reps, num_units=d_word, only_return_final=True) l_ans_reps = SumAverageLayer([l_answer_emb, l_amask], compute_sum=True, num_dims=3) l_scores = InnerProductLayer([l_context_final_reps, l_ans_reps]) preds = lasagne.layers.get_output(l_scores) loss = T.mean(lasagne.objectives.categorical_crossentropy( preds, in_labels)) all_params = lasagne.layers.get_all_params(l_scores, trainable=True) updates = lasagne.updates.adam(loss, all_params, learning_rate=lr) train_fn = theano.function([ 
in_context_fc7, in_context_bb, in_bbmask, in_context, in_cmask, in_answer_fc7, in_answer_bb, in_answers, in_amask, in_labels ], loss, updates=updates, on_unused_input='warn') pred_fn = theano.function([ in_context_fc7, in_context_bb, in_bbmask, in_context, in_cmask, in_answer_fc7, in_answer_bb, in_answers, in_amask ], preds, on_unused_input='warn') return train_fn, pred_fn, l_scores
                                   inputs=output, mask_type=('b', 1)))
    output = lib.ops.conv2d.Conv2D('Dec2.Out', input_dim=DIM_PIX_2, output_dim=2 * LATENT_DIM_1,
                                   filter_size=1, inputs=output, mask_type=('b', 1), he_init=False)
    return output

total_iters = T.iscalar('total_iters')
images = T.itensor4('images')  # shape: (batch size, n channels, height, width)

alpha = T.minimum(
    1,
    T.cast(total_iters, theano.config.floatX) / lib.floatX(ALPHA_ITERS))

def split(mu_and_logsig):
    mu, logsig = mu_and_logsig[:, ::2], mu_and_logsig[:, 1::2]
    logsig = T.log(T.nnet.softplus(logsig))
    return mu, logsig

def clamp_logsig(logsig):
    beta = T.minimum(
        1,
def main(train_path,val_path,save_path,num_epochs=NUM_EPOCHS): # save settings shutil.copyfile('settings.py','%s/settings.txt'%save_path) print("Preparing Data...") # Training data if not RELOAD_DATA: print("Creating Pairs...") trainX = batched_tweets.create_pairs(train_path) valX = batched_tweets.create_pairs(val_path) print("Number of training pairs = {}".format(len(trainX[0]))) print("Number of validation pairs = {}".format(len(valX[0]))) with open('%s/train_pairs.pkl'%(save_path),'w') as f: pkl.dump(trainX, f) with open('%s/val_pairs.pkl'%(save_path),'w') as f: pkl.dump(valX, f) else: print("Loading Pairs...") with open(train_path,'r') as f: trainX = pkl.load(f) with open(val_path,'r') as f: valX = pkl.load(f) if not RELOAD_MODEL: # Build dictionary chardict, charcount = batched_tweets.build_dictionary(trainX[0] + trainX[1]) n_char = len(chardict.keys()) + 1 batched_tweets.save_dictionary(chardict,charcount,'%s/dict.pkl' % save_path) # params params = init_params_c2w2s(n_chars=n_char) else: print("Loading model params...") params = load_params_shared('%s/model.npz' % save_path) print("Loading dictionary...") with open('%s/dict.pkl' % save_path, 'rb') as f: chardict = pkl.load(f) n_char = len(chardict.keys()) + 1 train_iter = batched_tweets.BatchedTweets(trainX, batch_size=N_BATCH, maxlen=MAX_LENGTH) val_iter = batched_tweets.BatchedTweets(valX, batch_size=512, maxlen=MAX_LENGTH) print("Building network...") # Tweet variables tweet = T.itensor4() ptweet = T.itensor4() ntweet = T.itensor4() # masks t_mask = T.ftensor3() tp_mask = T.ftensor3() tn_mask = T.ftensor3() # Embeddings emb_t, c2w, w2s = char2word2vec(tweet, t_mask, params, n_char) emb_tp, c2w, w2s = char2word2vec(ptweet, tp_mask, params, n_char) emb_tn, c2w, w2s = char2word2vec(ntweet, tn_mask, params, n_char) # batch loss D1 = 1 - T.batched_dot(emb_t, emb_tp)/(tnorm(emb_t)*tnorm(emb_tp)) D2 = 1 - T.batched_dot(emb_t, emb_tn)/(tnorm(emb_t)*tnorm(emb_tn)) gap = D1-D2+M loss = gap*(gap>0) cost = T.mean(loss) + REGULARIZATION*lasagne.regularization.regularize_network_params(c2w, lasagne.regularization.l2) + REGULARIZATION*lasagne.regularization.regularize_network_params(w2s, lasagne.regularization.l2) cost_only = T.mean(loss) reg_only = REGULARIZATION*lasagne.regularization.regularize_network_params(c2w, lasagne.regularization.l2) + REGULARIZATION*lasagne.regularization.regularize_network_params(w2s, lasagne.regularization.l2) # params and updates print("Computing updates...") lr = LEARNING_RATE mu = MOMENTUM updates = lasagne.updates.nesterov_momentum(cost, lasagne.layers.get_all_params(w2s), lr, momentum=mu) # Theano function print("Compiling theano functions...") inps = [tweet,t_mask,ptweet,tp_mask,ntweet,tn_mask] #dist = theano.function(inps,[D1,D2]) #l = theano.function(inps,loss) cost_val = theano.function(inps,[cost_only, emb_t, emb_tp, emb_tn]) train = theano.function(inps,cost,updates=updates) reg_val = theano.function([],reg_only) # Training print("Training...") uidx = 0 try: for epoch in range(num_epochs): n_samples = 0 train_cost = 0. 
print("Epoch {}".format(epoch)) if USE_SCHEDULE: # schedule if epoch > 0 and epoch % 5 == 0: print("Updating Schedule...") lr = max(1e-5,lr/2) mu = mu - 0.05 updates = lasagne.updates.nesterov_momentum(cost, lasagne.layers.get_all_params(net), lr, momentum=mu) train = theano.function(inps,cost,updates=updates) ud_start = time.time() for x,y,z in train_iter: if not x: print("Minibatch with no valid triples") continue n_samples +=len(x) uidx += 1 if DEBUG and uidx > 3: sys.exit() if DEBUG: print("Tweets = {}".format(x[:5])) x, x_m, y, y_m, z, z_m = batched_tweets.prepare_data_c2w2s(x, y, z, chardict, maxwordlen=MAX_WORD_LENGTH, maxseqlen=MAX_SEQ_LENGTH, n_chars=n_char) if x==None: print("Minibatch with zero samples under maxlength.") uidx -= 1 continue if DEBUG: print("Params before update...") print_params(params) display_actv(x,x_m,y,y_m,z,z_m,inps,w2s,'before') cb, embb, embb_p, embb_n = cost_val(x,x_m,y,y_m,z,z_m) curr_cost = train(x,x_m,y,y_m,z,z_m) train_cost += curr_cost*len(x) if DEBUG: print("Params after update...") print_params(params) display_actv(x,x_m,y,y_m,z,z_m,inps,w2s,'after') ca, emba, emba_p, emba_n = cost_val(x,x_m,y,y_m,z,z_m) print("Embeddings before = {}".format(embb[:5])) print("Embeddings after = {}".format(emba[:5])) print("Cost before update = {} \nCost after update = {}".format(cb, ca)) if np.isnan(curr_cost) or np.isinf(curr_cost): print("Nan detected.") return ud = time.time() - ud_start if np.mod(uidx, DISPF) == 0: print("Epoch {} Update {} Cost {} Time {} Samples {}".format(epoch,uidx,curr_cost,ud,len(x))) if np.mod(uidx,SAVEF) == 0: print("Saving...") saveparams = OrderedDict() for kk,vv in params.iteritems(): saveparams[kk] = vv.get_value() np.savez('%s/model.npz' % save_path,**saveparams) print("Done.") print("Computing Validation Cost...") validation_cost = 0. 
n_val_samples = 0 for x,y,z in val_iter: if not x: print("Validation: Minibatch with no valid triples") continue n_val_samples += len(x) x, x_m, y, y_m, z, z_m = batched_tweets.prepare_data_c2w2s(x, y, z, chardict, maxwordlen=MAX_WORD_LENGTH, maxseqlen=MAX_SEQ_LENGTH, n_chars=n_char) if x==None: print("Validation: Minibatch with zero samples under maxlength") continue curr_cost, _, _, _ = cost_val(x,x_m,y,y_m,z,z_m) validation_cost += curr_cost*len(x) regularization_cost = reg_val() print("Epoch {} Training Cost {} Validation Cost {} Regularization Cost {}".format(epoch, train_cost/n_samples, validation_cost/n_val_samples, regularization_cost)) print("Seen {} samples.".format(n_samples)) for kk,vv in params.iteritems(): print("Param {} Epoch {} Max {} Min {}".format(kk, epoch, np.max(vv.get_value()), np.min(vv.get_value()))) print("Saving...") saveparams = OrderedDict() for kk,vv in params.iteritems(): saveparams[kk] = vv.get_value() np.savez('%s/model_%d.npz' % (save_path,epoch),**saveparams) print("Done.") if False: # store embeddings and data features = np.zeros((len(train_iter.data[0]),3*WDIM)) distances = np.zeros((len(train_iter.data[0]),2)) for idx, triple in enumerate(zip(train_iter.data[0],train_iter.data[1],train_iter.data[2])): x, x_m, y, y_m, z, z_m = batched_tweets.prepare_data([triple[0]], [triple[1]], [triple[2]], chardict, maxlen=MAX_LENGTH, n_chars=n_char) if x==None: continue emb1, emb2, emb3 = t2v(x,x_m,y,y_m,z,z_m) emb1 = np.reshape(emb1, (WDIM)) emb2 = np.reshape(emb2, (WDIM)) emb3 = np.reshape(emb3, (WDIM)) features[idx,:] = np.concatenate((emb1,emb2,emb3),axis=0) distances[idx,0] = 1-np.dot(emb1,emb2)/(np.linalg.norm(emb1)*np.linalg.norm(emb2)) distances[idx,1] = 1-np.dot(emb1,emb3)/(np.linalg.norm(emb1)*np.linalg.norm(emb3)) with open('debug/feat_%d.npy'%epoch,'w') as df: np.save(df,features) with open('debug/dist_%d.npy'%epoch,'w') as ds: np.save(ds,distances) if False: with open('debug/data.txt','w') as dd: for triple in zip(train_iter.data[0],train_iter.data[1],train_iter.data[2]): dd.write('%s\t%s\t%s\n' % (triple[0],triple[1],triple[2])) except KeyboardInterrupt: pass
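# Hedged numpy restatement (not project code) of the triplet cost used in the training
# and validation loops above: D1 = 1 - cos(anchor, positive), D2 = 1 - cos(anchor,
# negative), loss = mean(max(D1 - D2 + M, 0)). The margin M normally comes from
# settings.py; its value and the toy embeddings below are placeholders.
import numpy as np

def cosine_distance(a, b):
    return 1.0 - np.sum(a * b, axis=1) / (np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1))

M = 0.5   # placeholder margin
anchor = np.random.randn(4, 8).astype('float32')
positive = np.random.randn(4, 8).astype('float32')
negative = np.random.randn(4, 8).astype('float32')

gap = cosine_distance(anchor, positive) - cosine_distance(anchor, negative) + M
print(np.mean(gap * (gap > 0)))   # batch-mean hinge cost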
def __init__(self, args): ''' nh :: dimension of the hidden layer nc :: number of classes ne :: number of word embeddings in the vocabulary #de :: dimension of the word embeddings cs :: word window context size ''' self.container = {} self.args = args self.args['rng'] = numpy.random.RandomState(3435) self.args['dropoutTrigger'] = args['dropoutTrigger'] if args['dropoutTrigger'] > 0. else 0. self.args['dropoutArg'] = args['dropoutArg'] if args['dropoutArg'] > 0. else 0. # parameters of the model self.container['params'], self.container['names'] = [], [] self.container['embDict'] = OrderedDict() self.container['vars'] = OrderedDict() self.container['dimIn'] = 0 print '******************FEATURES******************' for ed in self.args['features']: if self.args['features'][ed] == 0: self.container['embDict'][ed] = theano.shared(self.args['embs'][ed].astype(theano.config.floatX)) if self.args['updateEmbs']: print '@@@@@@@ Will update embedding table: ', ed self.container['params'] += [self.container['embDict'][ed]] self.container['names'] += [ed] if self.args['features'][ed] == 0: self.container['vars'][ed] = T.imatrix() dimAdding = self.args['embs'][ed].shape[1] self.container['dimIn'] += dimAdding elif self.args['features'][ed] == 1: self.container['vars'][ed] = T.tensor3() dimAdding = self.args['features_dim'][ed] self.container['dimIn'] += dimAdding if self.args['features'][ed] >= 0: print 'represetation - ', ed, ' : ', dimAdding print 'REPRESENTATION DIMENSION = ', self.container['dimIn'] if self.args['distanceFet'] == 0: self.container['embDict']['dist1'] = theano.shared(self.args['embs']['dist1'].astype(theano.config.floatX)) self.container['embDict']['dist2'] = theano.shared(self.args['embs']['dist2'].astype(theano.config.floatX)) self.container['embDict']['dist3'] = theano.shared(self.args['embs']['dist3'].astype(theano.config.floatX)) if self.args['updateEmbs']: print '@@@@@@@ Will update distance embedding tables' self.container['params'] += [self.container['embDict']['dist1'], self.container['embDict']['dist2'], self.container['embDict']['dist3']] self.container['names'] += ['dist1', 'dist2', 'dist3'] if self.args['triggerGlob'] == 0: self.container['embDict']['trigger'] = theano.shared(self.args['embs']['trigger'].astype(theano.config.floatX)) if self.args['updateEmbs']: print '@@@@@@@ Will update trigger embedding table' self.container['params'] += [ self.container['embDict']['trigger'] ] self.container['names'] += ['trigger'] #self.container['sentLength'] = T.ivector('sentLength') self.container['triggerAnn'] = T.imatrix('triggerAnn') self.container['triggerMaskTrain'] = T.matrix('triggerMaskTrain') self.container['triggerMaskTest'] = T.imatrix('triggerMaskTest') self.container['triggerMaskTrainArg'] = T.matrix('triggerMaskTrainArg') self.container['triggerMaskTestArg'] = T.imatrix('triggerMaskTestArg') self.container['entities'] = T.imatrix('entities') self.container['argumentEntityIdAnn'] = T.itensor3('argumentEntityIdAnn') self.container['argumentPosAnn'] = T.itensor3('argumentPosAnn') self.container['argumentLabelAnn'] = T.itensor3('argumentLabelAnn') self.container['argumentMaskTrain'] = T.tensor3('argumentMaskTrain') self.container['possibleEnityIdByTrigger'] = T.itensor3('possibleEnityIdByTrigger') self.container['possibleEnityPosByTrigger'] = T.itensor3('possibleEnityPosByTrigger') self.container['argumentMaskTest'] = T.itensor3('argumentMaskTest') self.container['relDistBinary'] = T.tensor4('relDistBinary') #dimshuffle(1,0,2,3) first self.container['relDistIdxs'] = 
T.itensor3('relDistIdxs') #dimshuffle(1,0,2) first self.container['NodeFets'] = T.itensor3('NodeFets') self.container['EdgeFets'] = T.itensor4('EdgeFets') #self.container['numEntities'] = T.iscalar('numEntities') self.container['lr'] = T.scalar('lr') self.container['zeroVector'] = T.vector('zeroVector') self.glob = {} self.glob['batch'] = self.args['batch'] self.glob['maxSentLength'] = self.args['maxSentLength'] self.glob['numTrigger'] = self.args['numTrigger'] self.glob['numArg'] = self.args['numArg'] self.glob['maxNumEntities'] = self.args['maxNumEntities'] self.glob['eachTrigger'] = theano.shared(numpy.zeros([self.glob['batch'], self.glob['maxSentLength'], self.glob['numTrigger']]).astype(theano.config.floatX)) self.glob['eachArg'] = theano.shared(numpy.zeros([self.glob['batch'], self.glob['maxSentLength'], self.glob['numArg']]).astype(theano.config.floatX)) self.glob['eachTriggerId'] = theano.shared(numpy.zeros([self.glob['batch'], self.glob['maxSentLength']]).astype('int32')) self.glob['eachArgId'] = theano.shared(numpy.zeros([self.glob['batch'], self.glob['maxSentLength']]).astype('int32')) self.glob['trigger'] = theano.shared(numpy.zeros([self.glob['batch'], self.glob['numTrigger']]).astype(theano.config.floatX)) self.glob['arg'] = theano.shared(numpy.zeros([self.glob['batch'], self.glob['numArg']]).astype(theano.config.floatX)) self.glob['argTrigger'] = theano.shared(numpy.zeros([self.glob['batch'], self.glob['maxNumEntities'], self.glob['numTrigger']]).astype(theano.config.floatX)) self.glob['argArg'] = theano.shared(numpy.zeros([self.glob['batch'], self.glob['maxNumEntities'], self.glob['numArg']]).astype(theano.config.floatX)) self.globZero = {} self.globZero['eachTrigger'] = numpy.zeros([self.glob['batch'], self.glob['maxSentLength'], self.glob['numTrigger']]).astype(theano.config.floatX) self.globZero['eachArg'] = numpy.zeros([self.glob['batch'], self.glob['maxSentLength'], self.glob['numArg']]).astype(theano.config.floatX) self.globZero['eachTriggerId'] = numpy.zeros([self.glob['batch'], self.glob['maxSentLength']]).astype('int32') self.globZero['eachArgId'] = numpy.zeros([self.glob['batch'], self.glob['maxSentLength']]).astype('int32') self.globZero['trigger'] = numpy.zeros([self.glob['batch'], self.glob['numTrigger']]).astype(theano.config.floatX) self.globZero['arg'] = numpy.zeros([self.glob['batch'], self.glob['numArg']]).astype(theano.config.floatX) self.globZero['argTrigger'] = numpy.zeros([self.glob['batch'], self.glob['maxNumEntities'], self.glob['numTrigger']]).astype(theano.config.floatX) self.globZero['argArg'] = numpy.zeros([self.glob['batch'], self.glob['maxNumEntities'], self.glob['numArg']]).astype(theano.config.floatX) self.globVar = {} self.globVar['eachTrigger'] = T.tensor3() self.globVar['eachArg'] = T.tensor3() self.globVar['eachTriggerId'] = T.imatrix() self.globVar['eachArgId'] = T.imatrix() self.globVar['trigger'] = T.matrix() self.globVar['arg'] = T.matrix() self.globVar['argTrigger'] = T.tensor3() self.globVar['argArg'] = T.tensor3() self.globFunc = {} self.container['setZero'] = OrderedDict() self.container['zeroVecs'] = OrderedDict()
def main(data_path, model_path):
    print("Loading data...")
    with open(data_path, 'r') as f:
        valX = pkl.load(f)

    print("Preparing data...")
    val_iter = batched_tweets.BatchedTweets(valX, batch_size=512, maxlen=MAX_LENGTH)

    print("Loading dictionary...")
    with open('%s/dict.pkl' % model_path, 'rb') as f:
        chardict = pkl.load(f)
    n_char = len(chardict.keys()) + 1

    # check for model files
    files = sorted(glob.glob(model_path + 'model_*.npz'))
    print("Found {} model files".format(len(files)))

    for modelf in files:
        print("Computing validation cost on {}".format(modelf))
        print("Loading params...")
        params = load_params(modelf)

        print("Building network...")

        # Tweet variables
        tweet = T.itensor4()
        ptweet = T.itensor4()
        ntweet = T.itensor4()

        # masks
        t_mask = T.ftensor3()
        tp_mask = T.ftensor3()
        tn_mask = T.ftensor3()

        # Embeddings
        emb_t = char2word2vec(tweet, t_mask, params, n_char)[0]
        emb_tp = char2word2vec(ptweet, tp_mask, params, n_char)[0]
        emb_tn = char2word2vec(ntweet, tn_mask, params, n_char)[0]

        # batch cost
        D1 = 1 - T.batched_dot(emb_t, emb_tp) / (tnorm(emb_t) * tnorm(emb_tp))
        D2 = 1 - T.batched_dot(emb_t, emb_tn) / (tnorm(emb_t) * tnorm(emb_tn))
        gap = D1 - D2 + M
        loss = gap * (gap > 0)
        cost = T.mean(loss)
        reg = REGULARIZATION * lasagne.regularization.regularize_network_params(
            char2word2vec(tweet, t_mask, params, n_char)[1], lasagne.regularization.l2) \
            + REGULARIZATION * lasagne.regularization.regularize_network_params(
            char2word2vec(tweet, t_mask, params, n_char)[2], lasagne.regularization.l2)

        # Theano function
        print("Compiling theano function...")
        inps = [tweet, t_mask, ptweet, tp_mask, ntweet, tn_mask]
        cost_val = theano.function(inps, cost)
        reg_val = theano.function([], reg)

        print("Testing...")
        uidx = 0
        try:
            validation_cost = 0.
            reg_cost = 0.
            n_val_samples = 0
            for x, y, z in val_iter:
                if not x:
                    print("Validation: Minibatch with no valid triples")
                    continue
                n_val_samples += len(x)
                x, x_m, y, y_m, z, z_m = batched_tweets.prepare_data_c2w2s(
                    x, y, z, chardict, maxwordlen=MAX_WORD_LENGTH,
                    maxseqlen=MAX_SEQ_LENGTH, n_chars=n_char)

                if x == None:
                    print("Validation: Minibatch with zero samples under maxlength")
                    continue

                curr_cost = cost_val(x, x_m, y, y_m, z, z_m)
                validation_cost += curr_cost * len(x)
                reg_cost = reg_val()

            print("Model {} Validation Cost {} Regularization Cost {}".format(
                modelf, validation_cost / n_val_samples, reg_cost))
            print("Seen {} samples.".format(n_val_samples))
        except KeyboardInterrupt:
            pass
def create_network(self): def save_model(metrics, epoch_nr): max_f1_idx = np.argmax(metrics["f1_macro_validate"]) max_f1 = np.max(metrics["f1_macro_validate"]) if epoch_nr == max_f1_idx and max_f1 > 0.01: # saving to network drives takes 5s (to local only 0.5s) -> do not save so often print(" Saving weights...") for fl in glob.glob(join(self.HP.EXP_PATH, "best_weights_ep*") ): # remove weights from previous epochs os.remove(fl) try: np.savez( join(self.HP.EXP_PATH, "best_weights_ep" + str(epoch_nr) + ".npz"), *L.layers.get_all_param_values(self.output)) except IOError: print( "\nERROR: Could not save weights because of IO Error\n" ) self.HP.BEST_EPOCH = epoch_nr def load_model(path): ExpUtils.print_verbose(self.HP, "Loading weights ... ({})".format(path)) with np.load( path ) as f: #if both pathes are absolute and beginning of pathes are the same, join will merge the beginning param_values = [f['arr_%d' % i] for i in range(len(f.files))] L.layers.set_all_param_values(output_layer_for_loss, param_values) if self.HP.SEG_INPUT == "Peaks" and self.HP.TYPE == "single_direction": # NR_OF_GRADIENTS = 15 # SH-Coeff NR_OF_GRADIENTS = 9 elif self.HP.SEG_INPUT == "Peaks" and self.HP.TYPE == "combined": # NR_OF_GRADIENTS = 3 NR_OF_GRADIENTS = 3 * self.HP.NR_OF_CLASSES # NR_OF_GRADIENTS = self.HP.NR_OF_CLASSES else: NR_OF_GRADIENTS = 33 print("Building network ...") # Lasagne Seed for Reproducibility L.random.set_rng(np.random.RandomState(1)) net = self.get_UNet(n_input_channels=NR_OF_GRADIENTS, num_output_classes=self.HP.NR_OF_CLASSES, input_dim=self.HP.INPUT_DIM, base_n_filters=self.HP.UNET_NR_FILT) output_layer_for_loss = net["output_flat"] if self.HP.LOAD_WEIGHTS: load_model(join(self.HP.EXP_PATH, self.HP.WEIGHTS_PATH)) X_sym = T.tensor4() # y_sym = T.imatrix() # (bs*x*y, nr_of_classes) y_sym = T.itensor4() # (bs, nr_of_classes, x, y) y_sym_flat = y_sym.dimshuffle( (0, 2, 3, 1)) # (bs, x, y, nr_of_classes) y_sym_flat = y_sym_flat.reshape( (-1, y_sym_flat.shape[3])) # (bs*x*y, nr_of_classes) # add some weight decay # l2_loss = L.regularization.regularize_network_params(output_layer_for_loss, L.regularization.l2) * 1e-5 ##Train prediction_train = L.layers.get_output(output_layer_for_loss, X_sym, deterministic=False) loss_vec_train = L.objectives.binary_crossentropy( prediction_train, y_sym_flat) loss_vec_train = loss_vec_train.mean( axis=1 ) #before: (bs*x*y, nrClasses) (= elementwise binary CE), after: (bs*x*y) (= same shape as output from categorical CE) # loss_train = loss_vec_train.mean() + l2_loss loss_train = loss_vec_train.mean() ##Test prediction_test = L.layers.get_output(output_layer_for_loss, X_sym, deterministic=True) # prediction_test = L.layers.get_output(output_layer_for_loss, X_sym, deterministic=False) #for Dropout Sampling loss_vec_test = L.objectives.binary_crossentropy( prediction_test, y_sym_flat) loss_vec_test = loss_vec_test.mean(axis=1) # loss_test = loss_vec_test.mean() + l2_loss loss_test = loss_vec_test.mean() ##Parameter Updates all_params = L.layers.get_all_params(output_layer_for_loss, trainable=True) learning_rate = theano.shared(np.float32(self.HP.LEARNING_RATE)) # updates = L.updates.adam(loss_train, all_params, learning_rate) updates = L.updates.adamax(loss_train, all_params, learning_rate) ##Convenience function output_train = L.layers.get_output(net["output"], X_sym, deterministic=False) output_test = L.layers.get_output(net["output"], X_sym, deterministic=True) # output_test = L.layers.get_output(net["output"], X_sym, deterministic=False) #for Dropout Sampling 
#Calc F1 NEW (simpler) output_shuff_train = output_train.dimshuffle( (0, 3, 1, 2)) # (bs, nrClasses, x, y) dice_scores_train = theano_binary_dice_per_instance_and_class( output_shuff_train, y_sym, dim=2, first_spatial_axis=2 ) # (bs, nrClasses) -> dice for each class in each batch f1_train = T.mean(dice_scores_train) #average over batches and classes output_shuff_test = output_test.dimshuffle( (0, 3, 1, 2)) # (bs, nrClasses, x, y) dice_scores_test = theano_binary_dice_per_instance_and_class( output_shuff_test, y_sym, dim=2, first_spatial_axis=2 ) # (bs, nrClasses) -> dice for each class in each batch f1_test = T.mean(dice_scores_test) # average over batches and classes #Define Functions train_fn = theano.function( [X_sym, y_sym], [loss_train, prediction_train, f1_train], updates=updates ) # prediction_TEST, because we do not want dropout when computing the score here either?? predict_fn = theano.function([X_sym, y_sym], [loss_test, prediction_test, f1_test]) get_probs = theano.function([X_sym], output_test) #Exporting variables self.learning_rate = learning_rate self.train = train_fn self.predict = predict_fn self.get_probs = get_probs # (bs, x, y, nrClasses) self.net = net self.output = output_layer_for_loss # this is used for saving weights by save_model above (could probably also be simplified) self.save_model = save_model self.load_model = load_model
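`theano_binary_dice_per_instance_and_class` is defined elsewhere in that project; a plausible NumPy reading of what it computes (an assumption, not the project's actual helper) is the soft Dice score per sample and per class:

import numpy as np

def binary_dice_per_instance_and_class(probs, targets, eps=1e-7):
    # probs, targets: (bs, nrClasses, x, y); returns soft Dice scores of shape (bs, nrClasses)
    intersection = np.sum(probs * targets, axis=(2, 3))
    denominator = np.sum(probs, axis=(2, 3)) + np.sum(targets, axis=(2, 3))
    return (2.0 * intersection + eps) / (denominator + eps)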
def main(): # Where we'll save data to fname = sys.argv[0].split('.py')[0] curr_time = datetime.now().strftime('%d%H%M') save_dir = 'sample-' + fname + curr_time lrate = 5e-4 batch_size = 1 num_epochs = 100 crop_size = 360 input_var = T.tensor4('x') target_var = T.itensor4('y') images = np.load('images.npz')['arr_0'].astype( theano.config.floatX) / 255.0 labels = np.load('labels.npz')['arr_0'].astype(np.int32) num_classes = labels.shape[1] idx = np.arange(num_classes) idx = idx.reshape(1, num_classes, 1, 1) labels = labels / 255 labels = labels.astype(np.int32) * idx labels = np.sum(labels, axis=1, keepdims=True) np.random.seed(1234) idx = np.arange(images.shape[0]) np.random.shuffle(idx) X_train = images[idx[:-10]] y_train = labels[idx[:-10]] X_valid = images[idx[-10:]] y_valid = labels[idx[-10:]] # Compute class weights to balance dataset counts = [] for cl in xrange(num_classes): class_counts = 0 for img in y_train: class_counts += np.sum(img == cl) counts.append(class_counts) counts = np.array(counts).astype(theano.config.floatX) # We can either upscale the loss (i.e. multiply by a factor > 1), or # downscale the loss (multiply by a factor < 1). Here we do the latter counts = np.max(counts) / counts counts = counts / np.max(counts) counts[0] = counts[0] * 1.1 # stem counts[1] = counts[1] * 1.1 # tomato counts = T.as_tensor_variable(counts) # Build DenseNetwork input_shape = (None, 3, crop_size, crop_size) softmax, network = build_network(input_var, input_shape, num_classes) print 'Number of paramters: ', nn.count_params(network) preds = nn.get_output(softmax, deterministic=False) loss = lasagne.objectives.categorical_crossentropy(preds, target_var.flatten()) loss = loss * counts[target_var.flatten()] loss = T.mean(loss) + regularize_network_params(softmax, l2) * 0.0001 acc = T.mean(T.eq(T.argmax(preds, axis=1), target_var.flatten())) params = nn.get_all_params(softmax, trainable=True) updates = lasagne.updates.adam(loss, params, lrate) train_fn = theano.function([input_var, target_var], [loss, acc], updates=updates, allow_input_downcast=True) probs, preds = nn.get_output([softmax, network], deterministic=True) loss = lasagne.objectives.categorical_crossentropy(probs, target_var.flatten()) loss = loss * counts[target_var.flatten()] loss = T.mean(loss) + regularize_network_params(softmax, l2) * 0.0001 acc = T.mean(T.eq(T.argmax(probs, axis=1), target_var.flatten())) valid_fn = theano.function([input_var, target_var], [loss, acc, preds], allow_input_downcast=True) # We iterate over epochs: for epoch in range(num_epochs): # In each epoch, we do a full pass over the training data: train_err = 0 train_acc = 0 train_batches = 0 start_time = time.time() for batch in iterate_minibatches(X_train, y_train, batch_size, shuffle=True): inputs, targets = batch inputs, targets = random_crop(inputs, targets, crop_size, crop_size) err, acc = train_fn(inputs, targets) train_err += err train_acc += acc train_batches += 1 # And a full pass over the validation data: val_err = 0 val_acc = 0 val_batches = 0 valid_iou = np.zeros((num_classes, )) val_preds, val_inputs, val_targets = [], [], [] for batch in iterate_minibatches(X_valid, y_valid, batch_size, shuffle=False): inputs, targets = batch input_crop, target_crop = random_crop(inputs, targets, crop_size, crop_size) err, acc, preds = valid_fn(input_crop, target_crop) val_err += err val_acc += acc val_batches += 1 val_preds.append(preds) val_inputs.append(input_crop) val_targets.append(target_crop) valid_iou += meanIOU(preds, target_crop, num_classes) if 
epoch % 2 == 0: val_preds = np.vstack(val_preds) val_inputs = np.vstack(val_inputs) val_targets = np.vstack(val_targets) plot_predictions(val_inputs, val_preds, val_targets, epoch, save_dir) # Then we print the results for this epoch: print "Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs, time.time() - start_time) print " training loss:\t\t{:.6f}".format(train_err / train_batches) print " validation loss:\t\t{:.6f}".format(val_err / val_batches) print " validation accuracy:\t\t{:.2f} %".format(val_acc / val_batches * 100) print " validation IOU:\t\t{}".format(valid_iou / val_batches)
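The class-balancing logic above downweights frequent classes rather than upweighting rare ones; a compact sketch of the same weighting (illustrative only, mirroring the `counts[target_var.flatten()]` indexing in the loss):

import numpy as np

def inverse_frequency_weights(y_train, num_classes):
    # pixels per class, then scale so the rarest class keeps weight 1 and frequent classes get < 1
    counts = np.array([np.sum(y_train == cl) for cl in range(num_classes)], dtype=np.float64)
    weights = np.max(counts) / counts
    return weights / np.max(weights)

# weighted per-pixel loss, as in `loss * counts[target_var.flatten()]` above:
# weighted = per_pixel_ce * inverse_frequency_weights(y_train, num_classes)[targets.flatten()]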
)) output = T.concatenate([masked_targets, output], axis=1) output = T.nnet.relu(lib.ops.conv2d.Conv2D('Dec2.Pix3', input_dim=2*DIM_3, output_dim=DIM_PIX_2, filter_size=3, inputs=output, mask_type=('b', 1))) output = T.nnet.relu(lib.ops.conv2d.Conv2D('Dec2.Pix4', input_dim=DIM_PIX_2, output_dim=DIM_PIX_2, filter_size=3, inputs=output, mask_type=('b', 1))) output = T.nnet.relu(lib.ops.conv2d.Conv2D('Dec2.Pix7', input_dim=DIM_PIX_2, output_dim=DIM_PIX_2, filter_size=1, inputs=output, mask_type=('b', 1))) output = T.nnet.relu(lib.ops.conv2d.Conv2D('Dec2.Pix8', input_dim=DIM_PIX_2, output_dim=DIM_PIX_2, filter_size=1, inputs=output, mask_type=('b', 1))) output = lib.ops.conv2d.Conv2D('Dec2.Out', input_dim=DIM_PIX_2, output_dim=2*LATENT_DIM_1, filter_size=1, inputs=output, mask_type=('b', 1), he_init=False) return output total_iters = T.iscalar('total_iters') images = T.itensor4('images') # shape: (batch size, n channels, height, width) alpha = T.minimum(1, T.cast(total_iters, theano.config.floatX) / lib.floatX(ALPHA_ITERS)) def split(mu_and_logsig): mu, logsig = mu_and_logsig[:,::2], mu_and_logsig[:,1::2] logsig = T.log(T.nnet.softplus(logsig)) return mu, logsig def clamp_logsig(logsig): beta = T.minimum(1, T.cast(total_iters, theano.config.floatX) / lib.floatX(BETA_ITERS)) return T.nnet.relu(logsig, alpha=beta) # Layer 1 mu_and_logsig1 = Enc1(images)
PIXEL_CNN_LAYERS = 4 LR = 2e-4 BATCH_SIZE = 100 N_CHANNELS = 1 HEIGHT = 28 WIDTH = 28 TIMES = ('iters', 10 * 500, 1000 * 500) lib.print_model_settings(locals().copy()) # inputs.shape: (batch size, n channels, height, width) if MODE == '256ary': inputs = T.itensor4('inputs') inputs_embed = lib.ops.embedding.Embedding('Embedding', 256, DIM, inputs) inputs_embed = inputs_embed.dimshuffle(0, 1, 4, 2, 3) inputs_embed = inputs_embed.reshape( (inputs_embed.shape[0], inputs_embed.shape[1] * inputs_embed.shape[2], inputs_embed.shape[3], inputs_embed.shape[4])) output = lib.ops.conv2d.Conv2D('InputConv', input_dim=N_CHANNELS * DIM, output_dim=DIM, filter_size=7, inputs=inputs_embed, mask_type=('a', N_CHANNELS), he_init=False) else: inputs = T.tensor4('inputs')
from theano_toolkit.parameters import Parameters from theano_toolkit import updates import data_io import model import math from pprint import pprint import vae if __name__ == "__main__": chunk_size = 512 batch_size = 64 P = Parameters() autoencoder, inpaint = model.build(P) parameters = P.values() X = T.itensor4('X') X_hat, posteriors, priors = autoencoder(T.cast(X, 'float32') / 255.) latent_kls = [ T.mean(vae.kl_divergence(po_m, po_s, pr_m, pr_s), axis=0) for (po_m, po_s), (pr_m, pr_s) in zip(posteriors, priors) ] beta_start = 500 * (np.arange(len(latent_kls)) + 1) beta_lin = theano.shared(np.float32(0)) betas_ = (beta_lin - beta_start) / np.float32(500) betas_ = T.switch(betas_ < 0, 0, betas_) betas = T.switch(betas_ > 1, 1, betas_)[::-1] print betas.eval() train_latent_kl = sum(betas[i] * kl for i, kl in enumerate(latent_kls)) latent_kl = sum(latent_kls) recon_loss = model.cost(X_hat, X[:, :, 16:-16, 16:-16])
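The `betas` schedule above anneals each layer's KL term in with a delayed linear ramp; assuming `beta_lin` is stepped once per training iteration, the schedule is equivalent to this small NumPy sketch:

import numpy as np

def kl_betas(step, n_layers, delay=500, ramp=500.0):
    # layer i's KL weight stays at 0 until step delay*(i+1), then ramps linearly up to 1
    start = delay * (np.arange(n_layers) + 1)
    betas = np.clip((step - start) / ramp, 0.0, 1.0)
    return betas[::-1]  # reversed, matching the [::-1] above

# kl_betas(1200, 3) -> array([0. , 0.4, 1. ])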
def classifier(input_vars, network_build_fn, n_target_spatial_dims=0, target_channel_index=None, score=dnn_objective.ClassifierObjective.SCORE_ERROR, mask=False, includes_softmax=False, params_source=None, *args, **kwargs): """ Construct a classifier, given input variables, a network building function and an optional source from which to load parameters. :param input_vars: a list of input variables; if `None`, the network will be searched for `InputLayer` instances and their input variables will be used. :param network_build_fn: network builder function of the form `fn(input_vars) -> lasagne_layer` that constructs a network in the form of a Lasagne layer, given a list of input variables (Theano variables) :param n_target_spatial_dims: the number of spatial dimensions of the target; 0 for per-sample prediction with an ivector variable type, 1 for 1-dimensional prediction (e.g. time series) with an imatrix variable type (sample, time), 2 for 2-dimensional prediction (e.g. image) with an itensor3 variable type (sample, height, width), 3 for 3-dimensional prediction (e.g. volume) with an itensor4 variable type (sample, depth, height, width) :param target_channel_index: if `None`, targets are assumed not to have a channel dimension; if an integer, that channel will be used for the target. For example, with 0 spatial dimensions and `target_channel_index=None` the targets have shape `(sample,)`, whereas with 5 channels and the target in channel 2 the targets have shape `(sample, 5)` and the target indices are read from that channel, i.e. `y[:, 2]`. Note that the channel dimension adds one dimension to the target and mask variables, so targets and masks with 0, 1, 2 and 3 spatial dimensions then use imatrix, itensor3, itensor4 and itensor5 variable types.
:param score: the scoring metric used to evaluate classifier performance (see `dnn_objective.ClassifierObjective`) :param mask: (default=False) if True, samples will be masked, in which case sample weights/masks should be passed during training :param includes_softmax: `True` indicates that the final network layer includes the softmax non-linearity, `False` indicates that it does not, in which case a non-linearity layer will be added :param params_source: [optional] source from which to obtain network parameters; either a str/unicode that contains the path of a NumPy array file from which to load the parameters, or a `BasicDNN` or Lasagne layer from which to copy the parameters :return: a classifier instance """ # Prepare Theano variables for inputs and targets n_target_tims = n_target_spatial_dims + (0 if target_channel_index is None else 1) if n_target_tims == 0: target_var = T.ivector('y') elif n_target_tims == 1: target_var = T.imatrix('y') elif n_target_tims == 2: target_var = T.itensor3('y') elif n_target_tims == 3: target_var = T.itensor4('y') else: raise ValueError('Valid values for n_target_spatial_dims are in the range 0-3, not {}'.format( n_target_spatial_dims)) if mask: if n_target_tims == 0: mask_var = T.vector('m') elif n_target_tims == 1: mask_var = T.matrix('m') elif n_target_tims == 2: mask_var = T.tensor3('m') elif n_target_tims == 3: mask_var = T.tensor4('m') else: raise ValueError('Valid values for n_target_spatial_dims are in the range 0-3, not {}'.format( n_target_spatial_dims)) mask_vars = [mask_var] else: mask_var = None mask_vars = [] # Build the network network = network_build_fn(input_vars=input_vars) if input_vars is None: input_vars = _get_input_vars(network) objective = dnn_objective.ClassifierObjective('y', network, target_var, mask_expr=mask_var, n_target_spatial_dims=n_target_spatial_dims, target_channel_index=target_channel_index, score=score, includes_softmax=includes_softmax) return BasicClassifierDNN(input_vars, [target_var] + mask_vars, network, objective, params_source=params_source, *args, **kwargs)
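As a usage illustration (hypothetical names, not from the source), a 2-D segmentation classifier with masked targets could be constructed along these lines:

def build_segnet(input_vars):
    # hypothetical builder: constructs and returns the final Lasagne layer of a segmentation network
    ...

clf = classifier(input_vars=None,          # InputLayer variables are discovered automatically
                 network_build_fn=build_segnet,
                 n_target_spatial_dims=2,  # targets: itensor3 of shape (sample, height, width)
                 mask=True,                # expects a tensor3 mask alongside the targets
                 includes_softmax=False)   # a softmax non-linearity layer will be appended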
def create_network(self): if self.HP.SEG_INPUT == "Peaks" and self.HP.TYPE == "single_direction": # NR_OF_GRADIENTS = 15 # SH-Coeff NR_OF_GRADIENTS = 9 elif self.HP.SEG_INPUT == "Peaks" and self.HP.TYPE == "combined": # NR_OF_GRADIENTS = 3 NR_OF_GRADIENTS = 3*self.HP.NR_OF_CLASSES # 54 else: NR_OF_GRADIENTS = 33 if self.HP.RESOLUTION == "1.25mm": input_dim = (144, 144) elif self.HP.RESOLUTION == "2mm" or self.HP.RESOLUTION == "2.5mm": input_dim = (80, 80) print("Building network ...") print("(Model UNet)") # Lasagne Seed for Reproducibility L.random.set_rng(np.random.RandomState(1)) net = self.get_UNet(n_input_channels=NR_OF_GRADIENTS, num_output_classes=self.HP.NR_OF_CLASSES, input_dim=input_dim, base_n_filters=self.HP.UNET_NR_FILT) output_layer_for_loss = net["output_flat"] if self.HP.LOAD_WEIGHTS: print("Loading weights ... ({})".format(join(self.HP.EXP_PATH, self.HP.WEIGHTS_PATH))) with np.load(join(self.HP.EXP_PATH, self.HP.WEIGHTS_PATH)) as f: #if both pathes are absolute and beginning of pathes are the same, join will merge the beginning param_values = [f['arr_%d' % i] for i in range(len(f.files))] L.layers.set_all_param_values(output_layer_for_loss, param_values) X_sym = T.tensor4() w_sym = T.dvector() # y_sym = T.dmatrix() y_sym = T.itensor4() # (bs, nr_of_classes, x, y) y_sym_flat = y_sym.dimshuffle((0, 2, 3, 1)) # (bs, x, y, nr_of_classes) y_sym_flat = y_sym_flat.reshape((-1, y_sym_flat.shape[3])) # (bs*x*y, nr_of_classes) # add some weight decay # l2_loss = L.regularization.regularize_network_params(output_layer_for_loss, L.regularization.l2) * 1e-5 ##Train prediction_train = L.layers.get_output(output_layer_for_loss, X_sym, deterministic=False) loss_vec_train = L.objectives.squared_error(prediction_train, y_sym_flat) loss_vec_train = loss_vec_train.mean(axis=1) #before: (bs*x*y, nrClasses) (= elementwise binary CE), after: (bs*x*y) (= same shape as output from categorical CE) loss_vec_train *= w_sym # loss_train = loss_vec_train.mean() + l2_loss loss_train = loss_vec_train.mean() ##Test prediction_test = L.layers.get_output(output_layer_for_loss, X_sym, deterministic=True) loss_vec_test = L.objectives.squared_error(prediction_test, y_sym_flat) loss_vec_test = loss_vec_test.mean(axis=1) loss_vec_test *= w_sym # loss_test = loss_vec_test.mean() + l2_loss loss_test = loss_vec_test.mean() ##Parameter Updates all_params = L.layers.get_all_params(output_layer_for_loss, trainable=True) # learning_rate = theano.shared(floatX(0.0001)) learning_rate = theano.shared(np.float32(self.HP.LEARNING_RATE)) # updates = L.updates.adam(loss_train, all_params, learning_rate) updates = L.updates.adamax(loss_train, all_params, learning_rate) ##Convenience function output = L.layers.get_output(net["output"], X_sym, deterministic=True) #Calc F1 f1_per_call_train, _ = theano.scan(theano_f1_score, outputs_info=None, sequences=[theano.tensor.arange(y_sym_flat.shape[1])], non_sequences=[prediction_train, y_sym_flat]) f1_per_call_test, _ = theano.scan(theano_f1_score, outputs_info=None, sequences=[theano.tensor.arange(y_sym.shape[1])], non_sequences=[prediction_test, y_sym_flat]) f1_train = T.mean(f1_per_call_train) f1_test = T.mean(f1_per_call_test) train_fn = theano.function([X_sym, y_sym, w_sym], [loss_train, prediction_train, f1_train], updates=updates) #prediction_TEST, weil hier auch nicht Dropout will bei Score?? 
predict_fn = theano.function([X_sym, y_sym, w_sym], [loss_test, prediction_test, f1_test]) get_probs = theano.function([X_sym], output) #Exporting variables self.learning_rate = learning_rate self.train = train_fn self.predict = predict_fn self.get_probs = get_probs # (bs, x, y, nrClasses) self.net = net self.output = output_layer_for_loss # this is used for saving weights (could probably also be simplified)
def compileTheanoFunctions(self): print(" ----------------- Starting compilation process ----------------- ") # ------- Create and initialize sharedVariables needed to compile the training function ------ # # -------------------------------------------------------------------------------------------- # # For training self.trainingData_x = theano.shared(np.zeros([1,1,1,1,1], dtype="float32"), borrow = True) self.trainingData_y = theano.shared(np.zeros([1,1,1,1], dtype="float32") , borrow = True) self.trainingData_x_Bottom = theano.shared(np.zeros([1,1,1,1,1], dtype="float32"), borrow = True) # For testing self.testingData_x_Bottom = theano.shared(np.zeros([1,1,1,1,1], dtype="float32"), borrow = True) self.testingData_x = theano.shared(np.zeros([1,1,1,1,1], dtype="float32"), borrow = True) x_Train = self.inputNetwork_Train x_Train_Bottom = self.inputNetwork_Train_Bottom x_Test = self.inputNetwork_Test x_Test_Bottom = self.inputNetwork_Test_Bottom y_Train = T.itensor4('y') # Allocate symbolic variables for the data index_Train = T.lscalar() index_Test = T.lscalar() # ------- Needed to compile the training function ------ # # ------------------------------------------------------ # trainingData_y_CastedToInt = T.cast( self.trainingData_y, 'int32') # To accomodate the weights in the cost function to account for class imbalance weightsOfClassesInCostFunction = T.fvector() weightPerClass = T.fvector() # --------- Get trainable parameters (to be fit by gradient descent) ------- # # -------------------------------------------------------------------------- # [paramsTraining, numberParamsPerLayer] = self.getTrainable_Params() # ------------------ Define the cost function --------------------- # # ----------------------------------------------------------------- # def negLogLikelihood(): print (" --- Cost function: negativeLogLikelihood") costInLastLayer = self.lastLayer.negativeLogLikelihoodWeighted(y_Train,weightPerClass) return costInLastLayer def NotDefined(): print (" --- Cost function: Not defined!!!!!! 
WARNING!!!") optionsCostFunction = {0 : negLogLikelihood, 1 : NotDefined} costInLastLayer = optionsCostFunction[self.costFunction]() # --------------------------- Get costs --------------------------- # # ----------------------------------------------------------------- # # Get L1 and L2 weights regularization costL1 = 0 costL2 = 0 # Compute the costs for l_i in xrange(0, len(self.networkLayers)) : costL1 += abs(self.networkLayers[l_i].W).sum() costL2 += (self.networkLayers[l_i].W ** 2).sum() # Add also the cost of the last layer cost = (costInLastLayer + self.L1_reg_C * costL1 + self.L2_reg_C * costL2) # --------------------- Include all trainable parameters in updates (for optimization) ---------------------- # # ----------------------------------------------------------------------------------------------------------- # updates = self.getUpdatesOfTrainableParameters(cost, paramsTraining, numberParamsPerLayer) # --------------------- Include batch normalization params ---------------------- # # ------------------------------------------------------------------------------- # updates = updates + self.updateParams_BatchNorm() # For the testing function we need to get the Feature maps activations featMapsActivations = [] lower_act = 0 upper_act = 9999 # TODO: Change to output_Test for l_i in xrange(0,len(self.networkLayers)): featMapsActivations.append(self.networkLayers[l_i].outputTest[:, lower_act : upper_act, :, :, :]) # For the last layer get the predicted probabilities (p_y_given_x_test) featMapsActivations.append(self.lastLayer.p_y_given_x_test) # --------------------- Preparing data to compile the functions ---------------------- # # ------------------------------------------------------------------------------------ # givensDataSet_Train = { x_Train: self.trainingData_x[index_Train * self.batch_Size: (index_Train + 1) * self.batch_Size], x_Train_Bottom: self.trainingData_x_Bottom[index_Train * self.batch_Size: (index_Train + 1) * self.batch_Size], y_Train: trainingData_y_CastedToInt[index_Train * self.batch_Size: (index_Train + 1) * self.batch_Size], weightPerClass: weightsOfClassesInCostFunction } givensDataSet_Test = { x_Test: self.testingData_x[index_Test * self.batch_Size: (index_Test + 1) * self.batch_Size], x_Test_Bottom: self.testingData_x_Bottom[index_Test * self.batch_Size: (index_Test + 1) * self.batch_Size] } print(" ...Compiling the training function...") self.networkModel_Train = theano.function( [index_Train, weightsOfClassesInCostFunction], #[cost] + self.lastLayer.doEvaluation(y_Train), [cost], updates=updates, givens = givensDataSet_Train ) print(" ...The training function was compiled...") #self.getProbabilities = theano.function( #[index], #self.lastLayer.p_y_given_x_Train, #givens={ #x: self.trainingData_x[index * _self.batch_size: (index + 1) * _self.batch_size] #} #) print(" ...Compiling the testing function...") self.networkModel_Test = theano.function( [index_Test], featMapsActivations, givens = givensDataSet_Test ) print(" ...The testing function was compiled...")
def compileTheanoFunctions(self): print( " ----------------- Starting compilation process ----------------- " ) # ------- Create and initialize sharedVariables needed to compile the training function ------ # # -------------------------------------------------------------------------------------------- # # For training self.trainingData_x = theano.shared(np.zeros([1, 1, 1, 1, 1], dtype="float32"), borrow=True) self.trainingData_y = theano.shared(np.zeros([1, 1, 1, 1], dtype="float32"), borrow=True) # For testing self.testingData_x = theano.shared(np.zeros([1, 1, 1, 1, 1], dtype="float32"), borrow=True) x_Train = self.inputNetwork_Train x_Test = self.inputNetwork_Test y_Train = T.itensor4('y') # Allocate symbolic variables for the data index_Train = T.lscalar() index_Test = T.lscalar() # ------- Needed to compile the training function ------ # # ------------------------------------------------------ # trainingData_y_CastedToInt = T.cast(self.trainingData_y, 'int32') # To accomodate the weights in the cost function to account for class imbalance weightsOfClassesInCostFunction = T.fvector() weightPerClass = T.fvector() # --------- Get trainable parameters (to be fit by gradient descent) ------- # # -------------------------------------------------------------------------- # [paramsTraining, numberParamsPerLayer] = self.getTrainable_Params() # ------------------ Define the cost function --------------------- # # ----------------------------------------------------------------- # def negLogLikelihood(): print(" --- Cost function: negativeLogLikelihood") costInLastLayer = self.lastLayer.negativeLogLikelihoodWeighted( y_Train, weightPerClass) return costInLastLayer def NotDefined(): print(" --- Cost function: Not defined!!!!!! WARNING!!!") optionsCostFunction = {0: negLogLikelihood, 1: NotDefined} costInLastLayer = optionsCostFunction[self.costFunction]() # --------------------------- Get costs --------------------------- # # ----------------------------------------------------------------- # # Get L1 and L2 weights regularization costL1 = 0 costL2 = 0 # Compute the costs for l_i in xrange(0, len(self.networkLayers)): costL1 += abs(self.networkLayers[l_i].W).sum() costL2 += (self.networkLayers[l_i].W**2).sum() # Add also the cost of the last layer cost = (costInLastLayer + self.L1_reg_C * costL1 + self.L2_reg_C * costL2) # --------------------- Include all trainable parameters in updates (for optimization) ---------------------- # # ----------------------------------------------------------------------------------------------------------- # updates = self.getUpdatesOfTrainableParameters(cost, paramsTraining, numberParamsPerLayer) # --------------------- Include batch normalization params ---------------------- # # ------------------------------------------------------------------------------- # updates = updates + self.updateParams_BatchNorm() # For the testing function we need to get the Feature maps activations featMapsActivations = [] lower_act = 0 upper_act = 9999 # TODO: Change to output_Test for l_i in xrange(0, len(self.networkLayers)): featMapsActivations.append( self.networkLayers[l_i]. 
outputTest[:, lower_act:upper_act, :, :, :]) # For the last layer get the predicted probabilities (p_y_given_x_test) featMapsActivations.append(self.lastLayer.p_y_given_x_test) # --------------------- Preparing data to compile the functions ---------------------- # # ------------------------------------------------------------------------------------ # givensDataSet_Train = { x_Train: self.trainingData_x[index_Train * self.batch_Size:(index_Train + 1) * self.batch_Size], y_Train: trainingData_y_CastedToInt[index_Train * self.batch_Size:(index_Train + 1) * self.batch_Size], weightPerClass: weightsOfClassesInCostFunction } givensDataSet_Test = { x_Test: self.testingData_x[index_Test * self.batch_Size:(index_Test + 1) * self.batch_Size] } print(" ...Compiling the training function...") self.networkModel_Train = theano.function( [index_Train, weightsOfClassesInCostFunction], #[cost] + self.lastLayer.doEvaluation(y_Train), [cost], updates=updates, givens=givensDataSet_Train) print(" ...The training function was compiled...") #self.getProbabilities = theano.function( #[index], #self.lastLayer.p_y_given_x_Train, #givens={ #x: self.trainingData_x[index * _self.batch_size: (index + 1) * _self.batch_size] #} #) print(" ...Compiling the testing function...") self.networkModel_Test = theano.function([index_Test], featMapsActivations, givens=givensDataSet_Test) print(" ...The testing function was compiled...")
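Both variants of compileTheanoFunctions above rely on the classic Theano pattern of keeping data in shared variables and compiling functions that take only a batch index, with `givens` slicing the batch on the device; a self-contained miniature of that pattern (illustrative names only):

import numpy as np
import theano
import theano.tensor as T

# data lives on the device in shared variables
data_x = theano.shared(np.random.rand(100, 5).astype('float32'), borrow=True)
data_y = theano.shared(np.random.randint(0, 2, size=100).astype('int32'), borrow=True)

index = T.lscalar('index')
batch_size = 10
x = T.fmatrix('x')
y = T.ivector('y')
cost = T.mean(x) + T.mean(y)  # stand-in for a real network cost

# the compiled function only receives the batch index; `givens` substitutes the device-side slices
f = theano.function(
    [index], cost,
    givens={x: data_x[index * batch_size:(index + 1) * batch_size],
            y: data_y[index * batch_size:(index + 1) * batch_size]})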
print("loading data...") trainX = pkl.load(open(data_path,'r')) # dictionary chardict, charcount = batched_tweets.build_dictionary(trainX[0] + trainX[1]) n_char = len(chardict.keys()) + 1 # model params params = init_params_c2w2s(n_chars=n_char) # batches print("preparing batches...") train_iter = batched_tweets.BatchedTweets(trainX, batch_size=N_BATCH, maxlen=MAX_LENGTH) # Tweet variables tweet = T.itensor4() ptweet = T.itensor4() ntweet = T.itensor4() # masks t_mask = T.ftensor3() tp_mask = T.ftensor3() tn_mask = T.ftensor3() # Embeddings emb_t = char2word2vec(tweet, t_mask, params, n_char)[0] emb_tp = char2word2vec(ptweet, tp_mask, params, n_char)[0] emb_tn = char2word2vec(ntweet, tn_mask, params, n_char)[0] # batch loss D1 = 1 - T.batched_dot(emb_t, emb_tp)/(tnorm(emb_t)*tnorm(emb_tp))
def test_embedding_layer(): print "Testing embedding layer..." for k in xrange(1): print "Layer %i..." % k # random parameters input_dim = np.random.randint(1, 100) output_dim = np.random.randint(1, 100) # embedding layer embedding_layer = layer.EmbeddingLayer(input_dim, output_dim, 'test') for i in xrange(40): print "%i" % i, # tests for dimension 1, 2, 3 and 4 if i % 4 == 0: input = T.ivector('input_test') input_value = np.random.randint( low=0, high=input_dim, size=(np.random.randint(low=1, high=50),) ).astype(np.int32) elif i % 4 == 1: input = T.imatrix('input_test') input_value = np.random.randint( low=0, high=input_dim, size=(np.random.randint(low=1, high=40), np.random.randint(low=1, high=40)) ).astype(np.int32) elif i % 4 == 2: input = T.itensor3('input_test') input_value = np.random.randint( low=0, high=input_dim, size=(np.random.randint(low=1, high=30), np.random.randint(low=1, high=30), np.random.randint(low=1, high=30)) ).astype(np.int32) else: input = T.itensor4('input_test') input_value = np.random.randint( low=0, high=input_dim, size=(np.random.randint(low=1, high=20), np.random.randint(low=1, high=20), np.random.randint(low=1, high=20), np.random.randint(low=1, high=20)) ).astype(np.int32) output = embedding_layer.link(input) expected_value = embedding_layer.embeddings.get_value()[input_value] assert expected_value.shape == input_value.shape + (output_dim,) np.testing.assert_array_almost_equal( output.eval({input: input_value}), expected_value ) print "OK" print "All tests ran successfully for Embedding Layer."
else: return lib.ops.conv_decoder.ConvDecoder( 'Decoder', input_dim=LATENT_DIM, n_unpools=CONV_N_POOLS, base_n_filters=CONV_BASE_N_FILTERS, filter_size=CONV_FILTER_SIZE, output_size=WIDTH, output_n_channels=N_CHANNELS, inputs=latents ) total_iters = T.iscalar('total_iters') if MODE=='256ary': images = T.itensor4('images') else: images = T.tensor4('images') # shape (batch size, n channels, height, width) mu, log_sigma = Encoder(images) if VANILLA: latents = mu else: eps = T.cast(theano_srng.normal(mu.shape), theano.config.floatX) latents = mu + (eps * T.exp(log_sigma)) outputs = Decoder(latents) if MODE=='256ary': reconst_cost = T.nnet.categorical_crossentropy(
def create_network(self): if self.HP.SEG_INPUT == "Peaks" and self.HP.TYPE == "single_direction": # NR_OF_GRADIENTS = 15 # SH-Coeff NR_OF_GRADIENTS = 9 elif self.HP.SEG_INPUT == "Peaks" and self.HP.TYPE == "combined": # NR_OF_GRADIENTS = 3 NR_OF_GRADIENTS = 3 * self.HP.NR_OF_CLASSES # 54 else: NR_OF_GRADIENTS = 33 if self.HP.RESOLUTION == "1.25mm": input_dim = (144, 144) elif self.HP.RESOLUTION == "2mm" or self.HP.RESOLUTION == "2.5mm": input_dim = (80, 80) print("Building network ...") print("(Model UNet)") # Lasagne Seed for Reproducibility L.random.set_rng(np.random.RandomState(1)) net = self.get_UNet(n_input_channels=NR_OF_GRADIENTS, num_output_classes=self.HP.NR_OF_CLASSES, input_dim=input_dim, base_n_filters=self.HP.UNET_NR_FILT) output_layer_for_loss = net["output_flat"] if self.HP.LOAD_WEIGHTS: print("Loading weights ... ({})".format( join(self.HP.EXP_PATH, self.HP.WEIGHTS_PATH))) with np.load( join(self.HP.EXP_PATH, self.HP.WEIGHTS_PATH) ) as f: #if both pathes are absolute and beginning of pathes are the same, join will merge the beginning param_values = [f['arr_%d' % i] for i in range(len(f.files))] L.layers.set_all_param_values(output_layer_for_loss, param_values) X_sym = T.tensor4() # y_sym = T.imatrix() # (bs*x*y, nr_of_classes) y_sym = T.itensor4() # (bs, nr_of_classes, x, y) prediction_train = L.layers.get_output(output_layer_for_loss, X_sym, deterministic=False) prediction_test = L.layers.get_output(output_layer_for_loss, X_sym, deterministic=True) output_train = L.layers.get_output(net["output"], X_sym, deterministic=False) output_test = L.layers.get_output(net["output"], X_sym, deterministic=True) #Calc F1 NEW (simpler) output_shuff_train = output_train.dimshuffle( (0, 3, 1, 2)) # (bs, nrClasses x, y) dice_scores_train = theano_binary_dice_per_instance_and_class( output_shuff_train, y_sym, dim=2, first_spatial_axis=2 ) # (bs, nrClasses) -> dice for each class in each batch f1_train = T.mean(dice_scores_train) #average over batches and classes dice_scores_train_continuous = theano_binary_dice_per_instance_and_class_for_loss( output_shuff_train, y_sym, dim=2, first_spatial_axis=2 ) # (bs, nrClasses) -> dice for each class in each batch f1_train_continuous = T.mean( dice_scores_train_continuous) # average over batches and classes output_shuff_test = output_test.dimshuffle( (0, 3, 1, 2)) # (bs, nrClasses x, y) dice_scores_test = theano_binary_dice_per_instance_and_class( output_shuff_test, y_sym, dim=2, first_spatial_axis=2 ) # (bs, nrClasses) -> dice for each class in each batch f1_test = T.mean(dice_scores_test) # average over batches and classes dice_scores_test_continuous = theano_binary_dice_per_instance_and_class_for_loss( output_shuff_test, y_sym, dim=2, first_spatial_axis=2 ) # (bs, nrClasses) -> dice for each class in each batch f1_test_continuous = T.mean( dice_scores_test_continuous) # average over batches and classes loss_train = 1 - f1_train_continuous loss_test = 1 - f1_test_continuous ##Parameter Updates all_params = L.layers.get_all_params(output_layer_for_loss, trainable=True) learning_rate = theano.shared(np.float32(self.HP.LEARNING_RATE)) # updates = L.updates.adam(loss_train, all_params, learning_rate) updates = L.updates.adamax(loss_train, all_params, learning_rate) # Define Functions train_fn = theano.function( [X_sym, y_sym], [loss_train, prediction_train, f1_train], updates=updates ) # prediction_TEST, weil hier auch nicht Dropout will bei Score?? 
predict_fn = theano.function([X_sym, y_sym], [loss_test, prediction_test, f1_test]) get_probs = theano.function([X_sym], output_test) #Exporting variables self.learning_rate = learning_rate self.train = train_fn self.predict = predict_fn self.get_probs = get_probs # (bs, x, y, nrClasses) self.net = net self.output = output_layer_for_loss # this is used for saving weights (could probably also be simplified)
def train(args): print '\nNEURAL POS TAGGER START\n' print '\tINITIAL EMBEDDING\t%s %s' % (args.word_list, args.emb_list) print '\tWORD\t\t\tEmb Dim: %d Hidden Dim: %d' % (args.w_emb_dim, args.w_hidden_dim) print '\tCHARACTER\t\tEmb Dim: %d Hidden Dim: %d' % (args.c_emb_dim, args.c_hidden_dim) print '\tOPTIMIZATION\t\tMethod: %s Learning Rate: %f\n' % (args.opt, args.lr) print '\tMINI-BATCH: %d\n' % args.batch_size """ load data """ print 'Loading data sets...\n' train_corpus, vocab_word, vocab_char, vocab_tag, max_char_len = io_utils.load_conll(args.train_data) """ limit data set """ train_corpus = train_corpus[:args.data_size] train_corpus.sort(key=lambda a: len(a)) dev_corpus = None if args.dev_data: dev_corpus, dev_vocab_word, dev_vocab_char, dev_vocab_tag, max_char_len_dev = io_utils.load_conll(args.dev_data) for w in dev_vocab_word.i2w: if args.vocab_size is None or vocab_word.size() < args.vocab_size: vocab_word.add_word(w) for c in dev_vocab_char.i2w: vocab_char.add_word(c) for t in dev_vocab_tag.i2w: vocab_tag.add_word(t) if args.save: io_utils.dump_data(vocab_word, 'vocab_word') io_utils.dump_data(vocab_char, 'vocab_char') io_utils.dump_data(vocab_tag, 'vocab_tag') """ load pre-trained embeddings """ init_w_emb = None if args.emb_list: print '\tLoading word embeddings...\n' init_w_emb = io_utils.load_init_emb(args.emb_list, args.word_list, vocab_word) w_emb_dim = init_w_emb.shape[1] else: w_emb_dim = args.w_emb_dim """ converting into ids """ print '\nConverting into IDs...\n' tr_x, tr_c, tr_b, tr_y = convert_into_ids(train_corpus, vocab_word, vocab_char, vocab_tag, max_char_len) tr_x, tr_c, tr_y, tr_b = set_minibatch(tr_x, tr_c, tr_y, max_char_len, args.batch_size) tr_x, tr_c, tr_y = shared_samples(tr_x, tr_c, tr_y) if args.dev_data: dev_x, dev_c, dev_b, dev_y = convert_into_ids(dev_corpus, vocab_word, vocab_char, vocab_tag, max_char_len_dev) dev_x, dev_c, dev_y, dev_b = set_minibatch(dev_x, dev_c, dev_y, max_char_len_dev, 1) dev_x, dev_c, dev_y = shared_samples(dev_x, dev_c, dev_y) print '\tTrain Sentences: %d Dev Sentences: %d' % (len(train_corpus), len(dev_corpus)) else: print '\tTrain Sentences: %d' % len(train_corpus) print '\tWord size: %d Char size: %d' % (vocab_word.size(), vocab_char.size()) """ set model parameters """ w_hidden_dim = args.w_hidden_dim c_emb_dim = args.c_emb_dim c_hidden_dim = args.c_hidden_dim output_dim = vocab_tag.size() window = args.window opt = args.opt """ symbol definition """ print '\tCompiling Theano Code...' 
bos = T.iscalar('bos') eos = T.iscalar('eos') n_words = T.iscalar('n_words') batch_size = T.iscalar('batch_size') x = T.imatrix('x') c = T.itensor4('c') y = T.ivector('y') lr = T.fscalar('lr') """ tagger set up """ tagger = Model(x=x, c=c, y=y, n_words=n_words, batch_size=batch_size, lr=lr, init_emb=init_w_emb, vocab_w_size=vocab_word.size(), w_emb_dim=w_emb_dim, w_hidden_dim=w_hidden_dim, c_emb_dim=c_emb_dim, c_hidden_dim=c_hidden_dim, output_dim=output_dim, vocab_c_size=vocab_char.size(), window=window, opt=opt) train_f = theano.function( inputs=[bos, eos, n_words, batch_size, lr], outputs=[tagger.nll, tagger.result], updates=tagger.updates, givens={ x: tr_x[bos: eos], c: tr_c[bos: eos], y: tr_y[bos: eos] }, mode='FAST_RUN' ) dev_f = theano.function( inputs=[bos, eos, n_words, batch_size], outputs=tagger.result, givens={ x: dev_x[bos: eos], c: dev_c[bos: eos], y: dev_y[bos: eos] }, mode='FAST_RUN' ) def _train(): for epoch in xrange(args.epoch): _lr = args.lr / float(epoch+1) indices = range(len(tr_b)) random.shuffle(indices) print '\nEpoch: %d' % (epoch + 1) print '\tBatch Index: ', start = time.time() total = 0.0 correct = 0 losses = 0.0 for i, index in enumerate(indices): if i % 100 == 0 and i != 0: print i, sys.stdout.flush() boundary = tr_b[index] loss, corrects = train_f(boundary[0], boundary[1], boundary[2],boundary[3], _lr) assert math.isnan(loss) is False, i total += len(corrects) correct += np.sum(corrects) losses += loss end = time.time() print '\tTime: %f seconds' % (end - start) print '\tNegative Log Likelihood: %f' % losses print '\tAccuracy:%f Total:%d Correct:%d' % ((correct / total), total, correct) _dev(dev_f) def _dev(model): print '\tBatch Index: ', start = time.time() total = 0.0 correct = 0 for index in xrange(len(dev_b)): if index % 100 == 0 and index != 0: print index, sys.stdout.flush() boundary = dev_b[index] corrects = model(boundary[0], boundary[1], boundary[2], boundary[3]) total += len(corrects) correct += np.sum(corrects) end = time.time() print '\tTime: %f seconds' % (end - start) print '\tAccuracy:%f Total:%d Correct:%d' % ((correct / total), total, correct) _train()
def create_network(self): if self.HP.SEG_INPUT == "Peaks" and self.HP.TYPE == "single_direction": # NR_OF_GRADIENTS = 15 # SH-Coeff NR_OF_GRADIENTS = 9 elif self.HP.SEG_INPUT == "Peaks" and self.HP.TYPE == "combined": # NR_OF_GRADIENTS = 3 NR_OF_GRADIENTS = 3*self.HP.NR_OF_CLASSES # 54 else: NR_OF_GRADIENTS = 33 if self.HP.RESOLUTION == "1.25mm": input_dim = (144, 144) elif self.HP.RESOLUTION == "2mm" or self.HP.RESOLUTION == "2.5mm": input_dim = (80, 80) print("Building network ...") print("(Model UNet)") # Lasagne Seed for Reproducibility L.random.set_rng(np.random.RandomState(1)) net = self.get_UNet(n_input_channels=NR_OF_GRADIENTS, num_output_classes=self.HP.NR_OF_CLASSES, input_dim=input_dim, base_n_filters=self.HP.UNET_NR_FILT) output_layer_for_loss = net["output_flat"] if self.HP.LOAD_WEIGHTS: print("Loading weights ... ({})".format(join(self.HP.EXP_PATH, self.HP.WEIGHTS_PATH))) with np.load(join(self.HP.EXP_PATH, self.HP.WEIGHTS_PATH)) as f: #if both pathes are absolute and beginning of pathes are the same, join will merge the beginning param_values = [f['arr_%d' % i] for i in range(len(f.files))] L.layers.set_all_param_values(output_layer_for_loss, param_values) X_sym = T.tensor4() # y_sym = T.imatrix() # (bs*x*y, nr_of_classes) y_sym = T.itensor4() # (bs, nr_of_classes, x, y) prediction_train = L.layers.get_output(output_layer_for_loss, X_sym, deterministic=False) prediction_test = L.layers.get_output(output_layer_for_loss, X_sym, deterministic=True) output_train = L.layers.get_output(net["output"], X_sym, deterministic=False) output_test = L.layers.get_output(net["output"], X_sym, deterministic=True) #Calc F1 NEW (simpler) output_shuff_train = output_train.dimshuffle((0, 3, 1, 2)) # (bs, nrClasses x, y) dice_scores_train = theano_binary_dice_per_instance_and_class(output_shuff_train, y_sym, dim=2, first_spatial_axis=2) # (bs, nrClasses) -> dice for each class in each batch f1_train = T.mean(dice_scores_train) #average over batches and classes dice_scores_train_continuous = theano_binary_dice_per_instance_and_class_for_loss(output_shuff_train, y_sym, dim=2, first_spatial_axis=2) # (bs, nrClasses) -> dice for each class in each batch f1_train_continuous = T.mean(dice_scores_train_continuous) # average over batches and classes output_shuff_test = output_test.dimshuffle((0, 3, 1, 2)) # (bs, nrClasses x, y) dice_scores_test = theano_binary_dice_per_instance_and_class(output_shuff_test, y_sym, dim=2, first_spatial_axis=2) # (bs, nrClasses) -> dice for each class in each batch f1_test = T.mean(dice_scores_test) # average over batches and classes dice_scores_test_continuous = theano_binary_dice_per_instance_and_class_for_loss(output_shuff_test, y_sym, dim=2, first_spatial_axis=2) # (bs, nrClasses) -> dice for each class in each batch f1_test_continuous = T.mean(dice_scores_test_continuous) # average over batches and classes loss_train = 1 - f1_train_continuous loss_test = 1 - f1_test_continuous ##Parameter Updates all_params = L.layers.get_all_params(output_layer_for_loss, trainable=True) learning_rate = theano.shared(np.float32(self.HP.LEARNING_RATE)) # updates = L.updates.adam(loss_train, all_params, learning_rate) updates = L.updates.adamax(loss_train, all_params, learning_rate) # Define Functions train_fn = theano.function([X_sym, y_sym], [loss_train, prediction_train, f1_train], updates=updates) # prediction_TEST, weil hier auch nicht Dropout will bei Score?? 
predict_fn = theano.function([X_sym, y_sym], [loss_test, prediction_test, f1_test]) get_probs = theano.function([X_sym], output_test) #Exporting variables self.learning_rate = learning_rate self.train = train_fn self.predict = predict_fn self.get_probs = get_probs # (bs, x, y, nrClasses) self.net = net self.output = output_layer_for_loss # this is used for saving weights (could probably also be simplified)
DIM = 32 GRAD_CLIP = 1. Q_LEVELS = 256 BATCH_SIZE = 20 PRINT_EVERY = 250 EPOCH = 100 OUT_DIR = '/Tmp/kumarkun/cifar10' create_folder_if_not_there(OUT_DIR) model = Model(name = "CIFAR10.pixelCNN") is_train = T.scalar() X = T.tensor4('X') # shape: (batchsize, channels, height, width) X_r = T.itensor4('X_r') X_transformed = X_r.dimshuffle(0,2,3,1) input_layer = WrapperLayer(X.dimshuffle(0,2,3,1)) # input reshaped to (batchsize, height, width,3) pixel_CNN = pixelConv( input_layer, 3, DIM, Q_LEVELS = Q_LEVELS, name = model.name + ".pxCNN", num_layers = 12, ) model.add_layer(pixel_CNN)
WIDTH)).dimshuffle( 0, 2, 3, 4, 1) else: output = lib.ops.conv2d.Conv2D('OutputConv3', input_dim=PIX_DIM, output_dim=N_CHANNELS, filter_size=1, inputs=output, mask_type=('b', N_CHANNELS), he_init=False) return output total_iters = T.iscalar('total_iters') images = T.itensor4('images') # shape: (batch size, n channels, height, width) mu, log_sigma = Encoder(images) # mu = lib.debug.print_stats('mu', mu) # log_sigma = lib.debug.print_stats('log_sigma', log_sigma) if VANILLA: latents = mu else: eps = T.cast(theano_srng.normal(mu.shape), theano.config.floatX) latents = mu + (eps * T.exp(log_sigma)) latents = T.minimum(50, latents) latents = T.maximum(-50, latents)
def create_network(self): if self.HP.SEG_INPUT == "Peaks" and self.HP.TYPE == "single_direction": # NR_OF_GRADIENTS = 15 # SH-Coeff NR_OF_GRADIENTS = 9 elif self.HP.SEG_INPUT == "Peaks" and self.HP.TYPE == "combined": # NR_OF_GRADIENTS = 3 NR_OF_GRADIENTS = 3*self.HP.NR_OF_CLASSES # NR_OF_GRADIENTS = self.HP.NR_OF_CLASSES else: NR_OF_GRADIENTS = 33 print("Building network ...") print("(Model UNet)") # Lasagne Seed for Reproducibility L.random.set_rng(np.random.RandomState(1)) net = self.get_UNet(n_input_channels=NR_OF_GRADIENTS, num_output_classes=self.HP.NR_OF_CLASSES, input_dim=self.HP.INPUT_DIM, base_n_filters=self.HP.UNET_NR_FILT) output_layer_for_loss = net["output_flat"] if self.HP.LOAD_WEIGHTS: print("Loading weights ... ({})".format(join(self.HP.EXP_PATH, self.HP.WEIGHTS_PATH))) with np.load(join(self.HP.EXP_PATH, self.HP.WEIGHTS_PATH)) as f: #if both pathes are absolute and beginning of pathes are the same, join will merge the beginning param_values = [f['arr_%d' % i] for i in range(len(f.files))] L.layers.set_all_param_values(output_layer_for_loss, param_values) X_sym = T.tensor4() # y_sym = T.imatrix() # (bs*x*y, nr_of_classes) y_sym = T.itensor4() # (bs, nr_of_classes, x, y) y_sym_flat = y_sym.dimshuffle((0, 2, 3, 1)) # (bs, x, y, nr_of_classes) y_sym_flat = y_sym_flat.reshape((-1, y_sym_flat.shape[3])) # (bs*x*y, nr_of_classes) # add some weight decay # l2_loss = L.regularization.regularize_network_params(output_layer_for_loss, L.regularization.l2) * 1e-5 ##Train prediction_train = L.layers.get_output(output_layer_for_loss, X_sym, deterministic=False) loss_vec_train = L.objectives.binary_crossentropy(prediction_train, y_sym_flat) loss_vec_train = loss_vec_train.mean(axis=1) #before: (bs*x*y, nrClasses) (= elementwise binary CE), after: (bs*x*y) (= same shape as output from categorical CE) # loss_train = loss_vec_train.mean() + l2_loss loss_train = loss_vec_train.mean() ##Test prediction_test = L.layers.get_output(output_layer_for_loss, X_sym, deterministic=True) # prediction_test = L.layers.get_output(output_layer_for_loss, X_sym, deterministic=False) #for Dropout Sampling loss_vec_test = L.objectives.binary_crossentropy(prediction_test, y_sym_flat) loss_vec_test = loss_vec_test.mean(axis=1) # loss_test = loss_vec_test.mean() + l2_loss loss_test = loss_vec_test.mean() ##Parameter Updates all_params = L.layers.get_all_params(output_layer_for_loss, trainable=True) learning_rate = theano.shared(np.float32(self.HP.LEARNING_RATE)) updates = L.updates.adamax(loss_train, all_params, learning_rate) ##Convenience function output_train = L.layers.get_output(net["output"], X_sym, deterministic=False) output_test = L.layers.get_output(net["output"], X_sym, deterministic=True) #Calc F1 NEW (simpler) output_shuff_train = output_train.dimshuffle((0, 3, 1, 2)) # (bs, nrClasses x, y) dice_scores_train = theano_binary_dice_per_instance_and_class(output_shuff_train, y_sym, dim=2, first_spatial_axis=2) # (bs, nrClasses) -> dice for each class in each batch f1_train = T.mean(dice_scores_train) #average over batches and classes output_shuff_test = output_test.dimshuffle((0, 3, 1, 2)) # (bs, nrClasses x, y) dice_scores_test = theano_binary_dice_per_instance_and_class(output_shuff_test, y_sym, dim=2, first_spatial_axis=2) # (bs, nrClasses) -> dice for each class in each batch f1_test = T.mean(dice_scores_test) # average over batches and classes #Define Functions train_fn = theano.function([X_sym, y_sym], [loss_train, prediction_train, f1_train], updates=updates) # prediction_TEST, weil hier auch 
nicht Dropout will bei Score?? predict_fn = theano.function([X_sym, y_sym], [loss_test, prediction_test, f1_test]) get_probs = theano.function([X_sym], output_test) #Exporting variables self.learning_rate = learning_rate self.train = train_fn self.predict = predict_fn self.get_probs = get_probs # (bs, x, y, nrClasses) self.net = net self.output = output_layer_for_loss # this is used for saving weights (could probably also be simplified)
# MLP # def Discriminator(inputs): # n_samples = inputs.shape[0] # output = lib.ops.linear.Linear('Discriminator.In', 64*64*3, DIM, inputs, initialization='glorot_he') # output = T.nnet.relu(output) # output = lib.ops.linear.Linear('Discriminator.2', DIM, DIM, output, initialization='he') # output = T.nnet.relu(output) # output = lib.ops.linear.Linear('Discriminator.3', DIM, DIM, output, initialization='he') # output = T.nnet.relu(output) # output = lib.ops.linear.Linear('Discriminator.4', DIM, 1, output, initialization='he') # return output.reshape((n_samples,)) real_data_int = T.itensor4('images') real_data = (T.cast(real_data_int, 'float32')*(2./255) - 1.).reshape((-1,64*64*3)) fake_data = Generator(BATCH_SIZE) disc_out = Discriminator(T.concatenate([real_data, fake_data], axis=0)) disc_real = disc_out[:BATCH_SIZE] disc_fake = disc_out[BATCH_SIZE:] gen_cost = -T.mean(Discriminator(Generator(2*BATCH_SIZE))) disc_cost = T.mean(disc_fake) - T.mean(disc_real) alpha = srng.uniform( size=(BATCH_SIZE,1), low=0., high=1.
report(v) # 4-dimensional ndarray v = T.tensor4(name=None, dtype=T.config.floatX) report(v) # constructors with fixed data type. (examples with tensor4) # b: byte, w: word(16bit), l: int64, i: int32 # d:float64, f: float32, c: complex64, z: complex128 v = T.btensor4(name="v") report(v) v = T.wtensor4(name="v") report(v) v = T.itensor4(name="v") report(v) v = T.ltensor4(name="v") report(v) v = T.dtensor4(name="v") report(v) v = T.ftensor4(name="v") report(v) v = T.ctensor4(name="v") report(v) v = T.ztensor4(name="v")
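`report` is not shown in this snippet; a minimal stand-in consistent with how it is used (an assumption, not the original helper) would simply print the variable's metadata:

def report(var):
    # name, symbolic type, number of dimensions and dtype of a Theano variable
    print("%s  %s  ndim=%d  dtype=%s" % (var.name, var.type, var.ndim, var.dtype))

# report(T.itensor4(name="v")) prints something like: v  TensorType(int32, 4D)  ndim=4  dtype=int32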
# MLP # def Discriminator(inputs): # n_samples = inputs.shape[0] # output = lib.ops.linear.Linear('Discriminator.In', 64*64*3, DIM, inputs, initialization='glorot_he') # output = T.nnet.relu(output) # output = lib.ops.linear.Linear('Discriminator.2', DIM, DIM, output, initialization='he') # output = T.nnet.relu(output) # output = lib.ops.linear.Linear('Discriminator.3', DIM, DIM, output, initialization='he') # output = T.nnet.relu(output) # output = lib.ops.linear.Linear('Discriminator.4', DIM, 1, output, initialization='he') # return output.reshape((n_samples,)) real_data_int = T.itensor4('images') real_data = (T.cast(real_data_int, 'float32') * (2. / 255) - 1.).reshape( (-1, 64 * 64 * 3)) fake_data = Generator(BATCH_SIZE) disc_out = Discriminator(T.concatenate([real_data, fake_data], axis=0)) disc_real = disc_out[:BATCH_SIZE] disc_fake = disc_out[BATCH_SIZE:] gen_cost = -T.mean(Discriminator(Generator(2 * BATCH_SIZE))) disc_cost = T.mean(disc_fake) - T.mean(disc_real) alpha = srng.uniform(size=(BATCH_SIZE, 1), low=0., high=1.) differences = fake_data - real_data interpolates = real_data + (alpha * differences)
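The snippet stops right after forming `interpolates`; in the standard WGAN-GP formulation the continuation (an assumption here, not necessarily this file's exact code) adds a gradient penalty on those interpolates, reusing the symbols defined above:

# assumed standard WGAN-GP continuation; LAMBDA is a penalty coefficient (commonly 10)
gradients = T.grad(T.sum(Discriminator(interpolates)), interpolates)
slopes = T.sqrt(T.sum(T.sqr(gradients), axis=1))
gradient_penalty = T.mean((slopes - 1.) ** 2)
disc_cost += LAMBDA * gradient_penalty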
if __name__ == '__main__': data_test_path = 'test' save_dir = 'visualisation' seq_num = 50 n_class = 4 size = 192 # Build the network image_var = tensor5('image') image_pred_var = tensor5('image_pred') label_var = T.itensor4('label') image_seg_var = T.tensor4('image_seg') net = build_FCN_triple_branch_rnn(image_var, image_pred_var, image_seg_var, shape=(None, 1, size, size, seq_num), shape_seg=(None, 1, size, size)) #model_file = 'model/FCN_VGG16_sz192_flow_simese_rnn_shared.npz' model_file = 'model/FCN_VGG16_sz192_triple_3d_rnn_warped_tmp.npz' with np.load(model_file) as f: param_values = [f['arr_%d' % i] for i in range(len(f.files))] L.set_all_param_values([net['out'], net['outs']], param_values) test_prediction = L.get_output(net['outs']) test_loc = L.get_output(net['out'], deterministic=True)
PIXEL_CNN_LAYERS = 4 LR = 2e-4 BATCH_SIZE = 100 N_CHANNELS = 1 HEIGHT = 28 WIDTH = 28 TIMES = ('iters', 10*500, 1000*500) lib.print_model_settings(locals().copy()) # inputs.shape: (batch size, n channels, height, width) if MODE=='256ary': inputs = T.itensor4('inputs') inputs_float = inputs.astype(theano.config.floatX) * lib.floatX(2./255) inputs_float -= lib.floatX(0.5) else: inputs = T.tensor4('inputs') inputs_float = inputs output = lib.ops.conv2d.Conv2D( 'InputConv', input_dim=N_CHANNELS, output_dim=DIM, filter_size=7, inputs=inputs_float, mask_type=('a', N_CHANNELS), he_init=False )
ne = 225961 de = 50 margin = 3 lr = 0.05 maxLen = 15 batchsize = 800 negative_sample_size = 1 itensor5 = T.TensorType("int32", (False,) * 5) dtype = theano.config.floatX """training model""" matrix_ndarray = np.random.uniform(-0.08, 0.08, (ne + 1, de)).astype(dtype) subtract = np.array([1, -1]) idxs = T.itensor4("ids") mask = itensor5("mask") emb = theano.shared(name="embeddings", value=matrix_ndarray) subset = emb[idxs] # mask subset subset_m = subset * mask x = T.sum(subset_m, axis=3) p = T.prod(x, axis=2) s = T.sum(p, axis=2) mul = theano.shared(name="mul", value=subtract) diff = T.dot(s, mul) cost = T.sum(T.maximum(0, margin - diff)) """testing model""" idxs_t = T.imatrix("ids") mask_t = T.itensor3("mask")
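For clarity, `T.dot(s, mul)` with `subtract = [1, -1]` above is just the positive score minus the negative score for each pair, fed into a hinge; a small NumPy sketch of that final step (assuming `s` has shape `(batch, 2)`):

import numpy as np

def margin_ranking_cost(s, margin=3.0):
    diff = s[:, 0] - s[:, 1]                  # equivalent to s.dot([1, -1])
    return np.sum(np.maximum(0.0, margin - diff))

# margin_ranking_cost(np.array([[5.0, 1.0], [2.0, 2.5]])) == 3.5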