def run():
    # params
    dims = 10
    negrate = 1
    batsize = 300
    epochs = 300
    # paths
    datafileprefix = "../../data/nycfilms/"
    dirfwdsuffix = "direct_forward.plustypes.ssd"
    # get the data and split
    dirfwdf = open(datafileprefix + dirfwdsuffix)
    datadf = readdata(dirfwdf)
    traind, validd, testd = datadf.split((70, 15, 15), random=True)
    numents = int(datadf.ix[:, 0].max()) + 1
    print numents
    numrels = int(datadf.ix[:, 1].max()) + 1
    print numrels
    # define model
    inp = Input(T.imatrix())
    eemb = VectorEmbed.indim(numents).outdim(dims).Wreg(l2reg(0.00001))()
    remb = VectorEmbed.indim(numrels).outdim(dims).Wreg(l2reg(0.00001))()
    # for debugging
    eembd = SymTensor(T.fmatrix())
    rembd = SymTensor(T.fmatrix())
    dotp = SymTensor(T.fmatrix())
    out = ((inp[:, 0] >> eemb >> eembd) & (inp[:, 1] >> remb >> rembd)) \
        >> DotProduct() >> dotp >> Tanh()
    # for plotting purposes: relation-to-relation dot product (or relation-type)
    r2rinp = Input(T.imatrix())
    rel2rel = ((r2rinp[:, 0] >> remb) & (r2rinp[:, 1] >> remb)) >> DotProduct()
    outtest = Output(T.fvector())
    loss = (out & outtest) >> HingeLoss()
    trainer = Trainer\
        .batsize(batsize)\
        .epochs(epochs)\
        .onrun(getonrun())\
        .offrun(offrun)\
        .offepoch(getoffepoch(out, rel2rel))\
        .onbatch(getonbatch(negrate, numents, numrels))\
        .optimizer(sgd(lr=1.))\
        .batchtransformer(transbat)
    trainer.loss(loss)
    trainer.train(traind.values, validd.values)\
           .test(testd.values)
    explore(eemb, remb)  # functions for interactive exploration
    embed()
def make_node(self, x, x2, x3, x4, x5):
    # check that the theano version has support for __props__.
    # This next line looks like it has a typo, but it's actually a way to
    # detect that the theano version is sufficiently recent to support
    # the use of __props__.
    assert hasattr(self, '_props'), "Your version of theano is too old to support __props__."
    x = tensor.as_tensor_variable(x)
    x2 = tensor.as_tensor_variable(x2)
    x3 = tensor.as_tensor_variable(x3)
    x4 = tensor.as_tensor_variable(x4)
    x5 = tensor.as_tensor_variable(x5)
    if prm.att_doc:
        if prm.compute_emb:
            td = tensor.itensor4().type()
        else:
            td = tensor.ftensor4().type()
        tm = tensor.ftensor3().type()
    else:
        if prm.compute_emb:
            td = tensor.itensor3().type()
        else:
            td = tensor.ftensor3().type()
        tm = tensor.fmatrix().type()
    return theano.Apply(self, [x, x2, x3, x4, x5],
                        [td, tm, tensor.fmatrix().type(), tensor.ivector().type()])
def __init__(self, word_vec_width, batch_size, num_hidden, learning_rate=0.1):
    self.num_hidden = num_hidden
    self.learning_rate = learning_rate
    self.word_vec_width = word_vec_width
    self.batch_size = batch_size

    self.vocab_mat = T.fmatrix('vocab')
    self.word_onehot = T.fmatrix('word_onehot')
    b = T.fvector('b')
    W = T.fmatrix('W')
    # score of the clean window: sigmoid(W * wordvecs + b)
    # (the original applied b inconsistently between the clean and corrupt
    # branches; both branches use the same form here)
    f = 1 / (1 + T.exp(-(W * (self.word_onehot.dot(self.vocab_mat)) + b)))
    s = T.sum(f)

    self.exec_fn = theano.function(
        [self.word_onehot, b, W, self.vocab_mat], f, allow_input_downcast=True)

    self.word_onehot_c = T.fmatrix('word_onehot_c')
    f_c = 1 / (1 + T.exp(-(W * (self.word_onehot_c.dot(self.vocab_mat)) + b)))
    s_c = T.sum(f_c)

    J = T.largest(0, 1 - s + s_c)  # hinge-style ranking objective
    self.grad = theano.grad(J, [b, W, self.vocab_mat])

    self.grad_fn = theano.function(
        [self.word_onehot, self.word_onehot_c, b, W, self.vocab_mat],
        self.grad, allow_input_downcast=True)
def __init__(self, input_layers, *args, **kwargs):
    super(LogLossObjective, self).__init__(input_layers, *args, **kwargs)
    self.input_systole = input_layers["systole:onehot"]
    self.input_diastole = input_layers["diastole:onehot"]
    self.target_vars["systole:onehot"] = T.fmatrix("systole_target_onehot")
    self.target_vars["diastole:onehot"] = T.fmatrix("diastole_target_onehot")
def test_pickle_unpickle_without_reoptimization():
    mode = theano.config.mode
    if mode in ["DEBUG_MODE", "DebugMode"]:
        mode = "FAST_RUN"
    x1 = T.fmatrix('x1')
    x2 = T.fmatrix('x2')
    x3 = theano.shared(numpy.ones((10, 10), dtype=floatX))
    x4 = theano.shared(numpy.ones((10, 10), dtype=floatX))
    y = T.sum(T.sum(T.sum(x1 ** 2 + x2) + x3) + x4)

    updates = OrderedDict()
    updates[x3] = x3 + 1
    updates[x4] = x4 + 1
    f = theano.function([x1, x2], y, updates=updates, mode=mode)

    # now pickle the compiled theano fn
    string_pkl = pickle.dumps(f, -1)

    # compute f value
    in1 = numpy.ones((10, 10), dtype=floatX)
    in2 = numpy.ones((10, 10), dtype=floatX)

    # test unpickle without optimization
    default = theano.config.reoptimize_unpickled_function
    try:
        # the default is True
        theano.config.reoptimize_unpickled_function = False
        f_ = pickle.loads(string_pkl)
        assert f(in1, in2) == f_(in1, in2)
    finally:
        theano.config.reoptimize_unpickled_function = default
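# A minimal companion sketch (not part of the original test suite): the
# symmetric case, where the unpickled function IS reoptimized; the two
# compiled functions should still agree. It assumes the same imports as the
# test above (theano, numpy, pickle, floatX).
def sketch_pickle_unpickle_with_reoptimization():
    x = T.fmatrix('x')
    f = theano.function([x], x ** 2)
    string_pkl = pickle.dumps(f, -1)
    default = theano.config.reoptimize_unpickled_function
    try:
        theano.config.reoptimize_unpickled_function = True
        f_ = pickle.loads(string_pkl)
        v = numpy.ones((3, 3), dtype=floatX)
        assert numpy.allclose(f(v), f_(v))
    finally:
        theano.config.reoptimize_unpickled_function = default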
def cmp(a_shp, b_shp):
    a0 = my_rand(*a_shp)
    a = tcn.shared_constructor(a0, 'a')
    b = tensor.fmatrix('b')
    c = tensor.fmatrix('c')

    f = pfunc([b, c], [], updates=[(a, tensor.dot(a, b) + tensor.exp(c))],
              mode=mode_with_gpu)
    assert any([node.op == tcn.blas.gpu_gemm_inplace
                for node in f.maker.fgraph.toposort()])

    bval = my_rand(*b_shp)
    cval = my_rand(a_shp[0], b_shp[1])
    f(bval, cval)

    assert numpy.allclose(numpy.dot(a0, bval) + numpy.exp(cval), a.get_value())

    # Try with a matrix equal to a0, but with strides in both dims
    a.set_value(a0)
    a.set_value(
        a.get_value(borrow=True, return_internal_type=True)[::-1, ::-1],
        borrow=True)
    f(bval, cval)
def cmp(a_shp, b_shp):
    a = tensor.fmatrix()
    b = tensor.fmatrix()
    scalar = tensor.fscalar()
    av = my_rand(*a_shp)
    bv = my_rand(*b_shp)

    f = theano.function(
        [a, b], tensor.dot(a, b) * numpy.asarray(4, 'float32'),
        mode=mode_with_gpu)
    f2 = theano.function(
        [a, b], tensor.dot(a, b) * numpy.asarray(4, 'float32'))
    t = f.maker.fgraph.toposort()
    assert len(t) == 4
    assert isinstance(t[0].op, tcn.GpuFromHost)
    assert isinstance(t[1].op, tcn.GpuFromHost)
    assert isinstance(t[2].op, tcn.blas.GpuDot22Scalar)
    assert isinstance(t[3].op, tcn.HostFromGpu)
    assert numpy.allclose(f(av, bv), f2(av, bv))

    f = theano.function([a, b, scalar], tensor.dot(a, b) * scalar,
                        mode=mode_with_gpu)
    f2 = theano.function([a, b, scalar], tensor.dot(a, b) * scalar)
    t = f.maker.fgraph.toposort()
    assert len(t) == 4
    assert isinstance(t[0].op, tcn.GpuFromHost)
    assert isinstance(t[1].op, tcn.GpuFromHost)
    assert isinstance(t[2].op, tcn.blas.GpuDot22Scalar)
    assert isinstance(t[3].op, tcn.HostFromGpu)
    assert numpy.allclose(f(av, bv, 0.5), f2(av, bv, 0.5))
def test_gpujoin_gpualloc():
    a = T.fmatrix('a')
    a_val = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
    b = T.fmatrix('b')
    b_val = numpy.asarray(numpy.random.rand(3, 5), dtype='float32')

    f = theano.function([a, b], T.join(0, T.zeros_like(a), T.ones_like(b)) + 4,
                        mode=mode_without_gpu)
    f_gpu = theano.function([a, b], T.join(0, T.zeros_like(a), T.ones_like(b)),
                            mode=mode_with_gpu)
    f_gpu2 = theano.function([a, b], T.join(0, T.zeros_like(a), T.ones_like(b)) + 4,
                             mode=mode_with_gpu)

    assert sum([node.op == T.alloc for node in f.maker.env.toposort()]) == 2
    assert sum([node.op == T.join for node in f.maker.env.toposort()]) == 1
    assert sum([node.op == B.gpu_alloc
                for node in f_gpu.maker.env.toposort()]) == 2
    assert sum([node.op == B.gpu_join
                for node in f_gpu.maker.env.toposort()]) == 1
    assert sum([node.op == B.gpu_alloc
                for node in f_gpu2.maker.env.toposort()]) == 2
    assert sum([node.op == B.gpu_join
                for node in f_gpu2.maker.env.toposort()]) == 1
    assert numpy.allclose(f(a_val, b_val), f_gpu2(a_val, b_val))
def test_local_gpu_elemwise_0():
    """
    Test local_gpu_elemwise_0 when there is a dtype upcastable to float32
    """
    a = tensor.bmatrix()
    b = tensor.fmatrix()
    c = tensor.fmatrix()

    a_v = (numpy.random.rand(4, 5) * 10).astype("int8")
    b_v = (numpy.random.rand(4, 5) * 10).astype("float32")
    c_v = (numpy.random.rand(4, 5) * 10).astype("float32")

    # Due to optimization order, this composite is created when all
    # the ops are on the gpu.
    f = theano.function([a, b, c], [a + b + c], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
    f(a_v, b_v, c_v)

    # Now test with the composite already on the cpu before we move it
    # to the gpu
    a_s = theano.scalar.int8()
    b_s = theano.scalar.float32()
    c_s = theano.scalar.float32()
    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s + c_s])
    out_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], [out_op(a, b, c)], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
    f(a_v, b_v, c_v)
def show_patches_on_frames(ims, locations_, scales_,
                           image_shape=(100, 100), patch_shape=(16, 16)):
    hyperparameters = {}
    hyperparameters["cutoff"] = 3
    hyperparameters["batched_window"] = True

    location = T.fmatrix()
    scale = T.fmatrix()
    x = T.fvector()

    cropper = LocallySoftRectangularCropper(
        patch_shape=patch_shape,
        hyperparameters=hyperparameters,
        kernel=Gaussian())

    patch = cropper.apply(
        x.reshape((1, 1,) + image_shape),
        np.array([list(image_shape)]),
        location,
        scale)
    get_patch = theano.function([x, location, scale], patch,
                                allow_input_downcast=True)

    final_shape = (image_shape[0], image_shape[0] + patch_shape[0] + 5)
    ret = np.ones((ims.shape[0],) + final_shape + (3,), dtype=np.float32)
    for i in range(ims.shape[0]):
        im = ims[i]
        location_ = locations_[i]
        scale_ = scales_[i]
        patch_on_frame = show_patch_on_frame(im, location_, scale_)
        ret[i, :, :image_shape[1], :] = patch_on_frame
        ret[i, -patch_shape[0]:, image_shape[1] + 5:, :] = to_rgb1(
            get_patch(im, [location_], [scale_])[0, 0])
    return ret
def test_elemwise_composite_float64():
    # Test that we don't fuse a composite elemwise that has a float64
    # somewhere inside: nvcc by default downcasts float64 to float32.
    # We would need to tell it not to do so, but that is possible only
    # on some devices.
    a = tensor.fmatrix()
    b = tensor.fmatrix()
    av = theano._asarray(numpy.random.rand(4, 4), dtype='float32')
    bv = numpy.ones((4, 4), dtype='float32')

    def get_all_basic_scalar(composite_op):
        l = []
        for i in composite_op.env.toposort():
            if isinstance(i, theano.scalar.Composite):
                l += get_all_basic_scalar(i)
            else:
                l.append(i)
        return l

    for mode in [mode_with_gpu,
                 mode_with_gpu.excluding('gpu_after_fusion'),
                 mode_with_gpu.excluding('elemwise_fusion')]:
        f = pfunc([a, b],
                  tensor.cast(tensor.lt(tensor.cast(a, 'float64') ** 2, b),
                              'float32'),
                  mode=mode)
        out = f(av, bv)
        assert numpy.all(out == ((av ** 2) < bv))
        for node in f.maker.env.toposort():
            if isinstance(node.op, cuda.GpuElemwise):
                if isinstance(node.op.scalar_op, theano.scalar.Composite):
                    scals = get_all_basic_scalar(node.op.scalar_op)
                    for s in scals:
                        assert not any([i.type.dtype == 'float64'
                                        for i in s.inputs + s.outputs])
def __init__(self, name, input_neurons, output_neurons):
    self.input_neurons = input_neurons
    self.output_neurons = output_neurons
    self.name = name

    # Initialize theano variables:
    self.W_forget_theano = T.fmatrix(self.name + '_forget_weight')
    self.W_input_theano = T.fmatrix(self.name + '_input_weight')
    self.W_candidate_theano = T.fmatrix(self.name + '_candidate_weight')
    self.W_output_theano = T.fmatrix(self.name + '_output_weight')

    # Initialize python variables:
    high_init = np.sqrt(6) / np.sqrt(self.input_neurons + 2 * self.output_neurons)
    low_init = -high_init
    s = (self.output_neurons, self.input_neurons + self.output_neurons + 1)
    self.W_forget = np.random.uniform(low=low_init, high=high_init, size=s).astype(np.float32)
    self.W_input = np.random.uniform(low=low_init, high=high_init, size=s).astype(np.float32)
    self.W_candidate = np.random.uniform(low=low_init, high=high_init, size=s).astype(np.float32)
    self.W_output = np.random.uniform(low=low_init, high=high_init, size=s).astype(np.float32)

    # Initialize forget bias to one:
    self.W_forget[-1] = np.ones_like(self.W_forget[-1], dtype=np.float32)
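# Hedged illustration (not from the original class): assuming the gate
# matrices above left-multiply a concatenated [x; h_prev; 1] vector, which is
# what the shape (output_neurons, input_neurons + output_neurons + 1)
# suggests, a single forget-gate activation would look like this.
import numpy as np

def sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

n_in, n_out = 3, 2
W_forget = np.random.uniform(-0.5, 0.5, (n_out, n_in + n_out + 1)).astype(np.float32)
x = np.random.rand(n_in).astype(np.float32)
h_prev = np.zeros(n_out, dtype=np.float32)
vec = np.concatenate([x, h_prev, [1.0]]).astype(np.float32)  # trailing 1 picks up the bias column
forget_gate = sigmoid(W_forget.dot(vec))
print forget_gate  # one activation per output neuron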
def __theano_build__(self):
    params = self.params
    param_names = self.param_names
    hidden_dim = self.hidden_dim

    x1 = T.imatrix('x1')            # first sentence
    x2 = T.imatrix('x2')            # second sentence
    x1_mask = T.fmatrix('x1_mask')  # mask
    x2_mask = T.fmatrix('x2_mask')
    y = T.ivector('y')              # label
    y_c = T.ivector('y_c')          # class weights

    # Embedding words
    _E1 = params["E"].dot(params["W"][0]) + params["B"][0]
    _E2 = params["E"].dot(params["W"][1]) + params["B"][1]
    statex1 = _E1[x1.flatten(), :].reshape([x1.shape[0], x1.shape[1], hidden_dim])
    statex2 = _E2[x2.flatten(), :].reshape([x2.shape[0], x2.shape[1], hidden_dim])

    def rnn_cell(x, mx, ph, Wh):
        h = T.tanh(ph.dot(Wh) + x)
        h = mx[:, None] * h + (1 - mx[:, None]) * ph
        return [h]

    [h1], updates = theano.scan(
        fn=rnn_cell,
        sequences=[statex1, x1_mask],
        truncate_gradient=self.truncate,
        outputs_info=[dict(initial=T.zeros([self.batch_size, self.hidden_dim]))],
        non_sequences=params["W"][2])

    [h2], updates = theano.scan(
        fn=rnn_cell,
        sequences=[statex2, x2_mask],
        truncate_gradient=self.truncate,
        outputs_info=[dict(initial=h1[-1])],
        non_sequences=params["W"][3])

    # predict
    _s = T.nnet.softmax(h1[-1].dot(params["lrW"][0]) +
                        h2[-1].dot(params["lrW"][1]) + params["lrb"])
    _p = T.argmax(_s, axis=1)
    _c = T.nnet.categorical_crossentropy(_s, y)
    _c = T.sum(_c * y_c)
    _l = T.sum(params["lrW"] ** 2)
    _cost = _c + 0.01 * _l

    # SGD parameters
    learning_rate = T.scalar('learning_rate')
    decay = T.scalar('decay')

    # Gradients and updates
    _grads, _updates = rms_prop(_cost, param_names, params, learning_rate, decay)

    # Assign functions
    self.bptt = theano.function([x1, x2, x1_mask, x2_mask, y, y_c], _grads)
    self.loss = theano.function([x1, x2, x1_mask, x2_mask, y, y_c], _c)
    self.weights = theano.function([x1, x2, x1_mask, x2_mask], _s)
    self.predictions = theano.function([x1, x2, x1_mask, x2_mask], _p)
    self.sgd_step = theano.function(
        [x1, x2, x1_mask, x2_mask, y, y_c, learning_rate, decay],
        updates=_updates)
def setup_theano(self):
    self.vocab_mat = T.fmatrix('vocab')
    self.sample = T.fmatrix('sample')
    b = T.fvector('b')
    W = T.fmatrix('W')
    f = self.transform_function(
        W, b, self.wordvec_transform(self.sample, self.vocab_mat))
    s = T.sum(f)

    self.corrupt_sample = T.fmatrix('corrupt-sample')
    f_corrupt = self.transform_function(
        W, b, self.wordvec_transform(self.corrupt_sample, self.vocab_mat))
    s_corrupt = T.sum(f_corrupt)

    J = T.largest(0, 1 - s + s_corrupt)
    self.grad = theano.grad(J, [b, W, self.vocab_mat])

    self.grad_fn = theano.function(
        [self.sample, self.corrupt_sample, b, W, self.vocab_mat],
        self.grad, allow_input_downcast=True)
    self.exec_fn = theano.function([self.sample, b, W, self.vocab_mat],
                                   f, allow_input_downcast=True)
def build_loss_graph(self, saved_graph=None):
    print("Building loss graph...")
    for l in self.layers:
        l.set_training(False)

    Sentence = T.fmatrix('Sentence')
    Characters = T.ftensor3('Characters')
    WordLengths = T.ivector('WordLengths')
    GoldPredictions = T.fmatrix('GoldPredictions')

    weight_list = self.get_theano_weight_list()

    if self.feature_mode == 'character':
        result = self.theano_sentence_loss(Characters, WordLengths, GoldPredictions)
        input_list = [Characters, WordLengths, GoldPredictions] + list(weight_list)
    elif self.feature_mode == 'sentence':
        result = self.theano_sentence_loss(Sentence, GoldPredictions)
        input_list = [Sentence, GoldPredictions] + list(weight_list)
    elif self.feature_mode == 'both':
        result = self.theano_sentence_loss(Sentence, Characters, WordLengths, GoldPredictions)
        input_list = [Sentence, Characters, WordLengths, GoldPredictions] + list(weight_list)

    cgraph = theano.function(inputs=input_list, outputs=result,
                             mode='FAST_RUN', allow_input_downcast=True)
    print("Done building graph.")
    return cgraph
def _training_DNN(self):
    trX, trY, self.missing_filename_list, = read_features(
        self.test_number, self.n_input_f, self.n_output_f)
    trX = trX[:, 1:self.n_input_f]
    trY = trY[:, 1:self.n_output_f]
    print trX.shape
    print trY.shape
    print self.nloop, self.n_hidden_layer, self.n_input_f, self.n_hidden_f, self.n_output_f

    X = T.fmatrix()
    Y = T.fmatrix()

    py_x = self._model(X, self.params, self.bias)
    y_x = py_x

    cost = T.mean(T.sqr(py_x - Y))
    updates = self._sgd(cost, self.params, self.bias)

    train = theano.function(inputs=[X, Y], outputs=cost, updates=updates,
                            allow_input_downcast=True)
    self.predict = theano.function(inputs=[X], outputs=y_x,
                                   allow_input_downcast=True)

    for i in range(self.nloop, self.nloop + 0):
        print i
        # logging.debug('loop' + str(i))
        error_total = 0
        arr_X_Y = zip(range(0, len(trX), 128), range(128, len(trX), 128))
        for start, end in arr_X_Y:
            cost = train(trX[start:end], trY[start:end])
            error_total += cost
            # print cost
        last_element = arr_X_Y[len(arr_X_Y) - 1][0]
        if last_element < len(trX):
            cost = train(trX[last_element:len(trX)], trY[last_element:len(trY)])
            error_total += cost
        print error_total / len(trX)
        save_weight_info(
            self.filename, i, self.n_hidden_layer, self.n_input_f,
            self.n_hidden_f, self.n_output_f, self.params, error_total,
            self.bias)
        self.id_file = 1 - self.id_file
        self.filename = self.weight_folder + 'id_' + str(self.id_file) + ".txt"
def test_graph_opt_caching():
    opt_db_file = os.path.join(theano.config.compiledir, 'optimized_graphs.pkl')
    if os.path.exists(opt_db_file):
        os.remove(opt_db_file)

    mode = theano.config.mode
    if mode in ["DEBUG_MODE", "DebugMode"]:
        mode = "FAST_RUN"
    default = theano.config.cache_optimizations
    try:
        theano.config.cache_optimizations = True
        a = T.fmatrix('a')
        b = T.fmatrix('b')
        c = theano.shared(np.ones((10, 10), dtype=floatX))
        d = theano.shared(np.ones((10, 10), dtype=floatX))
        e = T.sum(T.sum(T.sum(a ** 2 + b) + c) + d)
        f1 = theano.function([a, b], e, mode=mode)

        m = T.fmatrix('x1')
        n = T.fmatrix('x2')
        p = theano.shared(np.ones((10, 10), dtype=floatX))
        q = theano.shared(np.ones((10, 10), dtype=floatX))
        j = T.sum(T.sum(T.sum(m ** 2 + n) + p) + q)
        f2 = theano.function([m, n], j, mode=mode)

        in1 = np.ones((10, 10), dtype=floatX)
        in2 = np.ones((10, 10), dtype=floatX)
        assert f1(in1, in2) == f2(in1, in2)
    finally:
        theano.config.cache_optimizations = default
def multiclass_logistic_regr(mnist):
    def floatX(X):
        return np.asarray(X, dtype=theano.config.floatX)

    def init_weights(shape):
        return theano.shared(floatX(np.random.randn(*shape) * 0.01))

    def model(X, w):
        return T.nnet.softmax(T.dot(X, w))

    # each image is 28x28
    # trX: 60,000x784
    # trY: 60,000x10
    trX, teX, trY, teY = mnist(onehot=True)

    X = T.fmatrix()
    Y = T.fmatrix()
    w = init_weights([784, 10])

    py_x = model(X, w)
    y_pred = T.argmax(py_x, axis=1)

    cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y))
    gradient = T.grad(cost=cost, wrt=w)
    update = [[w, w - 0.05 * gradient]]

    train = theano.function(inputs=[X, Y], outputs=cost, updates=update,
                            allow_input_downcast=True)
    predict = theano.function(inputs=[X], outputs=y_pred,
                              allow_input_downcast=True)

    mbsize = 128
    for start, end in zip(xrange(0, len(trX), mbsize),
                          xrange(mbsize, len(trX), mbsize)):
        c = train(trX[start:end], trY[start:end])
    print np.mean(np.argmax(teY, axis=1) == predict(teX))
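# Hedged smoke test for the function above: it expects an `mnist` loader, so
# this stand-in (a made-up helper, not the real dataset) generates random
# arrays with the same shapes purely to exercise the compiled graph.
def fake_mnist(onehot=True):
    import numpy as np
    trX = np.random.rand(256, 784).astype('float32')
    teX = np.random.rand(64, 784).astype('float32')
    trY = np.eye(10)[np.random.randint(0, 10, 256)].astype('float32')
    teY = np.eye(10)[np.random.randint(0, 10, 64)].astype('float32')
    return trX, teX, trY, teY

multiclass_logistic_regr(fake_mnist)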
def build_ann(self, number_of_input_nodes, no, nr_of_hidden_layers,
              nr_of_nodes_in_layers, act_functions):
    weights = []
    a = theano.shared(np.random.uniform(
        low=-.1, high=.1,
        size=(number_of_input_nodes, nr_of_nodes_in_layers[0])))
    weights.append(a)
    for i in range(1, nr_of_hidden_layers):
        weights.append(theano.shared(np.random.uniform(
            low=-.1, high=.1,
            size=(nr_of_nodes_in_layers[i - 1], nr_of_nodes_in_layers[i]))))
    weights.append(theano.shared(np.random.uniform(
        low=-.1, high=.1, size=(nr_of_nodes_in_layers[-1], no))))

    input = T.fmatrix()
    target = T.fmatrix()

    layers = []
    # First hidden layer
    self.add_layer_activation_function(act_functions[0], layers, input, weights[0])
    # Next layers
    for j in range(nr_of_hidden_layers):
        self.add_layer_activation_function(act_functions[j + 1], layers,
                                           layers[j], weights[j + 1])

    error = T.sum(pow((target - layers[-1]), 2))  # Sum of squared errors
    params = [w for w in weights]
    gradients = T.grad(error, params)
    backprops = self.backprop_acts(params, gradients)

    # self.get_x1 = theano.function(inputs=[input, target], outputs=error, allow_input_downcast=True)
    self.trainer = theano.function(inputs=[input, target], outputs=error,
                                   updates=backprops, allow_input_downcast=True)
    self.predictor = theano.function(inputs=[input], outputs=layers[-1],
                                     allow_input_downcast=True)
def create_encoder_decoder_func(layers, apply_updates=False):
    X = T.fmatrix('X')
    X_batch = T.fmatrix('X_batch')

    X_hat = get_output(layers['l_decoder_out'], X, deterministic=False)

    # reconstruction loss
    encoder_decoder_loss = T.mean(
        T.mean(T.sqr(X - X_hat), axis=1)
    )

    if apply_updates:
        # all layers that participate in the forward pass should be updated
        encoder_decoder_params = get_all_params(
            layers['l_decoder_out'], trainable=True)
        encoder_decoder_updates = nesterov_momentum(
            encoder_decoder_loss, encoder_decoder_params, 0.01, 0.9)
    else:
        encoder_decoder_updates = None

    encoder_decoder_func = theano.function(
        inputs=[theano.In(X_batch)],
        outputs=encoder_decoder_loss,
        updates=encoder_decoder_updates,
        givens={
            X: X_batch,
        },
    )
    return encoder_decoder_func
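# Hedged usage sketch for the helper above. It assumes the module-level
# imports the helper relies on (get_output, get_all_params from
# lasagne.layers; nesterov_momentum from lasagne.updates; theano/T) and
# builds a toy 8-4-8 autoencoder; the layer sizes and dict key here are
# made up for illustration only.
import numpy as np
from lasagne.layers import InputLayer, DenseLayer

l_in = InputLayer((None, 8))
l_hidden = DenseLayer(l_in, 4)                      # "encoder"
l_out = DenseLayer(l_hidden, 8, nonlinearity=None)  # "decoder"
toy_layers = {'l_decoder_out': l_out}

train_step = create_encoder_decoder_func(toy_layers, apply_updates=True)
X_val = np.random.rand(5, 8).astype('float32')
print train_step(X_val)  # mean squared reconstruction error for the batch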
def main_train():
    trX, teX, trY, teY = mnist(onehot=True)

    X = T.fmatrix()
    Y = T.fmatrix()

    w_h = init_weights((784, 625))
    w_h2 = init_weights((625, 625))
    w_o = init_weights((625, 10))
    params = [w_h, w_h2, w_o]

    noise_h, noise_h2, noise_py_x = model(X, params, 0.2, 0.5)
    h, h2, py_x = model(X, params, 0., 0.)
    y_x = T.argmax(py_x, axis=1)

    cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
    updates = RMSprop(cost, params, lr=0.001)

    train = theano.function(inputs=[X, Y], outputs=cost, updates=updates,
                            allow_input_downcast=True)
    predict = theano.function(inputs=[X], outputs=y_x,
                              allow_input_downcast=True)

    for i in range(100):
        for start, end in zip(range(0, len(trX), 128),
                              range(128, len(trX), 128)):
            cost = train(trX[start:end], trY[start:end])
        print np.mean(np.argmax(teY, axis=1) == predict(teX))
        if i % 10 == 0:
            name = 'media/model/modnet-{0}.model'.format(str(i))
            save_model(name, params)
    name = 'media/model/modnet-final.model'
    save_model(name, params)
def __init__(self, dimX, dimZ, hls, acts):
    self.dimZ = dimZ
    self.f = MLP(dimX, dimZ, [1200], [tanh, tanh])
    self.g = MLP(dimZ, dimX, [1200], [tanh, sigm])
    self.generator = MLP(dimZ, dimX, [1200, 1200], [tanh, tanh, sigm])
    self.params = self.f.params + self.g.params + self.generator.params

    x = T.fmatrix('x')
    lr = T.scalar('lr')
    noise = T.scalar('noise')

    z = self.f(2 * x - 1)
    rx = self.g(z)
    cost_recons = ce(rx, x).mean(axis=1).mean(axis=0)

    rand = rng_theano.uniform(low=0, high=1, size=z.shape)
    nz = self.nearest_neighbour_of_in(rand, z)  # nn of rand in z
    xnz = self.g(nz)
    rxx = self.generator(rand)
    cost_gen = ce(rxx, xnz).mean(axis=1).mean(axis=0)

    grads_f = T.grad(cost_recons, self.f.params)
    grads_g = T.grad(cost_recons, self.g.params)
    grads_gen = T.grad(cost_gen, self.generator.params)
    grads = grads_f + grads_g + grads_gen
    updates = map(lambda (param, grad): (param, param - lr * grad),
                  zip(self.params, grads))

    nnd = self.nearest_neighbour_distances(z)
    self.train_fn = theano.function(
        [x, lr], [cost_recons, cost_gen, nnd.mean(), nnd.std()],
        updates=updates)

    z = T.fmatrix('z')
    self.sample_fn = theano.function([z], self.g(z), allow_input_downcast=True)
    self.infer_fn = theano.function([x], self.f(2 * x - 1), allow_input_downcast=True)
    # sample from the generator MLP (sample_fn above decodes through g)
    self.generator_fn = theano.function([z], self.generator(z), allow_input_downcast=True)
def test_does_not_crash():
    Z = T.ftensor3('Z')
    W_re = T.fmatrix('W_re')
    W_att_in = T.fmatrix('W_att_in')
    c = T.fmatrix('c')    # initial state
    y0 = T.fmatrix('y0')  # initial activation
    i = T.matrix('i', dtype='int8')
    Y, H, d = LSTMCustomTestOpNoInplaceInstance(Z, c, y0, i, W_re, W_att_in)

    f = theano.function(inputs=[Z, c, y0, i, W_re, W_att_in], outputs=Y)

    n_T = 5
    n_batch = 4
    n_inp_dim = 3
    n_cells = 8
    numpy.random.seed(1234)
    Z_val = numpy.random.ranf((n_T, n_batch, 4 * n_cells)).astype('float32')
    W_re_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32')
    W_att_in_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32')
    c_val = numpy.random.ranf((n_batch, n_cells)).astype('float32')
    y0_val = numpy.random.ranf((n_batch, n_cells)).astype('float32')
    # i_val = numpy.ones((n_T, n_batch), dtype='int8')
    i_val = numpy.array([[1, 1, 1, 1, 1], [0, 0, 1, 1, 1],
                         [0, 0, 1, 1, 1], [0, 0, 1, 0, 0]], dtype='int8').T

    Y_val = numpy.asarray(f(Z_val, c_val, y0_val, i_val, W_re_val, W_att_in_val))
    # print Y_val
    print("success")
def predict_df(self, input_df=None):
    f = open('/tmp/obj.save', 'rb')
    neural_model = cPickle.load(f)
    f.close()

    X, y = neural_model['enc'].transform(input_df)
    # X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    trX, teX, Y_train, Y_test = train_test_split(X, y, test_size=0.33,
                                                 random_state=42)
    trY = one_hot(Y_train, n=2)
    teY = one_hot(Y_test, n=2)

    X = T.fmatrix()
    Y = T.fmatrix()

    # prediction only: the loaded weights are used as-is, so no cost,
    # gradient, or update graph is needed here
    h, h2, py_x = model(X, neural_model['w_h'], neural_model['w_h2'],
                        neural_model['w_o'], 0., 0.)
    y_pred = T.argmax(py_x, axis=1)

    predict = theano.function(inputs=[X], outputs=y_pred,
                              allow_input_downcast=True)

    print('Loaded precision:', np.mean(np.argmax(teY, axis=1) == predict(teX)))
    return predict(teX)
def train(self, trX, teX, trY, teY, plot=True, epochs=TIMES,
          shortcard=SHORTCARD, speed=SPEED, drop_input=DROP_INPUT,
          drop_hidden=DROP_HIDDEN, step_show=STEP_SHOW, rho=RHO,
          epsilon=EPSILON):
    X = T.fmatrix()
    Y = T.fmatrix()

    train_set_n = len(trY)
    test_set_n = len(teY)

    accuracy_arr = []
    diff_arr = []
    i_arr = []

    noise_py_x = self._model(X, drop_input, drop_hidden)
    cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
    updates = self._RMSprop(cost, lr=speed, rho=rho, epsilon=epsilon)
    train = theano.function(inputs=[X, Y], outputs=cost, updates=updates,
                            allow_input_downcast=True)

    for i in range(epochs):
        for start, end in zip(range(0, train_set_n, shortcard),
                              range(shortcard, train_set_n, shortcard)):
            cost = train(trX[start:end], trY[start:end])
        if i % step_show == 0:
            acc = np.mean(np.argmax(teY, axis=1) == self.predict(teX))
            accuracy_arr.append(acc)
            di = self.get_diff(teX, teY)
            diff_arr.append(di)
            i_arr.append(i)
            print "{0} {1:.3f}% {2:.1f}".format(i, acc * 100, di)

    if plot:
        self._name = ("Epochs: {0}, Shortcard: {1}, Speed: {2:.5f}\n"
                      " Structure: {3}\n Train: {4}, Test: {5}").format(
            epochs, shortcard, speed, self._struct, train_set_n, test_set_n)
        self._name_f = ("epochs_{0}_shortcard_{1}_speed_{2:.5f}_structure_{3}"
                        "_train_{4}_test_{5}").format(
            epochs, shortcard, speed, self._struct, train_set_n, test_set_n)
        self._plot(i_arr, accuracy_arr, diff_arr)
def get_adadelta_trainer(self, debug=False):
    batch_x1 = T.fmatrix('batch_x1')
    batch_x2 = T.fmatrix('batch_x2')
    batch_y = T.ivector('batch_y')

    # compute the gradients with respect to the model parameters
    cost = self.cost
    gparams = T.grad(cost, self.params)

    # compute list of weights updates
    updates = OrderedDict()
    for accugrad, accudelta, param, gparam in zip(
            self._accugrads, self._accudeltas, self.params, gparams):
        # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
        agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam
        dx = - T.sqrt((accudelta + self._eps) / (agrad + self._eps)) * gparam
        updates[accudelta] = self._rho * accudelta + (1 - self._rho) * dx * dx
        updates[param] = param + dx
        updates[accugrad] = agrad

    outputs = cost
    if debug:
        outputs = [cost] + self.params + gparams + \
            [updates[param] for param in self.params]

    train_fn = theano.function(inputs=[theano.Param(batch_x1),
                                       theano.Param(batch_x2),
                                       theano.Param(batch_y)],
                               outputs=outputs,
                               updates=updates,
                               givens={self.x1: batch_x1,
                                       self.x2: batch_x2,
                                       self.y: batch_y})
    return train_fn
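# A numpy transcription of the Adadelta step above (Zeiler 2012, Algorithm 1)
# on the toy objective param**2, to make the accumulator algebra concrete;
# rho and eps are illustrative values, not the class's _rho/_eps.
import numpy as np

rho, eps = 0.95, 1e-6
accugrad, accudelta, param = 0.0, 0.0, 1.0
for _ in range(5):
    gparam = 2.0 * param           # gradient of param**2
    agrad = rho * accugrad + (1 - rho) * gparam * gparam
    dx = -np.sqrt((accudelta + eps) / (agrad + eps)) * gparam
    accudelta = rho * accudelta + (1 - rho) * dx * dx
    param = param + dx
    accugrad = agrad
    print param                    # steps shrink as the accumulators warm up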
def test_pycuda_elemwise_kernel():
    x = T.fmatrix('x')
    y = T.fmatrix('y')
    f = theano.function([x, y], x + y, mode=mode_with_gpu)
    print f.maker.env.toposort()
    f2 = theano.function([x, y], x + y,
                         mode=mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel"))
    print f2.maker.env.toposort()

    assert any([isinstance(node.op, theano.sandbox.cuda.GpuElemwise)
                for node in f.maker.env.toposort()])
    assert any([isinstance(node.op, PycudaElemwiseKernelOp)
                for node in f2.maker.env.toposort()])

    val1 = numpy.asarray(numpy.random.rand(5, 5), dtype='float32')
    val2 = numpy.asarray(numpy.random.rand(5, 5), dtype='float32')
    # val1 = numpy.ones((5, 5))
    # val2 = numpy.arange(25).reshape(5, 5)
    assert (f(val1, val2) == f2(val1, val2)).all()
    print f(val1, val2)
    print f2(val1, val2)

    x3 = T.ftensor3('x')
    y3 = T.ftensor3('y')
    z3 = T.ftensor3('z')
    f4 = theano.function([x3, y3, z3], x3 * y3 + z3,
                         mode=mode_with_gpu.including("local_pycuda_gpu_elemwise_kernel"))
    print f4.maker.env.toposort()
    assert any([isinstance(node.op, PycudaElemwiseKernelOp)
                for node in f4.maker.env.toposort()])
    val1 = numpy.random.rand(2, 2, 2)
    print val1
    print f4(val1, val1, val1)
    assert numpy.allclose(f4(val1, val1, val1), val1 * val1 + val1)
def test_fwd_pass_compatible_with_OpLSTM():
    Z = T.ftensor3('Z')
    W_re = T.fmatrix('W_re')
    W_att_in = T.fmatrix('W_att_in')
    c = T.fmatrix('c')    # initial state
    y0 = T.fmatrix('y0')  # initial activation
    i = T.matrix('i', dtype='int8')

    Y, H, d = LSTMCustomTestOpNoInplaceInstance(Z, c, y0, i, W_re, W_att_in)
    W_re_modified = W_re + W_att_in
    Z_modified = T.inc_subtensor(Z[0], T.dot(y0, W_re_modified))
    Y2, H2, d2 = LSTMOpInstance(Z_modified, W_re_modified, c, i)

    f = theano.function(inputs=[Z, c, y0, i, W_re, W_att_in], outputs=Y)
    g = theano.function(inputs=[Z, W_re, c, y0, i, W_att_in], outputs=Y2)

    n_T = 5
    n_batch = 4
    n_inp_dim = 3
    n_cells = 8
    numpy.random.seed(1234)
    Z_val = numpy.random.ranf((n_T, n_batch, 4 * n_cells)).astype('float32')
    W_re_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32')
    W_att_in_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32')
    c_val = numpy.random.ranf((n_batch, n_cells)).astype('float32')
    y0_val = numpy.random.ranf((n_batch, n_cells)).astype('float32')
    # i_val = numpy.ones((n_T, n_batch), dtype='int8')
    i_val = numpy.array([[1, 1, 1, 1, 1], [0, 0, 1, 1, 1],
                         [0, 0, 1, 1, 1], [0, 0, 1, 0, 0]], dtype='int8').T

    Y_val = numpy.asarray(f(Z_val, c_val, y0_val, i_val, W_re_val, W_att_in_val))
    Y2_val = numpy.asarray(g(Z_val, W_re_val, c_val, y0_val, i_val, W_att_in_val))
    assert numpy.allclose(Y_val, Y2_val)
    print("success")
def __init__(self, input_layers, *args, **kwargs):
    super(KaggleObjective, self).__init__(input_layers, *args, **kwargs)
    self.input_systole = input_layers["systole"]
    self.input_diastole = input_layers["diastole"]
    self.target_vars["systole"] = T.fmatrix("systole_target")
    self.target_vars["diastole"] = T.fmatrix("diastole_target")
def get_SGD_trainer(self, debug=False):
    """ Returns a plain SGD minibatch trainer with the learning rate as a parameter. """
    batch_x1 = T.fmatrix('batch_x1')
    batch_x2 = T.fmatrix('batch_x2')
    batch_y = T.ivector('batch_y')
    learning_rate = T.fscalar('lr')  # learning rate to use

    # compute the gradients with respect to the model parameters, using
    # mean_cost so that the learning rate is not too dependent on the batch size
    cost = self.mean_cos_sim_cost
    gparams = T.grad(cost, self.params)

    # compute list of weights updates
    updates = OrderedDict()
    for param, gparam in zip(self.params, gparams):
        updates[param] = param - gparam * learning_rate

    outputs = cost
    if debug:
        outputs = [cost] + self.params + gparams + \
            [updates[param] for param in self.params]

    train_fn = theano.function(inputs=[theano.Param(batch_x1),
                                       theano.Param(batch_x2),
                                       theano.Param(batch_y),
                                       theano.Param(learning_rate)],
                               outputs=outputs,
                               updates=updates,
                               givens={self.x1: batch_x1,
                                       self.x2: batch_x2,
                                       self.y: batch_y})
    return train_fn
def __init__(
    self,
    Nbranches=1,          # number of branches (parallel models to be fused)
    Nlayers=1,            # number of layers
    Ndirs=1,              # unidirectional or bidirectional
    Nx=100,               # input size
    Nh=100,               # hidden layer size
    Ny=100,               # output size
    Ah="relu",            # hidden unit activation (e.g. relu, tanh, lstm)
    Ay="linear",          # output unit activation (e.g. linear, sigmoid, softmax)
    predictPer="frame",   # frame or sequence
    loss=None,            # loss function (e.g. mse, ce, ce_group, hinge, squared_hinge)
    L1reg=0.0,            # L1 regularization
    L2reg=0.0,            # L2 regularization
    multiReg=0.0,         # regularization of agreement of predictions on data of different conditions
    momentum=0.0,         # SGD momentum
    seed=15213,           # random seed for initializing the weights
    frontEnd=None,        # a lambda function for transforming the input
    filename=None,        # initialize from file
    initParams=None,      # initialize from given dict
):
    if filename is not None:  # load parameters from file
        with smart_open(filename, "rb") as f:
            initParams = dill.load(f)
    if initParams is not None:  # load parameters from given dict
        self.paramNames = []
        self.params = []
        for k, v in initParams.iteritems():
            if type(v) is numpy.ndarray:
                self.addParam(k, v)
            else:
                setattr(self, k, v)
                self.paramNames.append(k)
        # locals()[k] = v doesn't work, so these have to be rebound statically
        Nbranches, Nlayers, Ndirs, Nx, Nh, Ny, Ah, Ay, predictPer, loss, L1reg, L2reg, momentum, frontEnd \
            = self.Nbranches, self.Nlayers, self.Ndirs, self.Nx, self.Nh, self.Ny, self.Ah, self.Ay, \
              self.predictPer, self.loss, self.L1reg, self.L2reg, self.momentum, self.frontEnd
    else:  # Initialize parameters randomly
        # Names of parameters to save to file
        self.paramNames = [
            "Nbranches", "Nlayers", "Ndirs", "Nx", "Nh", "Ny", "Ah", "Ay",
            "predictPer", "loss", "L1reg", "L2reg", "momentum", "frontEnd"
        ]
        for name in self.paramNames:
            value = locals()[name]
            setattr(self, name, value)

        # Values of parameters for building the computational graph
        self.params = []

        # Initialize random number generators
        global rng
        rng = numpy.random.RandomState(seed)

        # Construct parameter matrices
        Nlstm = 4 if Ah == 'lstm' else 1
        self.addParam("Win", rand_init((Nbranches, Nx, Nh * Ndirs * Nlstm), Ah))
        self.addParam(
            "Wrec", rand_init((Nbranches, Nlayers, Ndirs, Nh, Nh * Nlstm), Ah))
        self.addParam(
            "Wup", rand_init(
                (Nbranches, Nlayers - 1, Nh * Ndirs, Nh * Ndirs * Nlstm), Ah))
        self.addParam("Wout", rand_init((Nbranches, Nh * Ndirs, Ny), Ay))
        if Ah != "lstm":
            self.addParam("Bhid", zeros((Nbranches, Nlayers, Nh * Ndirs)))
        else:
            self.addParam(
                "Bhid",
                numpy.tile(
                    numpy.concatenate([
                        full((Nbranches, Nlayers, Nh), 1.0),
                        zeros((Nbranches, Nlayers, Nh * 3))
                    ], 2), (1, 1, Ndirs)))
        self.addParam("Bout", zeros((Nbranches, Ny)))
        self.addParam("h0", zeros((Nbranches, Nlayers, Ndirs, Nh)))
        if Ah == "lstm":
            self.addParam("c0", zeros((Nbranches, Nlayers, Ndirs, Nh)))

    # Compute total number of parameters
    self.nParams = sum(x.get_value().size for x in self.params)

    # Initialize gradient tensors when using momentum
    if momentum > 0:
        self.dparams = [
            theano.shared(zeros(x.get_value().shape)) for x in self.params
        ]

    # Build computation graph
    input = T.ftensor3()
    mask = T.imatrix()
    mask_int = [(mask % 2).nonzero(), (mask >= 2).nonzero()]
    mask_float = [
        T.cast((mask % 2).dimshuffle((1, 0)).reshape(
            (mask.shape[1], mask.shape[0], 1)), theano.config.floatX),
        T.cast((mask >= 2).dimshuffle((1, 0)).reshape(
            (mask.shape[1], mask.shape[0], 1)), theano.config.floatX)
    ]
    # mask_int = [(mask & 1).nonzero(), (mask & 2).nonzero()]
    # mask_float = [T.cast((mask & 1).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX),
    #               T.cast(((mask & 2) / 2).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX)]

    def step_rnn(x_t, mask, h_tm1, W, h0):
        h_tm1 = T.switch(mask, h0, h_tm1)
        return [ACTIVATION[Ah](x_t + h_tm1.dot(W))]

    def step_lstm(x_t, mask, c_tm1, h_tm1, W, c0, h0):
        c_tm1 = T.switch(mask, c0, c_tm1)
        h_tm1 = T.switch(mask, h0, h_tm1)
        a = x_t + h_tm1.dot(W)
        f_t = T.nnet.sigmoid(a[:, :Nh])
        i_t = T.nnet.sigmoid(a[:, Nh:Nh * 2])
        o_t = T.nnet.sigmoid(a[:, Nh * 2:Nh * 3])
        c_t = T.tanh(a[:, Nh * 3:]) * i_t + c_tm1 * f_t
        h_t = T.tanh(c_t) * o_t
        return [c_t, h_t]

    x = input if frontEnd is None else frontEnd(input)
    outputs = []
    for k in range(Nbranches):
        for i in range(Nlayers):
            h = (x.dimshuffle((1, 0, 2)).dot(self.Win[k]) if i == 0 else
                 h.dot(self.Wup[k, i - 1])) + self.Bhid[k, i]
            rep = lambda x: T.extra_ops.repeat(
                x.reshape((1, -1)), h.shape[1], axis=0)
            if Ah != "lstm":
                h = T.concatenate([
                    theano.scan(
                        fn=step_rnn,
                        sequences=[
                            h[:, :, Nh * d:Nh * (d + 1)], mask_float[d]
                        ],
                        outputs_info=[rep(self.h0[k, i, d])],
                        non_sequences=[
                            self.Wrec[k, i, d], rep(self.h0[k, i, d])
                        ],
                        go_backwards=(d == 1),
                    )[0][::(1 if d == 0 else -1)] for d in range(Ndirs)
                ], axis=2)
            else:
                h = T.concatenate([
                    theano.scan(
                        fn=step_lstm,
                        sequences=[
                            h[:, :, Nh * 4 * d:Nh * 4 * (d + 1)], mask_float[d]
                        ],
                        outputs_info=[
                            rep(self.c0[k, i, d]), rep(self.h0[k, i, d])
                        ],
                        non_sequences=[
                            self.Wrec[k, i, d],
                            rep(self.c0[k, i, d]),
                            rep(self.h0[k, i, d])
                        ],
                        go_backwards=(d == 1),
                    )[0][1][::(1 if d == 0 else -1)] for d in range(Ndirs)
                ], axis=2)
            h = h.dimshuffle((1, 0, 2))
        if predictPer == "sequence":
            h = T.concatenate([
                h[mask_int[1 - d]][:, Nh * d:Nh * (d + 1)] for d in range(Ndirs)
            ], axis=1)
        outputs.append(ACTIVATION[Ay](h.dot(self.Wout[k]) + self.Bout[k]))

    output = T.stack(*outputs)  # Deprecated in Theano 0.8 but accepted in Theano 0.7
    output_mean = output.mean(axis=0)
    output_var = output.var(axis=0)

    # Compute loss function
    if loss is None:
        loss = {
            "linear": "mse",
            "sigmoid": "ce",
            "softmax": "ce_group"
        }[self.Ay]
    if loss == "ctc":
        label = T.imatrix()
        label_time = T.imatrix()
        tol = T.iscalar()
        cost = sum(
            ctc_cost(prob, mask, label, label_time, tol)
            for prob in outputs) / Nbranches
    else:
        if predictPer == "sequence":
            label = T.fmatrix()
            y = output_mean
            t = label
        elif predictPer == "frame":
            label = T.ftensor3()
            indices = (mask >= 0).nonzero()
            y = output_mean[indices]
            t = label[indices]
        cost = T.mean({
            "ce": -T.mean(T.log(y) * t + T.log(1 - y) * (1 - t), axis=1),
            "ce_group": -T.log((y * t).sum(axis=1)),
            "mse": T.mean((y - t) ** 2, axis=1),
            "hinge": T.mean(relu(1 - y * (t * 2 - 1)), axis=1),
            "squared_hinge": T.mean(relu(1 - y * (t * 2 - 1)) ** 2, axis=1),
        }[loss])

    # Add regularization
    cost += sum(abs(x).sum() for x in self.params) / self.nParams * L1reg
    cost += sum(T.sqr(x).sum() for x in self.params) / self.nParams * L2reg
    if predictPer == "sequence":
        cost += output_var.mean() * multiReg
    else:
        indices = (mask >= 0).nonzero()
        cost += output_var[indices].mean() * multiReg

    # Compute updates for network parameters
    updates = []
    lrate = T.fscalar()
    clip = T.fscalar()
    grad = T.grad(cost, self.params)
    grad_clipped = [T.maximum(T.minimum(g, clip), -clip) for g in grad]
    if momentum > 0:
        for w, d, g in zip(self.params, self.dparams, grad_clipped):
            updates.append(
                (w, w + momentum * momentum * d - (1 + momentum) * lrate * g))
            updates.append((d, momentum * d - lrate * g))
    else:
        for w, g in zip(self.params, grad_clipped):
            updates.append((w, w - lrate * g))

    # Create functions to be called from outside
    if loss == "ctc":
        inputs = [input, mask, label, label_time, tol, lrate, clip]
    else:
        inputs = [input, mask, label, lrate, clip]
    self.train = theano.function(
        inputs=inputs,
        outputs=cost,
        updates=updates,
    )
    self.predict = theano.function(inputs=[input, mask], outputs=output)
def build_encoder_z(li, nc, num_hidden, lr):
    z_var = T.fmatrix('z_var')
    input_var = T.tensor4('inputs')

    encoder = {}
    details = [['Layer Name', 'Dims in', 'shape of layer', 'Dims out']]

    input_shape = (None, nc, li, li)
    name = 'input'
    encoder[name] = lasagne.layers.InputLayer(shape=input_shape,
                                              input_var=input_var)
    output_dims = input_shape

    filter_size = 5
    num_filters = li / 4
    repeat_num = int(np.log2(np.array(li)) - 3) + 1
    for n in range(0, repeat_num):
        num_filters = num_filters * 2
        prev_name = name
        name = 'conv' + str(n)
        prev_num_filters = lasagne.layers.get_output_shape(
            encoder[prev_name])[1]
        encoder[name] = lasagne.layers.batch_norm(
            lasagne.layers.Conv2DLayer(
                encoder[prev_name],
                num_filters,
                filter_size,
                stride=2,
                pad='same',
                nonlinearity=lasagne.nonlinearities.rectify))
        prev_output_dims = output_dims
        output_dims = lasagne.layers.get_output_shape(encoder[name])
        details.append([
            name,
            str(prev_output_dims),
            str((num_filters, prev_num_filters, filter_size, filter_size)),
            str(output_dims)
        ])

    prev_name = name
    name = 'fc'
    num_units = int(li * li)
    encoder[name] = lasagne.layers.DenseLayer(
        encoder[prev_name],
        num_units=num_units,
        nonlinearity=lasagne.nonlinearities.rectify)
    prev_output_dims = output_dims
    output_dims = lasagne.layers.get_output_shape(encoder[name])
    details.append([
        name,
        str(prev_output_dims),
        str((product(prev_output_dims[1:]), num_units)),
        str(output_dims)
    ])

    prev_name = name
    name = 'out'
    num_units = num_hidden
    # We restrict output to tanh domain (same as input noise)
    encoder[name] = lasagne.layers.DenseLayer(
        encoder[prev_name],
        num_units=num_units,
        nonlinearity=lasagne.nonlinearities.tanh)
    prev_output_dims = output_dims
    output_dims = lasagne.layers.get_output_shape(encoder[name])
    details.append([
        name,
        str(prev_output_dims),
        str((product(prev_output_dims[1:]), num_units)),
        str(output_dims)
    ])

    train_out = lasagne.layers.get_output(encoder['out'])
    val_out = lasagne.layers.get_output(encoder['out'], deterministic=True)

    loss = lasagne.objectives.squared_error(train_out, z_var).mean()
    params = lasagne.layers.get_all_params(encoder['out'], trainable=True)
    updates = lasagne.updates.adam(loss, params, learning_rate=lr, beta1=0.5)

    train_fn = theano.function([input_var, z_var], [loss], updates=updates)
    val_fn = theano.function([input_var], [val_out])

    try:
        from tabulate import tabulate
        print(tabulate(details))
    except ImportError:
        pass

    return encoder, train_fn, val_fn
def __init__(self, We_initial, params):
    self.textfile = open(params.outfile, 'w')
    We = theano.shared(We_initial)
    embsize = We_initial.shape[1]
    hidden = params.hidden

    l_in_word = lasagne.layers.InputLayer((None, None))
    l_mask_word = lasagne.layers.InputLayer(shape=(None, None))

    if params.emb == 1:
        l_emb_word = lasagne.layers.EmbeddingLayer(
            l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We)
    else:
        l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We)

    l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, hidden,
                                            mask_input=l_mask_word)
    l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, hidden,
                                            mask_input=l_mask_word,
                                            backwards=True)

    l_reshapef = lasagne.layers.ReshapeLayer(l_lstm_wordf, (-1, hidden))
    l_reshapeb = lasagne.layers.ReshapeLayer(l_lstm_wordb, (-1, hidden))
    concat2 = lasagne.layers.ConcatLayer([l_reshapef, l_reshapeb])
    l_local = lasagne.layers.DenseLayer(
        concat2, num_units=25, nonlinearity=lasagne.nonlinearities.linear)
    ### the above is for the unary term energy

    if params.emb == 1:
        f = open('F.pickle')
    else:
        f = open('F0_new.pickle')
    para = pickle.load(f)
    f.close()
    f_params = lasagne.layers.get_all_params(l_local, trainable=True)
    for idx, p in enumerate(f_params):
        p.set_value(para[idx])

    Wyy0 = np.random.uniform(-0.02, 0.02, (26, 26)).astype('float32')
    Wyy = theano.shared(Wyy0)

    d_params = lasagne.layers.get_all_params(l_local, trainable=True)
    d_params.append(Wyy)
    self.d_params = d_params

    l_in_word_a = lasagne.layers.InputLayer((None, None))
    l_mask_word_a = lasagne.layers.InputLayer(shape=(None, None))
    l_emb_word_a = lasagne_embedding_layer_2(l_in_word_a, embsize, l_emb_word.W)
    # l_emb_word_a = lasagne.layers.EmbeddingLayer(l_in_word_a, input_size=We_initial.shape[0], output_size=embsize, W=We)
    if params.dropout:
        l_emb_word_a = lasagne.layers.DropoutLayer(l_emb_word_a, p=0.5)

    l_lstm_wordf_a = lasagne.layers.LSTMLayer(l_emb_word_a, hidden,
                                              mask_input=l_mask_word_a)
    l_lstm_wordb_a = lasagne.layers.LSTMLayer(l_emb_word_a, hidden,
                                              mask_input=l_mask_word_a,
                                              backwards=True)
    l_reshapef_a = lasagne.layers.ReshapeLayer(l_lstm_wordf_a, (-1, hidden))
    l_reshapeb_a = lasagne.layers.ReshapeLayer(l_lstm_wordb_a, (-1, hidden))
    concat2_a = lasagne.layers.ConcatLayer([l_reshapef_a, l_reshapeb_a])
    if params.dropout:
        concat2_a = lasagne.layers.DropoutLayer(concat2_a, p=0.5)
    l_local_a = lasagne.layers.DenseLayer(
        concat2_a, num_units=25, nonlinearity=lasagne.nonlinearities.softmax)

    a_params = lasagne.layers.get_all_params(l_local_a, trainable=True)
    self.a_params = a_params

    if params.emb == 1:
        f = open('F.pickle')
    else:
        f = open('F0_new.pickle')
    PARA = pickle.load(f)
    f.close()
    for idx, p in enumerate(a_params):
        p.set_value(PARA[idx])

    y_in = T.ftensor3()
    y = T.imatrix()
    g = T.imatrix()
    gmask = T.fmatrix()
    y_mask = T.fmatrix()
    length = T.iscalar()

    predy0 = lasagne.layers.get_output(l_local_a, {
        l_in_word_a: g,
        l_mask_word_a: gmask
    })
    predy = predy0.reshape((-1, length, 25))
    # predy = predy * gmask[:,:,None]
    # newpredy = T.concatenate([predy, y0], axis=2)

    # n, L, 46, 46
    # predy0: n, L, 25
    # energy loss
    def inner_function(targets_one_step, mask_one_step, prev_label, tg_energy):
        """
        :param targets_one_step: [batch_size, t]
        :param prev_label: [batch_size, t]
        :param tg_energy: [batch_size]
        :return:
        """
        new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1])
        new_ta_energy = tg_energy + T.sum(new_ta_energy * targets_one_step, axis=1)
        tg_energy_t = T.switch(mask_one_step, new_ta_energy, tg_energy)
        return [targets_one_step, tg_energy_t]

    # Input should be provided as (n_batch, n_time_steps, num_labels, num_labels)
    # but scan requires the iterable dimension to be first
    # So, we need to dimshuffle to (n_time_steps, n_batch, num_labels, num_labels)
    local_energy = lasagne.layers.get_output(l_local, {
        l_in_word: g,
        l_mask_word: gmask
    })
    local_energy = local_energy.reshape((-1, length, 25))
    local_energy = local_energy * gmask[:, :, None]

    targets_shuffled = y_in.dimshuffle(1, 0, 2)
    masks_shuffled = gmask.dimshuffle(1, 0)

    # initials should be energies_shuffles[0, :, -1, :]
    target_time0 = targets_shuffled[0]
    initial_energy0 = T.dot(target_time0, Wyy[-1, :-1])

    length_index = T.sum(gmask, axis=1) - 1
    length_index = T.cast(length_index, 'int32')

    initials = [target_time0, initial_energy0]
    [_, target_energies], _ = theano.scan(
        fn=inner_function,
        outputs_info=initials,
        sequences=[targets_shuffled[1:], masks_shuffled[1:]])
    pos_end_target = y_in[T.arange(length_index.shape[0]), length_index]
    pos_cost = target_energies[-1] + T.sum(
        T.sum(local_energy * y_in, axis=2) * gmask, axis=1) + T.dot(
        pos_end_target, Wyy[:-1, -1])
    check = T.sum(T.sum(local_energy * y_in, axis=2) * gmask, axis=1)

    negtargets_shuffled = predy.dimshuffle(1, 0, 2)
    negtarget_time0 = negtargets_shuffled[0]
    neginitial_energy0 = T.dot(negtarget_time0, Wyy[-1, :-1])
    neginitials = [negtarget_time0, neginitial_energy0]
    [_, negtarget_energies], _ = theano.scan(
        fn=inner_function,
        outputs_info=neginitials,
        sequences=[negtargets_shuffled[1:], masks_shuffled[1:]])
    neg_end_target = predy[T.arange(length_index.shape[0]), length_index]
    neg_cost = negtarget_energies[-1] + T.sum(
        T.sum(local_energy * predy, axis=2) * gmask, axis=1) + T.dot(
        neg_end_target, Wyy[:-1, -1])

    y_f = y.flatten()
    predy_f = predy.reshape((-1, 25))

    ce_hinge = lasagne.objectives.categorical_crossentropy(predy_f + eps, y_f)
    ce_hinge = ce_hinge.reshape((-1, length))
    ce_hinge = T.sum(ce_hinge * gmask, axis=1)

    entropy_term = -T.sum(predy_f * T.log(predy_f + eps), axis=1)
    entropy_term = entropy_term.reshape((-1, length))
    entropy_term = T.sum(entropy_term * gmask, axis=1)

    delta0 = T.sum(abs((y_in - predy)), axis=2) * gmask
    delta0 = T.sum(delta0, axis=1)

    if (params.margin_type == 1):
        hinge_cost = 1 + neg_cost - pos_cost
    elif (params.margin_type == 2):
        hinge_cost = neg_cost - pos_cost
    elif (params.margin_type == 0):
        hinge_cost = delta0 + neg_cost - pos_cost
    elif (params.margin_type == 3):
        hinge_cost = delta0 * (1.0 + neg_cost - pos_cost)

    hinge_cost = hinge_cost * T.gt(hinge_cost, 0)
    d_cost = T.mean(hinge_cost)
    d_cost0 = d_cost

    l2_term = sum(
        lasagne.regularization.l2(x - PARA[index])
        for index, x in enumerate(a_params))

    """select different regularizers"""
    g_cost = -d_cost0 + params.l2 * sum(
        lasagne.regularization.l2(x)
        for x in a_params) + params.l3 * T.mean(ce_hinge)
    d_cost = d_cost0 + params.l2 * sum(
        lasagne.regularization.l2(x) for x in d_params)

    self.a_params = a_params

    updates_g = lasagne.updates.sgd(g_cost, a_params, params.eta)
    updates_g = lasagne.updates.apply_momentum(updates_g, a_params,
                                               momentum=0.9)
    self.train_g = theano.function(
        [g, gmask, y, y_in, length],
        [g_cost, d_cost0, pos_cost, neg_cost, delta0, check],
        updates=updates_g,
        on_unused_input='ignore')

    updates_d = lasagne.updates.adam(d_cost, d_params, 0.001)
    self.train_d = theano.function(
        [g, gmask, y, y_in, length],
        [d_cost, d_cost0, pos_cost, neg_cost, delta0, check],
        updates=updates_d,
        on_unused_input='ignore')

    # test the model and retune it at test time
    predy_test = lasagne.layers.get_output(l_local_a, {
        l_in_word_a: g,
        l_mask_word_a: gmask
    }, deterministic=True)
    predy_test = predy_test.reshape((-1, length, 25))
    pred = T.argmax(predy_test, axis=2)
    pg = T.eq(pred, y)
    pg = pg * gmask
    acc = 1.0 * T.sum(pg) / T.sum(gmask)

    negtargets_shuffled_test = predy_test.dimshuffle(1, 0, 2)
    negtarget_time0_test = negtargets_shuffled_test[0]
    neginitial_energy0_test = T.dot(negtarget_time0_test, Wyy[-1, :-1])
    neginitials_test = [negtarget_time0_test, neginitial_energy0_test]
    [_, negtarget_energies_test], _ = theano.scan(
        fn=inner_function,
        outputs_info=neginitials_test,
        sequences=[negtargets_shuffled_test[1:], masks_shuffled[1:]])
    end_test_target = predy_test[T.arange(length_index.shape[0]), length_index]
    neg_cost_test = negtarget_energies_test[-1] + T.sum(
        T.sum(local_energy * predy_test, axis=2) * gmask, axis=1) + T.dot(
        end_test_target, Wyy[:-1, -1])

    """ce regularizer"""
    test_cost = -T.mean(neg_cost_test) + params.l3 * T.mean(ce_hinge)
    test_updates = lasagne.updates.sgd(test_cost, a_params, params.eta)
    test_updates = lasagne.updates.apply_momentum(test_updates, a_params,
                                                  momentum=0.9)
    self.test_time_turning = theano.function([g, gmask, y, length],
                                             test_cost,
                                             updates=test_updates,
                                             on_unused_input='ignore')
    self.test_time1 = theano.function([g, gmask, y, y_in, length], [
        acc,
        T.mean(neg_cost),
        T.mean(pos_cost),
        params.l3 * T.mean(ce_hinge)
    ], on_unused_input='ignore')
    self.test_time = theano.function([g, gmask, y, length], acc)
    predict_data = predict_data - T.log(
        T.sum(T.exp(predict_data), axis=-1, keepdims=True))

    inputs = [input_data, input_cond, input_mask]

    predict_fn = theano.function(inputs=inputs, outputs=[predict_data])

    return predict_fn


if __name__ == '__main__':
    parser = get_arg_parser()
    args = parser.parse_args()
    print(args, file=sys.stderr)

    input_data = T.ftensor3('input_data')
    input_cond = T.ftensor3('input_cond')
    input_mask = T.fmatrix('input_mask')

    network = deep_projection_ivector_ln_model_fix(
        input_var=input_data,
        cond_var=input_cond,
        mask_var=input_mask,
        num_inputs=input_dim,
        num_outputs=output_dim,
        num_conds=args.num_conds,
        num_layers=args.num_layers,
        num_factors=args.num_factors,
        num_units=args.num_units,
        grad_clipping=args.grad_clipping,
        dropout=args.dropout)[0]

    network_params = get_all_params(network, trainable=True)
def _build_network(self, load_params: bool = False):
    """Build network, including inputs, weights and the whole structure."""
    # Tweet variables
    self.tweet_input = T.itensor3()
    self.targets_input = T.ivector()
    self.t_mask_input = T.fmatrix()

    self.params = t2v.init_params(n_chars=self.n_char)
    # classification params
    self.params["W_cl"] = theano.shared(
        np.random.normal(
            loc=0.,
            scale=settings_char.SCALE,
            size=(settings_char.WDIM, self.n_classes),
        ).astype("float32"),
        name="W_cl",
    )
    self.params["b_cl"] = theano.shared(
        np.zeros((self.n_classes,)).astype("float32"), name="b_cl")

    if load_params:
        self._load_weights()

    # network for prediction
    predictions, net, embeddings = self._classify(
        self.tweet_input,
        self.t_mask_input,
        self.params,
        self.n_classes,
        self.n_char,
    )

    # Theano function
    self._print("Compiling theano functions...")
    self.predict = theano.function([self.tweet_input, self.t_mask_input],
                                   predictions)
    self.encode = theano.function([self.tweet_input, self.t_mask_input],
                                  embeddings)
    self.net = net

    self._print("Building network...")
    # batch loss
    loss = lasagne.objectives.categorical_crossentropy(
        predictions, self.targets_input)
    cost = T.mean(loss) + settings_char.REGULARIZATION * \
        lasagne.regularization.regularize_network_params(
            self.net, lasagne.regularization.l2)
    cost_only = T.mean(loss)

    # params and updates
    self._print("Computing updates...")
    lr = settings_char.LEARNING_RATE
    mu = settings_char.MOMENTUM
    updates = lasagne.updates.nesterov_momentum(
        cost, lasagne.layers.get_all_params(self.net), lr, momentum=mu)

    # Theano function
    self._print("Compiling theano functions...")
    inps = [self.tweet_input, self.t_mask_input, self.targets_input]
    self.cost_val = theano.function(inps, [cost_only, embeddings])
    self.train = theano.function(inps, cost, updates=updates)
import time

import theano
import numpy as np
from theano import tensor as T
from theano.tensor import tanh

import mkl_simplernn_bw_op
from mkl_simplernn_bw_op import SimpleRNN_bw

X = T.ftensor3('X')
W_x = T.fmatrix('W_x')
W_h = T.fmatrix('W_h')
B = T.fvector('B')
B_mkl = T.fmatrix('B_mkl')
hid = T.fmatrix('hid')
o_real = T.ftensor3('o_real')


def step(x, h_tm1):
    global W_h, B
    h_t = tanh(x + T.dot(h_tm1, W_h) + B)
    return h_t


def SimpleRNN_theano():
    global X, W_x, hid
    X_r = T.dot(X, W_x)
    fn = lambda x_r, h_tm1: step(x_r, h_tm1)
    result, updates = theano.scan(fn, sequences=[X_r],
                                  outputs_info=hid,
                                  name='test_theano_gru_scan')
    return result


if __name__ == '__main__':
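    # The body of this __main__ block is truncated in the source; below is a
    # hedged reconstruction that only exercises the theano scan path defined
    # above (the MKL op comparison is omitted because its exact call
    # signature is not shown in this snippet). Sizes are illustrative.
    seq_len, batch_size, n_in, n_hid = 16, 8, 32, 32
    X_val = np.random.rand(seq_len, batch_size, n_in).astype(np.float32)
    W_x_val = np.random.rand(n_in, n_hid).astype(np.float32)
    W_h_val = np.random.rand(n_hid, n_hid).astype(np.float32)
    B_val = np.zeros(n_hid, dtype=np.float32)
    hid_val = np.zeros((batch_size, n_hid), dtype=np.float32)

    fn = theano.function([X, W_x, W_h, B, hid], SimpleRNN_theano())
    tic = time.time()
    out = fn(X_val, W_x_val, W_h_val, B_val, hid_val)
    print 'theano scan output shape:', out.shape, 'time:', time.time() - tic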
def __init__(
    self,
    Nlayers=1,            # number of layers
    Ndirs=1,              # unidirectional or bidirectional
    Nx=100,               # input size
    Nh=100,               # hidden layer size
    Ny=100,               # output size
    Ah='relu',            # hidden unit activation (e.g. relu, tanh, lstm)
    Ay='linear',          # output unit activation (e.g. linear, sigmoid, softmax)
    predictPer='frame',   # frame or sequence
    loss=None,            # loss function (e.g. mse, ce, ce_group, hinge, squared_hinge)
    L1reg=0.0,            # L1 regularization
    L2reg=0.0,            # L2 regularization
    momentum=0.0,         # SGD momentum
    seed=15213,           # random seed for initializing the weights
    frontEnd=None,        # a lambda function for transforming the input
    filename=None,        # initialize from file
    initParams=None,      # initialize from given dict
):
    if filename is not None:  # load parameters from file
        with open(filename, 'rb') as f:
            initParams = cPickle.load(f)
    if initParams is not None:  # load parameters from given dict
        self.paramNames = []
        self.params = []
        for k, v in initParams.iteritems():
            if type(v) is numpy.ndarray:
                self.addParam(k, v)
            else:
                setattr(self, k, v)
                self.paramNames.append(k)
    else:  # Initialize parameters randomly
        # Names of parameters to save to file
        self.paramNames = [
            'Nlayers', 'Ndirs', 'Nx', 'Nh', 'Ny', 'Ah', 'Ay', 'predictPer',
            'loss', 'L1reg', 'L2reg', 'momentum', 'frontEnd'
        ]
        for name in self.paramNames:
            value = locals()[name]
            if isinstance(value, basestring):
                value = value.lower()
            locals()[name] = value
            setattr(self, name, value)

        # Values of parameters for building the computational graph
        self.params = []

        # Initialize random number generators
        global rng
        rng = numpy.random.RandomState(seed)

        # Construct parameter matrices
        Nlstm = 4 if Ah == 'lstm' else 1
        self.addParam('Win', rand_init((Nx, Nh * Ndirs * Nlstm), Ah))
        self.addParam('Wrec', rand_init((Nlayers, Ndirs, Nh, Nh * Nlstm), Ah))
        self.addParam(
            'Wup', rand_init((Nlayers - 1, Nh * Ndirs, Nh * Ndirs * Nlstm), Ah))
        self.addParam('Wout', rand_init((Nh * Ndirs, Ny), Ay))
        if Ah != 'lstm':
            self.addParam('Bhid', zeros((Nlayers, Nh * Ndirs)))
        else:
            self.addParam(
                'Bhid',
                numpy.tile(
                    numpy.hstack([
                        full((Nlayers, Nh), 1.0), zeros((Nlayers, Nh * 3))
                    ]), (1, Ndirs)))
        self.addParam('Bout', zeros(Ny))
        self.addParam('h0', zeros((Nlayers, Ndirs, Nh)))
        if Ah == 'lstm':
            self.addParam('c0', zeros((Nlayers, Ndirs, Nh)))

    # Compute total number of parameters
    self.nParams = sum(x.get_value().size for x in self.params)

    # Initialize gradient tensors when using momentum
    if momentum > 0:
        self.dparams = [
            theano.shared(zeros(x.get_value().shape)) for x in self.params
        ]

    # Build computation graph
    input = T.ftensor3()
    mask = T.imatrix()
    mask_int = [(mask & 1).nonzero(), (mask & 2).nonzero()]
    mask_float = [
        T.cast((mask & 1).dimshuffle((1, 0)).reshape(
            (mask.shape[1], mask.shape[0], 1)), theano.config.floatX),
        T.cast(((mask & 2) / 2).dimshuffle((1, 0)).reshape(
            (mask.shape[1], mask.shape[0], 1)), theano.config.floatX)
    ]

    def step_rnn(x_t, mask, h_tm1, W, h0):
        h_tm1 = T.switch(mask, h0, h_tm1)
        return [ACTIVATION[Ah](x_t + h_tm1.dot(W))]

    def step_lstm(x_t, mask, c_tm1, h_tm1, W, c0, h0):
        c_tm1 = T.switch(mask, c0, c_tm1)
        h_tm1 = T.switch(mask, h0, h_tm1)
        a = x_t + h_tm1.dot(W)
        f_t = T.nnet.sigmoid(a[:, :Nh])
        i_t = T.nnet.sigmoid(a[:, Nh:Nh * 2])
        o_t = T.nnet.sigmoid(a[:, Nh * 2:Nh * 3])
        c_t = T.tanh(a[:, Nh * 3:]) * i_t + c_tm1 * f_t
        h_t = T.tanh(c_t) * o_t
        return [c_t, h_t]

    x = input if frontEnd is None else frontEnd(input)
    for i in range(Nlayers):
        h = (x.dimshuffle((1, 0, 2)).dot(self.Win) if i == 0 else
             h.dot(self.Wup[i - 1])) + self.Bhid[i]
        rep = lambda x: T.extra_ops.repeat(
            x.reshape((1, -1)), h.shape[1], axis=0)
        if Ah != 'lstm':
            h = T.concatenate([
                theano.scan(
                    fn=step_rnn,
                    sequences=[
                        h[:, :, Nh * d:Nh * (d + 1)], mask_float[d]
                    ],
                    outputs_info=[rep(self.h0[i, d])],
                    non_sequences=[self.Wrec[i, d], rep(self.h0[i, d])],
                    go_backwards=(d == 1),
                )[0][::(1 if d == 0 else -1)] for d in range(Ndirs)
            ], axis=2)
        else:
            h = T.concatenate([
                theano.scan(
                    fn=step_lstm,
                    sequences=[
                        h[:, :, Nh * 4 * d:Nh * 4 * (d + 1)], mask_float[d]
                    ],
                    outputs_info=[rep(self.c0[i, d]), rep(self.h0[i, d])],
                    non_sequences=[
                        self.Wrec[i, d],
                        rep(self.c0[i, d]),
                        rep(self.h0[i, d])
                    ],
                    go_backwards=(d == 1),
                )[0][1][::(1 if d == 0 else -1)] for d in range(Ndirs)
            ], axis=2)
        h = h.dimshuffle((1, 0, 2))
    if predictPer == 'sequence':
        h = T.concatenate([
            h[mask_int[1 - d]][:, Nh * d:Nh * (d + 1)] for d in range(Ndirs)
        ], axis=1)
    output = ACTIVATION[Ay](h.dot(self.Wout) + self.Bout)

    # Compute loss function
    if loss is None:
        loss = {
            'linear': 'mse',
            'sigmoid': 'ce',
            'softmax': 'ce_group'
        }[self.Ay]
    if predictPer == 'sequence':
        label = T.fmatrix()
        y = output
        t = label
    elif predictPer == 'frame':
        label = T.ftensor3()
        indices = (mask >= 0).nonzero()
        y = output[indices]
        t = label[indices]
    cost = T.mean({
        'ce': -T.mean(T.log(y) * t + T.log(1 - y) * (1 - t), axis=1),
        'ce_group': -T.log((y * t).sum(axis=1)),
        'mse': T.mean((y - t) ** 2, axis=1),
        'hinge': T.mean(relu(1 - y * (t * 2 - 1)), axis=1),
        'squared_hinge': T.mean(relu(1 - y * (t * 2 - 1)) ** 2, axis=1),
    }[loss])

    # Add regularization
    cost += sum(abs(x).sum() for x in self.params) / self.nParams * L1reg
    cost += sum(T.sqr(x).sum() for x in self.params) / self.nParams * L2reg

    # Compute updates for network parameters
    updates = []
    gradient = []
    lrate = T.fscalar()
    if momentum > 0:
        for w, d, g in zip(self.params, self.dparams,
                           T.grad(cost, self.params)):
            updates.append(
                (w, w + momentum * momentum * d - (1 + momentum) * lrate * g))
            updates.append((d, momentum * d - lrate * g))
            gradient.append(g)
    else:
        for w, g in zip(self.params, T.grad(cost, self.params)):
            updates.append((w, w - lrate * g))
            gradient.append(g)

    # Create functions to be called from outside
    self.train = theano.function(
        inputs=[input, mask, label, lrate],
        outputs=[cost, y, gradient[5], h, t, h.dot(self.Wout), self.Wout],
        updates=updates,
    )
    self.predict = theano.function(inputs=[input, mask], outputs=output)
def fit(self, X, Y, learning_rate=10e-1, mu=0.99, reg=1.0, activation=T.tanh, epochs=100, show_fig=False): D = X[0].shape[1] # X is of size N x T(n) x D K = len(set(Y.flatten())) N = len(Y) M = self.M self.f = activation # initial weights Wx = init_weight(D, M) Wh = init_weight(M, M) bh = np.zeros(M) h0 = np.zeros(M) Wo = init_weight(M, K) bo = np.zeros(K) # make them theano shared self.Wx = theano.shared(Wx) self.Wh = theano.shared(Wh) self.bh = theano.shared(bh) self.h0 = theano.shared(h0) self.Wo = theano.shared(Wo) self.bo = theano.shared(bo) self.params = [self.Wx, self.Wh, self.bh, self.h0, self.Wo, self.bo] thX = T.fmatrix('X') thY = T.ivector('Y') def recurrence(x_t, h_t1): # returns h(t), y(t) h_t = self.f(x_t.dot(self.Wx) + h_t1.dot(self.Wh) + self.bh) y_t = T.nnet.softmax(h_t.dot(self.Wo) + self.bo) return h_t, y_t [h, y], _ = theano.scan( fn=recurrence, outputs_info=[self.h0, None], sequences=thX, n_steps=thX.shape[0], ) py_x = y[:, 0, :] prediction = T.argmax(py_x, axis=1) cost = -T.mean(T.log(py_x[T.arange(thY.shape[0]), thY])) grads = T.grad(cost, self.params) dparams = [theano.shared(p.get_value()*0) for p in self.params] updates = [ (p, p + mu*dp - learning_rate*g) for p, dp, g in zip(self.params, dparams, grads) ] + [ (dp, mu*dp - learning_rate*g) for dp, g in zip(dparams, grads) ] self.predict_op = theano.function(inputs=[thX], outputs=prediction) self.train_op = theano.function( inputs=[thX, thY], outputs=[cost, prediction, y], updates=updates ) costs = [] for i in xrange(epochs): X, Y = shuffle(X, Y) n_correct = 0 cost = 0 for j in xrange(N): c, p, rout = self.train_op(X[j], Y[j]) # print "p:", p cost += c if p[-1] == Y[j,-1]: n_correct += 1 print "shape y:", rout.shape print "i:", i, "cost:", cost, "classification rate:", (float(n_correct)/N) costs.append(cost) if n_correct == N: break if show_fig: plt.plot(costs) plt.show()
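# The `recurrence` above follows the standard theano.scan pattern: `sequences`
# supplies x_t for each step, `outputs_info` seeds the carried state, and the
# step function returns the new state(s). A self-contained toy sketch of the
# same pattern (my own example):
import numpy as np
import theano
import theano.tensor as T

xs = T.fmatrix('xs')  # (T, D) sequence


def step(x_t, h_tm1):
    # each step mixes the current input with the previous state
    return T.tanh(x_t + h_tm1)


h, _ = theano.scan(fn=step, sequences=xs, outputs_info=T.zeros_like(xs[0]))
scan_fn = theano.function([xs], h, allow_input_downcast=True)
print scan_fn(np.ones((3, 2)))  # one state per time step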
def __init__(self, We_initial, char_embedd_table_initial, params): self.textfile = open(params.outfile, 'w') We = theano.shared(We_initial) embsize = We_initial.shape[1] hidden = params.hidden char_embedd_dim = params.char_embedd_dim char_dic_size = len(params.char_dic) char_embedd_table = theano.shared(char_embedd_table_initial) trans = np.random.uniform( -0.01, 0.01, (params.num_labels + 1, params.num_labels + 1)).astype('float32') transition = theano.shared(trans) input_var = T.imatrix(name='inputs') target_var = T.imatrix(name='targets') mask_var = T.fmatrix(name='masks') mask_var1 = T.fmatrix(name='masks1') length = T.iscalar() char_input_var = T.itensor3(name='char-inputs') l_in_word = lasagne.layers.InputLayer((None, None)) l_mask_word = lasagne.layers.InputLayer(shape=(None, None)) if params.emb == 1: l_emb_word = lasagne.layers.EmbeddingLayer( l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We, name='word_embedding') else: l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We) layer_char_input = lasagne.layers.InputLayer(shape=(None, None, Max_Char_Length), input_var=char_input_var, name='char-input') layer_char = lasagne.layers.reshape(layer_char_input, (-1, [2])) layer_char_embedding = lasagne.layers.EmbeddingLayer( layer_char, input_size=char_dic_size, output_size=char_embedd_dim, W=char_embedd_table, name='char_embedding') layer_char = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1)) # first get some necessary dimensions or parameters conv_window = 3 num_filters = params.num_filters #_, sent_length, _ = incoming2.output_shape # dropout before cnn? if params.dropout: layer_char = lasagne.layers.DropoutLayer(layer_char, p=0.5) # construct convolution layer cnn_layer = lasagne.layers.Conv1DLayer( layer_char, num_filters=num_filters, filter_size=conv_window, pad='full', nonlinearity=lasagne.nonlinearities.tanh, name='cnn') # infer the pool size for pooling (pool size should go through all time step of cnn) _, _, pool_size = cnn_layer.output_shape print pool_size # construct max pool layer pool_layer = lasagne.layers.MaxPool1DLayer(cnn_layer, pool_size=pool_size) # reshape the layer to match lstm incoming layer [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters] output_cnn_layer = lasagne.layers.reshape(pool_layer, (-1, length, [1])) # finally, concatenate the two incoming layers together. incoming = lasagne.layers.concat([output_cnn_layer, l_emb_word], axis=2) if params.dropout: incoming = lasagne.layers.DropoutLayer(incoming, p=0.5) l_lstm_wordf = lasagne.layers.LSTMLayer(incoming, hidden, mask_input=l_mask_word, grad_clipping=5.) 
l_lstm_wordb = lasagne.layers.LSTMLayer(incoming, hidden, mask_input=l_mask_word, grad_clipping=5., backwards=True) concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2) if params.dropout: concat = lasagne.layers.DropoutLayer(concat, p=0.5) l_reshape_concat = lasagne.layers.ReshapeLayer(concat, (-1, 2 * hidden)) l_local = lasagne.layers.DenseLayer( l_reshape_concat, num_units=params.num_labels, nonlinearity=lasagne.nonlinearities.linear) #bi_lstm_crf = CRFLayer(concat, params.num_labels, mask_input= l_mask_word) local_energy = lasagne.layers.get_output( l_local, { l_in_word: input_var, l_mask_word: mask_var, layer_char_input: char_input_var }) local_energy = local_energy.reshape((-1, length, params.num_labels)) local_energy = local_energy * mask_var[:, :, None] end_term = transition[:-1, -1] local_energy = local_energy + end_term.dimshuffle( 'x', 'x', 0) * mask_var1[:, :, None] local_energy_eval = lasagne.layers.get_output( l_local, { l_in_word: input_var, l_mask_word: mask_var, layer_char_input: char_input_var }, deterministic=True) local_energy_eval = local_energy_eval.reshape( (-1, length, params.num_labels)) local_energy_eval = local_energy_eval * mask_var[:, :, None] local_energy_eval = local_energy_eval + end_term.dimshuffle( 'x', 'x', 0) * mask_var1[:, :, None] length_index = T.sum(mask_var, axis=1) loss_train = crf_loss0(local_energy, transition, target_var, mask_var).mean() #loss_train = T.dot(loss_train, length_index)/T.sum(length_index) #loss_train = crf_loss0(local_energy, transition, target_var, mask_var).mean() prediction, corr = crf_accuracy0(local_energy_eval, transition, target_var, mask_var) ##loss_train = crf_loss(energies_train, target_var, mask_var).mean() ##prediction, corr = crf_accuracy(energies_train, target_var) corr_train = (corr * mask_var).sum(dtype=theano.config.floatX) num_tokens = mask_var.sum(dtype=theano.config.floatX) network_params = lasagne.layers.get_all_params(l_local, trainable=True) network_params.append(transition) print network_params self.network_params = network_params loss_train = loss_train + params.L2 * sum( lasagne.regularization.l2(x) for x in network_params) updates = lasagne.updates.sgd(loss_train, network_params, params.eta) updates = lasagne.updates.apply_momentum(updates, network_params, momentum=0.9) self.train_fn = theano.function([ input_var, char_input_var, target_var, mask_var, mask_var1, length ], loss_train, updates=updates, on_unused_input='ignore') self.eval_fn = theano.function([ input_var, char_input_var, target_var, mask_var, mask_var1, length ], [corr_train, num_tokens, prediction], on_unused_input='ignore')
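# The optimiser above is built in two calls: plain SGD updates first, then
# lasagne.updates.apply_momentum layered on top. A minimal sketch of the same
# pattern on a toy regression layer (all names local to this sketch):
import theano
import theano.tensor as T
import lasagne

x_toy = T.fmatrix('x')
t_toy = T.fmatrix('t')
l_in_toy = lasagne.layers.InputLayer((None, 4), input_var=x_toy)
l_out_toy = lasagne.layers.DenseLayer(l_in_toy, num_units=2,
                                      nonlinearity=lasagne.nonlinearities.linear)
pred_toy = lasagne.layers.get_output(l_out_toy)
loss_toy = lasagne.objectives.squared_error(pred_toy, t_toy).mean()
params_toy = lasagne.layers.get_all_params(l_out_toy, trainable=True)
updates_toy = lasagne.updates.sgd(loss_toy, params_toy, learning_rate=0.1)
updates_toy = lasagne.updates.apply_momentum(updates_toy, params_toy,
                                             momentum=0.9)
train_toy = theano.function([x_toy, t_toy], loss_toy, updates=updates_toy)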
import theano
from theano import tensor

from dagbldr.datasets import load_digits
from dagbldr.utils import convert_to_one_hot
from dagbldr.nodes import binary_crossentropy, binary_entropy
from dagbldr.nodes import categorical_crossentropy, abs_error
from dagbldr.nodes import squared_error, gaussian_error, log_gaussian_error
from dagbldr.nodes import masked_cost, gaussian_kl, gaussian_log_kl

# Common between tests
digits = load_digits()
X = digits["data"].astype("float32")
y = digits["target"]
n_classes = len(set(y))
y = convert_to_one_hot(y, n_classes).astype("float32")
X_sym = tensor.fmatrix()
y_sym = tensor.fmatrix()


def test_binary_crossentropy():
    cost = binary_crossentropy(.99 * X_sym, X_sym)
    theano.function([X_sym], cost, mode="FAST_COMPILE")


def test_binary_entropy():
    cost = binary_entropy(X_sym)
    theano.function([X_sym], cost, mode="FAST_COMPILE")


def test_categorical_crossentropy():
    cost = categorical_crossentropy(.99 * y_sym + .001, y_sym)
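# For reference, scaling the prediction by .99 (and shifting by .001 in the
# categorical test) keeps log() away from exact 0/1 inputs. A raw-Theano
# sketch of binary cross-entropy itself (my own definition, not dagbldr's):
import numpy as np
import theano
import theano.tensor as T

p_sym = T.fmatrix('p')  # predictions strictly inside (0, 1)
t_sym = T.fmatrix('t')  # targets
bce = -(t_sym * T.log(p_sym) + (1 - t_sym) * T.log(1 - p_sym)).mean()
bce_fn = theano.function([p_sym, t_sym], bce, allow_input_downcast=True)
vals = np.random.uniform(0.05, 0.95, (3, 4))
print bce_fn(.99 * vals, vals)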
def evaluate_lenet5(learning_rate=0.01, n_epochs=100, emb_size=40, batch_size=50, describ_max_len=20, type_size=12, filter_size=[3, 5], maxSentLen=100, hidden_size=[300, 300]): model_options = locals().copy() print "model options", model_options emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/' seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results srng = T.shared_randomstreams.RandomStreams(rng.randint(seed)) all_sentences, all_masks, all_labels, word2id = load_BBN_multi_labels_dataset( maxlen=maxSentLen ) #minlen, include one label, at least one word in the sentence label_sent, label_mask = load_SF_type_descriptions(word2id, type_size, describ_max_len) label_sent = np.asarray(label_sent, dtype='int32') label_mask = np.asarray(label_mask, dtype=theano.config.floatX) train_sents = np.asarray(all_sentences[0], dtype='int32') train_masks = np.asarray(all_masks[0], dtype=theano.config.floatX) train_labels = np.asarray(all_labels[0], dtype='int32') train_size = len(train_labels) dev_sents = np.asarray(all_sentences[1], dtype='int32') dev_masks = np.asarray(all_masks[1], dtype=theano.config.floatX) dev_labels = np.asarray(all_labels[1], dtype='int32') dev_size = len(dev_labels) ''' combine train and dev ''' train_sents = np.concatenate([train_sents, dev_sents], axis=0) train_masks = np.concatenate([train_masks, dev_masks], axis=0) train_labels = np.concatenate([train_labels, dev_labels], axis=0) train_size = train_size + dev_size test_sents = np.asarray(all_sentences[2], dtype='int32') test_masks = np.asarray(all_masks[2], dtype=theano.config.floatX) test_labels = np.asarray(all_labels[2], dtype='int32') test_size = len(test_labels) vocab_size = len(word2id) + 1 # add one zero pad index rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_fasttext_multiple_word2vec_given_file([ emb_root + 'IL5-cca-wiki-lorelei-d40.eng.vec', emb_root + 'IL5-cca-wiki-lorelei-d40.IL5.vec' ], 40) rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable #now, start to build the input form of the model sents_id_matrix = T.imatrix('sents_id_matrix') sents_mask = T.fmatrix('sents_mask') labels = T.imatrix('labels') #batch*12 des_id_matrix = T.imatrix() des_mask = T.fmatrix() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' common_input = embeddings[sents_id_matrix.flatten()].reshape( (batch_size, maxSentLen, emb_size)).dimshuffle( 0, 2, 1) #the input format can be adapted into CNN or GRU or LSTM bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2) repeat_common_input = T.repeat( normalize_tensor3_colwise(common_input), type_size, axis=0) #(batch_size*type_size, emb_size, maxsentlen) des_input = embeddings[des_id_matrix.flatten()].reshape( (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1) bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1), axis=2) #(tyope_size, emb_size) repeat_des_input = T.tile( normalize_tensor3_colwise(des_input), (batch_size, 1, 1)) #(batch_size*type_size, emb_size, maxsentlen) conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W2, conv_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2] conv_att_W, conv_att_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) conv_att_W2, conv_att_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) conv_W_context2, conv_b_context2 = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) ACNN_para = [ conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2, conv_W_context2 ] # NN_para = multiCNN_para+ACNN_para conv_model = Conv_with_Mask( rng, input_tensor3=common_input, mask_matrix=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings = conv_model.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size conv_model2 = Conv_with_Mask( rng, input_tensor3=common_input, mask_matrix=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), W=conv_W2, b=conv_b2 ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings2 = conv_model2.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size LR_input = T.concatenate([sent_embeddings, sent_embeddings2, bow_emb], axis=1) LR_input_size = hidden_size[0] * 2 + emb_size #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative U_a = create_ensemble_para( rng, 12, LR_input_size) # the weight matrix hidden_size*2 LR_b = theano.shared(value=np.zeros((12, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class LR_para = [U_a, LR_b] layer_LR = LogisticRegression( rng, input=LR_input, n_in=LR_input_size, n_out=12, W=U_a, b=LR_b ) #basically it is a multiplication between weight matrix and input feature vector score_matrix = T.nnet.sigmoid(layer_LR.before_softmax) #batch * 12 prob_pos = T.where(labels < 1, 1.0 - score_matrix, score_matrix) loss = -T.mean(T.log(prob_pos)) ''' GRU ''' U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0]) GRU_NN_para = [ U1, W1, b1 ] #U1 includes 3 matrices, W1 also includes 3 matrices b1 is bias # gru_input = common_input.dimshuffle((0,2,1)) #gru requires input (batch_size, emb_size, maxSentLen) gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, 
sents_mask, hidden_size[0], U1, W1, b1) gru_sent_embeddings = gru_layer.output_sent_rep # (batch_size, hidden_size) LR_att_input = T.concatenate([gru_sent_embeddings, bow_emb], axis=1) LR_att_input_size = hidden_size[0] + emb_size #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative U_att_a = create_ensemble_para( rng, 12, LR_att_input_size) # the weight matrix hidden_size*2 LR_att_b = theano.shared(value=np.zeros((12, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class LR_att_para = [U_att_a, LR_att_b] layer_att_LR = LogisticRegression( rng, input=LR_att_input, n_in=LR_att_input_size, n_out=12, W=U_att_a, b=LR_att_b ) #basically it is a multiplication between weight matrix and input feature vector att_score_matrix = T.nnet.sigmoid(layer_att_LR.before_softmax) #batch * 12 att_prob_pos = T.where(labels < 1, 1.0 - att_score_matrix, att_score_matrix) att_loss = -T.mean(T.log(att_prob_pos)) ''' ACNN ''' attentive_conv_layer = Attentive_Conv_for_Pair( rng, origin_input_tensor3=common_input, origin_input_tensor3_r=common_input, input_tensor3=common_input, input_tensor3_r=common_input, mask_matrix=sents_mask, mask_matrix_r=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=conv_att_W, b=conv_att_b, W_context=conv_W_context, b_context=conv_b_context) sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l attentive_conv_layer2 = Attentive_Conv_for_Pair( rng, origin_input_tensor3=common_input, origin_input_tensor3_r=common_input, input_tensor3=common_input, input_tensor3_r=common_input, mask_matrix=sents_mask, mask_matrix_r=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=conv_att_W2, b=conv_att_b2, W_context=conv_W_context2, b_context=conv_b_context2) sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l acnn_LR_input = T.concatenate( [sent_att_embeddings, sent_att_embeddings2, bow_emb], axis=1) acnn_LR_input_size = hidden_size[0] * 2 + emb_size #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative acnn_U_a = create_ensemble_para( rng, 12, acnn_LR_input_size) # the weight matrix hidden_size*2 acnn_LR_b = theano.shared(value=np.zeros((12, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class acnn_LR_para = [acnn_U_a, acnn_LR_b] acnn_layer_LR = LogisticRegression( rng, input=acnn_LR_input, n_in=acnn_LR_input_size, n_out=12, W=acnn_U_a, b=acnn_LR_b ) #basically it is a multiplication between weight matrix and input feature vector acnn_score_matrix = T.nnet.sigmoid( acnn_layer_LR.before_softmax) #batch * 12 acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix, acnn_score_matrix) acnn_loss = -T.mean(T.log(acnn_prob_pos)) ''' dataless cosine ''' cosine_scores = normalize_matrix_rowwise(bow_emb).dot( normalize_matrix_rowwise(bow_des).T) cosine_score_matrix = T.nnet.sigmoid( cosine_scores) #(batch_size, type_size) ''' dataless top-30 fine grained cosine ''' fine_grained_cosine = T.batched_dot( repeat_common_input.dimshuffle(0, 2, 1), repeat_des_input) 
#(batch_size*type_size,maxsentlen,describ_max_len) fine_grained_cosine_to_matrix = fine_grained_cosine.reshape( (batch_size * type_size, maxSentLen * describ_max_len)) sort_fine_grained_cosine_to_matrix = T.sort(fine_grained_cosine_to_matrix, axis=1) top_k_simi = sort_fine_grained_cosine_to_matrix[:, -30:] # (batch_size*type_size, 5) max_fine_grained_cosine = T.mean(top_k_simi, axis=1) top_k_cosine_scores = max_fine_grained_cosine.reshape( (batch_size, type_size)) top_k_score_matrix = T.nnet.sigmoid(top_k_cosine_scores) params = multiCNN_para + LR_para + GRU_NN_para + LR_att_para + ACNN_para + acnn_LR_para # put all model parameters together cost = loss + att_loss + acnn_loss + 1e-4 * ((conv_W**2).sum() + (conv_W2**2).sum()) updates = Gradient_Cost_Para(cost, params, learning_rate) ''' testing ''' ensemble_NN_scores = T.max(T.concatenate([ att_score_matrix.dimshuffle('x', 0, 1), score_matrix.dimshuffle('x', 0, 1), acnn_score_matrix.dimshuffle('x', 0, 1) ], axis=0), axis=0) # ''' # majority voting, does not work # ''' # binarize_NN = T.where(ensemble_NN_scores > 0.5, 1, 0) # binarize_dataless = T.where(cosine_score_matrix > 0.5, 1, 0) # binarize_dataless_finegrained = T.where(top_k_score_matrix > 0.5, 1, 0) # binarize_conc = T.concatenate([binarize_NN.dimshuffle('x',0,1), binarize_dataless.dimshuffle('x',0,1),binarize_dataless_finegrained.dimshuffle('x',0,1)],axis=0) # sum_binarize_conc = T.sum(binarize_conc,axis=0) # binarize_prob = T.where(sum_binarize_conc > 0.0, 1, 0) # ''' # sum up prob, works # ''' # ensemble_scores_1 = 0.6*ensemble_NN_scores+0.4*top_k_score_matrix # binarize_prob = T.where(ensemble_scores_1 > 0.3, 1, 0) ''' sum up prob, works ''' ensemble_scores = 0.6 * ensemble_NN_scores + 0.4 * 0.5 * ( cosine_score_matrix + top_k_score_matrix) binarize_prob = T.where(ensemble_scores > 0.3, 1, 0) #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore') train_model = theano.function( [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') # dev_model = theano.function([sents_id_matrix, sents_mask, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function( [sents_id_matrix, sents_mask, des_id_matrix, des_mask], binarize_prob, allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] # n_dev_batches=dev_size/batch_size # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size] n_test_batches = test_size / batch_size test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] # max_acc_dev=0.0 max_meanf1_test = 0.0 max_weightf1_test = 0.0 train_indices = range(train_size) cost_i = 0.0 while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle(train_indices) iter_accu = 0 for batch_id in train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_model(train_sents[train_id_batch], train_masks[train_id_batch], train_labels[train_id_batch], label_sent, label_mask) #after each 1000 batches, we test the performance of the model on all test data if iter % 20 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' past_time = time.time() error_sum = 0.0 all_pred_labels = [] all_gold_labels = [] for test_batch_id in test_batch_start: # for each test batch pred_labels = test_model( test_sents[test_batch_id:test_batch_id + batch_size], test_masks[test_batch_id:test_batch_id + batch_size], label_sent, label_mask) gold_labels = test_labels[test_batch_id:test_batch_id + batch_size] # print 'pred_labels:', pred_labels # print 'gold_labels;', gold_labels all_pred_labels.append(pred_labels) all_gold_labels.append(gold_labels) all_pred_labels = np.concatenate(all_pred_labels) all_gold_labels = np.concatenate(all_gold_labels) test_mean_f1, test_weight_f1 = average_f1_two_array_by_col( all_pred_labels, all_gold_labels) if test_weight_f1 > max_weightf1_test: max_weightf1_test = test_weight_f1 if test_mean_f1 > max_meanf1_test: max_meanf1_test = test_mean_f1 print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
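# The batch-start lists above append `train_size - batch_size` so the final
# partial batch is replaced by one full batch that overlaps the previous one
# (a few tail examples are visited twice per epoch). A small illustration:
import numpy as np

train_size, batch_size = 10, 4
n_batches = train_size / batch_size  # 2 full batches (integer division)
starts = list(np.arange(n_batches) * batch_size) + [train_size - batch_size]
print starts  # [0, 4, 6] -- the last batch covers rows 6..9, overlapping 6..7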
def __init__(self, gen_fn_dcgan, disc_fn_dcgan, gen_params_dcgan, disc_params_dcgan, gen_fn_p2p, disc_fn_p2p, gen_params_p2p, disc_params_p2p, in_shp, latent_dim, is_a_grayscale, is_b_grayscale, alpha=100, opt=adam, opt_args={'learning_rate': theano.shared(floatX(1e-3))}, train_mode='both', reconstruction='l1', sampler=np.random.rand, lsgan=False, verbose=True): """ Two-stage DCGAN/pix2pix GAN. Given training data (pairs) in the form [A,B], the DCGAN maps from prior samples z -> A, and the pix2pix GAN synthesises B images from A images. gen_fn_dcgan: a function that returns the architecture (concretely, the last layer) of the DCGAN. This function should have the signature (latent_dim, is_a_grayscale, ...), where `latent_dim` is the latent dimension, `is_a_grayscale` denotes whether the 'A' image is grayscale or not, and ... denotes optional kwargs. gen_params_dcgan: kwargs to pass to `gen_fn_dcgan`. disc_fn_dcgan: discriminator for the DCGAN. This function should have the signature (in_shp, is_a_grayscale, ...) where `in_shp` denotes the width/height of the generated/real image. disc_params_dcgan: kwargs to pass to `disc_fn_dcgan`. gen_fn_p2p: a function that returns the p2p architecture. This function should have the signature (in_shp, is_a_grayscale, is_b_grayscale, ...). disc_fn_p2p: should have the signature (in_shp, is_a_grayscale, is_b_grayscale) as well. Since this function requires two inputs (the A and B image), it returns a dictionary instead of a Lasagne layer (see `discriminator` in architectures/p2p.py). in_shp: dimensions (width/height) of the A and B image. latent_dim: prior sampling dimension for the DCGAN. is_a_grayscale: is the A image grayscale? is_b_grayscale: is the B image grayscale? alpha: weight of the reconstruction loss for the pix2pix opt: Lasagne optimiser opt_args: kwargs for the optimiser train_mode: if 'both', train both dcgan and p2p at the same time. If 'p2p', train p2p only, if 'dcgan', train DCGAN only. reconstruction: if 'l1', use L1 reconstruction. If 'l2', use L2. sampler: random generator for sampling from the prior distribution. lsgan: use LSGAN formulation? (Generally more stable than regular GAN.) 
verbose: """ assert train_mode in ['dcgan', 'p2p', 'both'] self.is_a_grayscale = is_a_grayscale self.is_b_grayscale = is_b_grayscale self.latent_dim = latent_dim self.sampler = sampler self.in_shp = in_shp self.verbose = verbose self.train_mode = train_mode # get the networks for the dcgan network dcgan_gen = gen_fn_dcgan(latent_dim, is_a_grayscale, **gen_params_dcgan) dcgan_disc = disc_fn_dcgan(in_shp, is_a_grayscale, **disc_params_dcgan) # get the networks for the p2p network p2p_gen = gen_fn_p2p(in_shp, is_a_grayscale, is_b_grayscale, **gen_params_p2p) p2p_disc = disc_fn_p2p(in_shp, is_a_grayscale, is_b_grayscale, **disc_params_p2p) if verbose: print("p2p gen:") self._print_network(dcgan_gen) print("p2p disc:") self._print_network(dcgan_disc) print("p2p gen:") self._print_network(p2p_gen) print("p2p disc:") self._print_network(p2p_disc["out"]) Z = T.fmatrix('Z') # noise var X = T.tensor4('X') # A Y = T.tensor4('Y') # B # construct theano stuff for dcgan gen/disc dcgan = {'gen': dcgan_gen, 'disc': dcgan_disc} dcgan['gen_out'] = get_output(dcgan_gen, Z) # G(z) dcgan['gen_out_det'] = get_output(dcgan_gen, Z, deterministic=True) dcgan['disc_out_real'] = get_output(dcgan_disc, X) # D(x) dcgan['disc_out_fake'] = get_output(dcgan_disc, dcgan['gen_out']) # D(G(z)) # construct theano stuff for the p2p gen/disc p2p = {'gen': p2p_gen, 'disc': p2p_disc["out"]} p2p['disc_out_real'] = get_output(p2p_disc["out"], { p2p_disc["inputs"][0]: X, p2p_disc["inputs"][1]: Y }) # D(X,Y) p2p['gen_out'] = get_output(p2p_gen, X) p2p['gen_out_det'] = get_output(p2p_gen, X, deterministic=True) p2p['disc_out_fake'] = get_output(p2p_disc["out"], { p2p_disc["inputs"][0]: X, p2p_disc["inputs"][1]: p2p['gen_out'] }) # D(X, X_to_y(X)) if lsgan: adv_loss = squared_error else: adv_loss = binary_crossentropy # dcgan loss definitions gen_loss_dcgan = adv_loss(dcgan['disc_out_fake'], 1.).mean() disc_loss_dcgan = adv_loss(dcgan['disc_out_real'], 1.).mean() + adv_loss( dcgan['disc_out_fake'], 0.).mean() # p2p loss definitions gen_loss_p2p = adv_loss(p2p['disc_out_fake'], 1.).mean() assert reconstruction in ['l1', 'l2'] if reconstruction == 'l2': recon_loss = squared_error(p2p['gen_out'], Y).mean() else: recon_loss = T.abs_(p2p['gen_out'] - Y).mean() #if not reconstruction_only: gen_total_loss_p2p = gen_loss_p2p + alpha * recon_loss #else: # #log("GAN disabled, using only pixel-wise reconstruction loss...") # gen_total_loss_p2p = recon_loss disc_loss_p2p = adv_loss(p2p['disc_out_real'], 1.).mean() + adv_loss( p2p['disc_out_fake'], 0.).mean() # dcgan params gen_params_dcgan = get_all_params(dcgan_gen, trainable=True) disc_params_dcgan = get_all_params(dcgan_disc, trainable=True) # pix2pix params gen_params_p2p = get_all_params(p2p_gen, trainable=True) disc_params_p2p = get_all_params(p2p_disc["out"], trainable=True) # -------------------- if verbose: print("train_mode: %s" % train_mode) if train_mode == 'both': updates = opt(gen_loss_dcgan, gen_params_dcgan, **opt_args) # update dcgan generator updates.update(opt(disc_loss_dcgan, disc_params_dcgan, **opt_args)) # update dcgan discriminator updates.update(opt(gen_total_loss_p2p, gen_params_p2p, **opt_args)) # update p2p generator updates.update(opt(disc_loss_p2p, disc_params_p2p, **opt_args)) # update p2p discriminator elif train_mode == 'dcgan': updates = opt(gen_loss_dcgan, gen_params_dcgan, **opt_args) # update dcgan generator updates.update(opt(disc_loss_dcgan, disc_params_dcgan, **opt_args)) # update dcgan discriminator else: updates = opt(gen_total_loss_p2p, gen_params_p2p, 
**opt_args) # update p2p generator updates.update(opt(disc_loss_p2p, disc_params_p2p, **opt_args)) # update p2p discriminator train_fn = theano.function([Z, X, Y], [ gen_loss_dcgan, disc_loss_dcgan, gen_loss_p2p, recon_loss, disc_loss_p2p ], updates=updates, on_unused_input='warn') loss_fn = theano.function([Z, X, Y], [ gen_loss_dcgan, disc_loss_dcgan, gen_loss_p2p, recon_loss, disc_loss_p2p ], on_unused_input='warn') gen_fn = theano.function([X], p2p['gen_out']) gen_fn_det = theano.function([X], p2p['gen_out_det']) z_fn = theano.function([Z], dcgan['gen_out']) z_fn_det = theano.function([Z], dcgan['gen_out_det']) self.train_fn = train_fn self.loss_fn = loss_fn self.gen_fn = gen_fn self.gen_fn_det = gen_fn_det self.z_fn = z_fn self.z_fn_det = z_fn_det self.dcgan = dcgan self.p2p = p2p self.lr = opt_args['learning_rate'] self.train_keys = [ 'dcgan_gen', 'dcgan_disc', 'p2p_gen', 'p2p_recon', 'p2p_disc' ]
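# The `lsgan` flag above only swaps the adversarial loss: binary cross-entropy
# on sigmoid discriminator outputs for a regular GAN, squared error against
# the same 1/0 targets for LSGAN. A standalone sketch of the two objectives
# (toy symbolic discriminator scores, my own example):
import theano.tensor as T
from lasagne.objectives import binary_crossentropy, squared_error

d_real = T.fvector('d_real')  # D(x)
d_fake = T.fvector('d_fake')  # D(G(z))

gen_bce = binary_crossentropy(d_fake, 1.).mean()
disc_bce = (binary_crossentropy(d_real, 1.).mean() +
            binary_crossentropy(d_fake, 0.).mean())

gen_ls = squared_error(d_fake, 1.).mean()
disc_ls = (squared_error(d_real, 1.).mean() +
           squared_error(d_fake, 0.).mean())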
l2 = dropout(l2, p_drop_conv) #l3a = rectify(conv2d(T.cast(l2,'float64'), T.cast(w3,'float64'))) #l3b = pool_2d(l3a, (2, 2)) l3 = T.flatten(l2, outdim=2) #l3 = dropout(l3, p_drop_conv) l4 = rectify(T.dot(l3, w4)) l4 = dropout(l4, p_drop_hidden) pyx = softmax(T.dot(l4, w_o)) return l1, l2, l3, l4, pyx X = T.ftensor4() Y = T.fmatrix() V = T.fscalar() w = init_weights((32, 1, 3, 3)) w2 = init_weights((64, 32, 3, 3)) #w3 = init_weights((128, 64, 3, 3)) w4 = init_weights((64 * 6 * 6, 625)) w_o = init_weights((625, 10)) noise_l1, noise_l2, noise_l3, noise_l4, noise_py_x = model( X, w, w2, w4, 0.2, 0.5) l1, l2, l3, l4, py_x = model(X, w, w2, w4, 0., 0.) y_x = T.argmax(py_x, axis=1) cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y)) params = [w, w2, w4, w_o]
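# T.flatten(..., outdim=2) keeps the batch axis and collapses the rest, which
# is what lets the conv features feed the dense layer w4. A quick shape check
# (my own example):
import numpy as np
import theano
import theano.tensor as T

x4 = T.ftensor4('x4')
shape_fn = theano.function([x4], T.flatten(x4, outdim=2).shape)
print shape_fn(np.zeros((5, 64, 6, 6), dtype='float32'))  # [5, 2304] = 64*6*6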
def evaluate_lenet5(learning_rate=0.008, n_epochs=2000, nkerns=[400], batch_size=1, window_width=3, maxSentLength=30, emb_size=300, hidden_size=[300,10], margin=0.5, L2_weight=0.0001, Div_reg=0.0001, norm_threshold=5.0, use_svm=False): model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/MicrosoftParaphrase/tokenized_msr/'; rng = numpy.random.RandomState(23455) datasets, word2id=load_msr_corpus_20161229(rootPath+'tokenized_train.txt', rootPath+'tokenized_test.txt', maxSentLength) vocab_size=len(word2id)+1 mtPath='/mounts/data/proj/wenpeng/Dataset/paraphraseMT/' mt_train, mt_test=load_mts(mtPath+'concate_15mt_train.txt', mtPath+'concate_15mt_test.txt') wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_number_matching_scores.txt', rootPath+'test_number_matching_scores.txt') indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0] indices_train_l=indices_train[::2] indices_train_r=indices_train[1::2] trainLengths_l=trainLengths[::2] trainLengths_r=trainLengths[1::2] normalized_train_length_l=normalized_train_length[::2] normalized_train_length_r=normalized_train_length[1::2] trainLeftPad_l=trainLeftPad[::2] trainLeftPad_r=trainLeftPad[1::2] trainRightPad_l=trainRightPad[::2] trainRightPad_r=trainRightPad[1::2] indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1] indices_test_l=indices_test[::2] indices_test_r=indices_test[1::2] testLengths_l=testLengths[::2] testLengths_r=testLengths[1::2] normalized_test_length_l=normalized_test_length[::2] normalized_test_length_r=normalized_test_length[1::2] testLeftPad_l=testLeftPad[::2] testLeftPad_r=testLeftPad[1::2] testRightPad_l=testRightPad[::2] testRightPad_r=testRightPad[1::2] train_size = len(indices_train_l) test_size = len(indices_test_l) train_batch_start=range(train_size) test_batch_start=range(test_size) # indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) # indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) # indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) # indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) # indices_train_l=T.cast(indices_train_l, 'int32') # indices_train_r=T.cast(indices_train_r, 'int32') # indices_test_l=T.cast(indices_test_l, 'int32') # indices_test_r=T.cast(indices_test_r, 'int32') rand_values=random_value_normal((vocab_size, emb_size), theano.config.floatX, rng) # rand_values[0]=numpy.array(numpy.zeros(emb_size)) id2word = {y:x for x,y in word2id.iteritems()} word2vec=load_word2vec() rand_values=load_word2vec_to_init_new(rand_values, id2word, word2vec) embeddings=theano.shared(value=numpy.array(rand_values,dtype=theano.config.floatX), borrow=True)#theano.shared(value=rand_values, borrow=True) # allocate symbolic variables for the data # index = T.iscalar() x_index_l = T.imatrix() # now, x is the index matrix, must be integer x_index_r = T.imatrix() y = T.ivector() left_l=T.iscalar() right_l=T.iscalar() left_r=T.iscalar() right_r=T.iscalar() length_l=T.iscalar() length_r=T.iscalar() norm_length_l=T.fscalar() norm_length_r=T.fscalar() mts=T.fmatrix() wmf=T.fmatrix() # cost_tmp=T.fscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # this is the size of MNIST images 
filter_size=(emb_size,window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).dimshuffle(0, 'x', 2,1) layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).dimshuffle(0, 'x', 2,1) conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1])) conv_W_into_matrix=conv_W.reshape((conv_W.shape[0], conv_W.shape[2]*conv_W.shape[3])) #layer0_output = debug_print(layer0.output, 'layer0.output') layer0_l = Conv_with_input_para(rng, input=layer0_l_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_r = Conv_with_input_para(rng, input=layer0_r_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_l_output=debug_print(layer0_l.output, 'layer0_l.output') layer0_r_output=debug_print(layer0_r.output, 'layer0_r.output') layer0_l_output_maxpool = T.max(layer0_l.output_narrow_conv_out[:,:,:,left_l:], axis=3).reshape((1, nkerns[0])) layer0_r_output_maxpool = T.max(layer0_r.output_narrow_conv_out[:,:,:,left_r:], axis=3).reshape((1, nkerns[0])) layer1=Average_Pooling_for_Top(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0], left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1, dim=maxSentLength+filter_size[1]-1) sum_uni_l=T.sum(layer0_l_input[:,:,:,left_l:], axis=3).reshape((1, emb_size)) norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum()) sum_uni_r=T.sum(layer0_r_input[:,:,:,left_r:], axis=3).reshape((1, emb_size)) norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum()) uni_cosine=cosine(sum_uni_l, sum_uni_r) ''' linear=Linear(sum_uni_l, sum_uni_r) poly=Poly(sum_uni_l, sum_uni_r) sigmoid=Sigmoid(sum_uni_l, sum_uni_r) rbf=RBF(sum_uni_l, sum_uni_r) gesd=GESD(sum_uni_l, sum_uni_r) ''' eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2% #eucli_1=EUCLID(sum_uni_l, sum_uni_r) len_l=norm_length_l.reshape((1,1)) len_r=norm_length_r.reshape((1,1)) ''' len_l=length_l.reshape((1,1)) len_r=length_r.reshape((1,1)) ''' #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1)) #length_gap=T.sqrt((len_l-len_r)**2) #layer3_input=mts HL_layer_1_input=T.concatenate([ # mts, eucli_1, #uni_cosine,norm_uni_l-(norm_uni_l+norm_uni_r)/2,#uni_cosine, # uni_cosine, # sum_uni_l, # sum_uni_r, # sum_uni_l+sum_uni_r, 1.0/(1.0+EUCLID(layer0_l_output_maxpool, layer0_r_output_maxpool)), cosine(layer0_l_output_maxpool, layer0_r_output_maxpool), layer0_l_output_maxpool, layer0_r_output_maxpool, T.sqrt((layer0_l_output_maxpool-layer0_r_output_maxpool)**2+1e-10), layer1.output_eucli_to_simi, #layer1.output_cosine,layer1.output_vector_l-(layer1.output_vector_l+layer1.output_vector_r)/2,#layer1.output_cosine, # layer1.output_cosine, layer1.output_vector_l, layer1.output_vector_r, T.sqrt((layer1.output_vector_l-layer1.output_vector_r)**2+1e-10), # len_l, len_r layer1.output_attentions # wmf, ], axis=1)#, layer2.output, 
layer1.output_cosine], axis=1) HL_layer_1_input_with_extra=T.concatenate([#HL_layer_1_input, mts, len_l, len_r # wmf ], axis=1)#, layer2.output, layer1.output_cosine], axis=1) HL_layer_1_input_size=1+1+ 1+1+3* nkerns[0] +1+1+3*nkerns[0]+10*10 HL_layer_1_input_with_extra_size = HL_layer_1_input_size+15+2 HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size[0], activation=T.tanh) HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size[0], n_out=hidden_size[1], activation=T.tanh) LR_layer_input=T.concatenate([HL_layer_2.output, HL_layer_1.output, HL_layer_1_input],axis=1) LR_layer_input_with_extra=T.concatenate([HL_layer_2.output, HL_layer_1_input_with_extra],axis=1)#HL_layer_1.output, LR_layer=LogisticRegression(rng, input=LR_layer_input, n_in=HL_layer_1_input_size+hidden_size[0]+hidden_size[1], n_out=2) # LR_layer_input=HL_layer_2.output # LR_layer=LogisticRegression(rng, input=LR_layer_input, n_in=hidden_size, n_out=2) # layer3=LogisticRegression(rng, input=layer3_input, n_in=15+1+1+2+3, n_out=2) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg =debug_print((LR_layer.W** 2).sum()+(HL_layer_2.W** 2).sum()+(HL_layer_1.W** 2).sum()+(conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum() # diversify_reg= Diversify_Reg(LR_layer.W.T)+Diversify_Reg(HL_layer_2.W.T)+Diversify_Reg(HL_layer_1.W.T)+Diversify_Reg(conv_W_into_matrix) cost_this =debug_print(LR_layer.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg cost=cost_this+L2_weight*L2_reg#+Div_reg*diversify_reg test_model = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r, mts,wmf], [LR_layer.errors(y), LR_layer.y_pred, LR_layer_input_with_extra, y], on_unused_input='ignore',allow_input_downcast=True) params = LR_layer.params+ HL_layer_2.params+HL_layer_1.params+[conv_W, conv_b]+[embeddings]#+[embeddings]# + layer1.params accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): clipped_grad = T.clip(grad_i, -0.5, 0.5) acc = acc_i + T.sqr(clipped_grad) updates.append((param_i, param_i - learning_rate * clipped_grad / T.sqrt(acc+1e-10))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r, mts,wmf], [cost,LR_layer.errors(y)], updates=updates, on_unused_input='ignore',allow_input_downcast=True) train_model_predict = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r, mts,wmf], [cost_this,LR_layer.errors(y), LR_layer_input_with_extra, y],on_unused_input='ignore',allow_input_downcast=True) ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is best_params = None best_validation_loss = numpy.inf test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False max_acc=0.0 nn_max_acc=0.0 best_iter=0 cost_tmp=0.0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 shuffle(train_batch_start)#shuffle training data for index in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * train_size + minibatch_index +1 minibatch_index=minibatch_index+1 # if iter%update_freq != 0: # cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start) # #print 'cost_ij: ', cost_ij # cost_tmp+=cost_ij # error_sum+=error_ij # else: cost_i, error_i= train_model(indices_train_l[index: index + batch_size], indices_train_r[index: index + batch_size], trainY[index: index + batch_size], trainLeftPad_l[index], trainRightPad_l[index], trainLeftPad_r[index], trainRightPad_r[index], trainLengths_l[index], trainLengths_r[index], normalized_train_length_l[index], normalized_train_length_r[index], mt_train[index: index + batch_size], wm_train[index: index + batch_size]) cost_tmp+=cost_i if iter < 6000 and iter %100 ==0: print 'training @ iter = '+str(iter)+' average cost: '+str(cost_tmp/iter) if iter >= 6000 and iter % 100 == 0: # if iter%100 ==0: print 'training @ iter = '+str(iter)+' average cost: '+str(cost_tmp/iter) test_losses=[] test_y=[] test_features=[] for index in test_batch_start: test_loss, pred_y, layer3_input, y=test_model(indices_test_l[index: index + batch_size], indices_test_r[index: index + batch_size], testY[index: index + batch_size], testLeftPad_l[index], testRightPad_l[index], testLeftPad_r[index], testRightPad_r[index], testLengths_l[index], testLengths_r[index], normalized_test_length_l[index], normalized_test_length_r[index], mt_test[index: index + batch_size], wm_test[index: index + batch_size]) #test_losses = [test_model(i) for i in test_batch_start] test_losses.append(test_loss) test_y.append(y[0]) test_features.append(layer3_input[0]) #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+ #write_file.close() test_score = numpy.mean(test_losses) test_acc = (1-test_score) * 100. 
if test_acc > nn_max_acc: nn_max_acc = test_acc print '\t\t\tepoch:', epoch, 'iter:', iter, 'current acc:', test_acc, 'nn_max_acc:', nn_max_acc #now, see the results of svm if use_svm: train_y=[] train_features=[] for index in train_batch_start: cost_ij, error_ij, layer3_input, y=train_model_predict(indices_train_l[index: index + batch_size], indices_train_r[index: index + batch_size], trainY[index: index + batch_size], trainLeftPad_l[index], trainRightPad_l[index], trainLeftPad_r[index], trainRightPad_r[index], trainLengths_l[index], trainLengths_r[index], normalized_train_length_l[index], normalized_train_length_r[index], mt_train[index: index + batch_size], wm_train[index: index + batch_size]) train_y.append(y[0]) train_features.append(layer3_input[0]) #write_feature.write(' '.join(map(str,layer3_input[0]))+'\n') #write_feature.close() clf = svm.SVC(kernel='linear')#OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33 clf.fit(train_features, train_y) results=clf.predict(test_features) lr=LinearRegression().fit(train_features, train_y) results_lr=lr.predict(test_features) corr_count=0 corr_lr=0 test_size=len(test_y) for i in range(test_size): if results[i]==test_y[i]: corr_count+=1 if numpy.absolute(results_lr[i]-test_y[i])<0.5: corr_lr+=1 acc=corr_count*1.0/test_size acc_lr=corr_lr*1.0/test_size if acc > max_acc: max_acc=acc best_iter=iter if acc_lr> max_acc: max_acc=acc_lr best_iter=iter print '\t\t\t\tsvm acc: ', acc, 'LR acc: ', acc_lr, ' max acc: ', max_acc , ' at iter: ', best_iter if patience <= iter: done_looping = True break end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
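# The update loop above is AdaGrad with hard gradient clipping: clip each
# gradient to [-0.5, 0.5], accumulate its square, and scale the step by
# 1/sqrt(acc + 1e-10). A one-step numpy sketch:
import numpy as np

lr = 0.008
param = np.zeros(3)
acc = np.zeros(3)  # running sum of squared (clipped) gradients
grad = np.array([2.0, -0.1, 0.4])

clipped = np.clip(grad, -0.5, 0.5)
acc += clipped ** 2
param -= lr * clipped / np.sqrt(acc + 1e-10)
print param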
import numpy as np
import pandas as pd
import theano
import theano.tensor as T

from materials import iris_dataset

# data
iris_data = iris_dataset()
iris_data = iris_data.reindex(np.random.permutation(iris_data.index))
iris_x = iris_data[iris_data.columns[:4]].as_matrix()
iris_y = pd.get_dummies(iris_data[iris_data.columns[4]]).values

input_dim = iris_x.shape[1]
hidden_dim = 9
output_dim = iris_y.shape[1]

# models
X = T.fmatrix('x')
Y = T.fmatrix('y')
W_i = theano.shared(np.random.randn(input_dim, hidden_dim), name='W_i')
b_i = theano.shared(np.zeros((hidden_dim, )), name='b_i')
W_h = theano.shared(np.random.randn(hidden_dim, output_dim), name='W_h')
b_h = theano.shared(np.zeros((output_dim, )), name='b_h')
o_h = T.nnet.sigmoid(T.dot(X, W_i) + b_i)
p_y_given_x = T.nnet.sigmoid(T.dot(o_h, W_h) + b_h)

# training setup
params = [W_i, b_i, W_h, b_h]
predict_func = theano.function(inputs=[X], outputs=p_y_given_x,
                               allow_input_downcast=True)
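# The snippet compiles only predict_func; a minimal sketch of a matching
# training step (my addition, reusing X, Y, params and p_y_given_x from
# above, with a squared-error loss and plain gradient descent):
loss = T.mean((p_y_given_x - Y) ** 2)
grads = T.grad(loss, params)
updates = [(p, p - 0.1 * g) for p, g in zip(params, grads)]
train_func = theano.function(inputs=[X, Y], outputs=loss,
                             updates=updates, allow_input_downcast=True)

for _ in range(100):
    print train_func(iris_x, iris_y)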
# function header inferred from the call `policy_network(X_state)` below
def policy_network(input_state):
    dense_1 = DenseLayer(input_state, num_units=n_input, nonlinearity=tanh)
    dense_2 = DenseLayer(dense_1, num_units=n_input, nonlinearity=tanh)
    probs = DenseLayer(dense_2, num_units=n_output, nonlinearity=softmax)
    return probs


X_state = T.fmatrix()
X_action = T.bvector()
X_reward = T.fvector()
X_action_hot = to_one_hot(X_action, n_output)

prob_values = policy_network(X_state)
policy_ = get_output(prob_values)
policy = theano.function(inputs=[X_state], outputs=policy_,
                         allow_input_downcast=True)

loss = categorical_crossentropy(policy_, X_action_hot) * X_reward
loss = loss.mean()
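# X_reward above weights each step's cross-entropy, the REINFORCE trick.
# Policy-gradient code usually fills it with discounted returns; a common
# helper (my own, not part of the snippet):
import numpy as np

def discount_rewards(rewards, gamma=0.99):
    # G_t = r_t + gamma * G_{t+1}, computed right-to-left
    out = np.zeros_like(rewards, dtype='float32')
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        out[t] = running
    return out

print discount_rewards(np.array([0.0, 0.0, 1.0]))  # [0.9801, 0.99, 1.0]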
def test_local_gpu_elemwise(): """ Test local_gpu_elemwise when there is a dtype upcastable to float32 """ a = tensor.bmatrix() b = tensor.fmatrix() c = tensor.fmatrix() a_v = (numpy.random.rand(4, 5) * 10).astype("int8") b_v = (numpy.random.rand(4, 5) * 10).astype("float32") c_v = (numpy.random.rand(4, 5) * 10).astype("float32") # Due to optimization order, this composite is created when all # the op are on the gpu. f = theano.function([a, b, c], a + b + c, mode=mode_with_gpu) topo = f.maker.fgraph.toposort() assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1 assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0 utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v) # Now test with the composite already on the cpu before we move it # to the gpu a_s = theano.scalar.int8() b_s = theano.scalar.float32() c_s = theano.scalar.float32() out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s + c_s]) out_op = tensor.Elemwise(out_s) f = theano.function([a, b, c], out_op(a, b, c), mode=mode_with_gpu) topo = f.maker.fgraph.toposort() assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1 assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0 utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v) return # Not yet implemeted # Test multiple output a_s = theano.scalar.float32() a = tensor.fmatrix() from theano.scalar.basic import identity out_s = theano.scalar.Composite( [a_s, b_s, c_s], [identity(a_s), identity(c_s), identity(b_s)]) outs_op = tensor.Elemwise(out_s) f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu) topo = f.maker.fgraph.toposort() assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1 assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0 out = f(a_v, b_v, c_v) utt.assert_allclose(out[0], a_v) utt.assert_allclose(out[1], c_v) utt.assert_allclose(out[2], b_v) # Test multiple output out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * b_s]) outs_op = tensor.Elemwise(out_s) f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu) topo = f.maker.fgraph.toposort() assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1 assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0 out = f(a_v, b_v, c_v) utt.assert_allclose(out[0], a_v + b_v) utt.assert_allclose(out[1], a_v * c_v) # Test non-contiguous input c = gpuarray_shared_constructor(numpy.asarray(c_v, dtype='float32')) f = theano.function([a, b], outs_op(a[::2], b[::2], c[::2]), mode=mode_with_gpu) out = f(a_v, b_v) utt.assert_allclose(out[0], a_v[::2] + b_v[::2]) utt.assert_allclose(out[1], a_v[::2] * c_v[::2])
def build(self, dropout, char_dim, char_lstm_dim, char_bidirect, word_dim, word_lstm_dim, word_bidirect, lr_method, pre_emb, crf, cap_dim, training=True, **kwargs ): """ Build the network. """ # Training parameters n_words = len(self.id_to_word) n_chars = len(self.id_to_char) n_tags = len(self.id_to_tag) # Number of capitalization features if cap_dim: n_cap = 4 # Network variables is_train = T.iscalar('is_train') word_ids = T.ivector(name='word_ids') alpha_mask = T.fmatrix(name='alpha_mask') char_for_ids = T.imatrix(name='char_for_ids') char_rev_ids = T.imatrix(name='char_rev_ids') char_pos_ids = T.ivector(name='char_pos_ids') tag_ids = T.ivector(name='tag_ids') if cap_dim: cap_ids = T.ivector(name='cap_ids') # Sentence length s_len = (word_ids if word_dim else char_pos_ids).shape[0] # Final input (all word features) input_dim = 0 inputs = [] # # Word inputs # if word_dim: input_dim += word_dim word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer') word_input = word_layer.link(word_ids) inputs.append(word_input) # Initialize with pretrained embeddings if pre_emb and training: new_weights = word_layer.embeddings.get_value() print 'Loading pretrained embeddings from %s...' % pre_emb pretrained = {} emb_invalid = 0 for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')): line = line.rstrip().split() if len(line) == word_dim + 1: pretrained[line[0]] = np.array( [float(x) for x in line[1:]] ).astype(np.float32) else: emb_invalid += 1 if emb_invalid > 0: print 'WARNING: %i invalid lines' % emb_invalid c_found = 0 c_lower = 0 c_zeros = 0 # Lookup table initialization for i in xrange(n_words): word = self.id_to_word[i] if word in pretrained: new_weights[i] = pretrained[word] c_found += 1 elif word.lower() in pretrained: new_weights[i] = pretrained[word.lower()] c_lower += 1 elif re.sub('\d', '0', word.lower()) in pretrained: new_weights[i] = pretrained[ re.sub('\d', '0', word.lower()) ] c_zeros += 1 word_layer.embeddings.set_value(new_weights) print 'Loaded %i pretrained embeddings.' % len(pretrained) print ('%i / %i (%.4f%%) words have been initialized with ' 'pretrained embeddings.') % ( c_found + c_lower + c_zeros, n_words, 100. 
* (c_found + c_lower + c_zeros) / n_words ) print ('%i found directly, %i after lowercasing, ' '%i after lowercasing + zero.') % ( c_found, c_lower, c_zeros ) # # Chars inputs # if char_dim: input_dim += char_lstm_dim char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer') char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_for') char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True, name='char_lstm_rev') char_lstm_for.link(char_layer.link(char_for_ids)) char_lstm_rev.link(char_layer.link(char_rev_ids)) char_for_output = char_lstm_for.h.dimshuffle((1, 0, 2))[ T.arange(s_len), char_pos_ids ] char_rev_output = char_lstm_rev.h.dimshuffle((1, 0, 2))[ T.arange(s_len), char_pos_ids ] inputs.append(char_for_output) if char_bidirect: inputs.append(char_rev_output) input_dim += char_lstm_dim # # Capitalization feature # if cap_dim: input_dim += cap_dim cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer') inputs.append(cap_layer.link(cap_ids)) # Prepare final input if len(inputs) != 1: inputs = T.concatenate(inputs, axis=1) # # Dropout on final input # if dropout: dropout_layer = DropoutLayer(p=dropout) input_train = dropout_layer.link(inputs) input_test = (1 - dropout) * inputs inputs = T.switch(T.neq(is_train, 0), input_train, input_test) # LSTM for words word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_for') word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False, name='word_lstm_rev') word_lstm_for.link(inputs) word_lstm_rev.link(inputs[::-1, :]) word_for_output = word_lstm_for.h word_rev_output = word_lstm_rev.h[::-1, :] if word_bidirect: final_output = T.concatenate( [word_for_output, word_rev_output], axis=1 ) tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim, name='tanh_layer', activation='tanh') final_output = tanh_layer.link(final_output) else: final_output = word_for_output # Sentence to Named Entity tags - Score final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer', activation=(None if crf else 'softmax')) tags_scores = final_layer.link(final_output) # No CRF if not crf: cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean() # CRF else: transitions = shared((n_tags + 2, n_tags + 2), 'transitions') small = -1000 b_s = np.array([[small] * n_tags + [0, small]]).astype(np.float32) e_s = np.array([[small] * n_tags + [small, 0]]).astype(np.float32) observations = T.concatenate( [tags_scores, small * T.ones((s_len, 2))], axis=1 ) observations = T.concatenate( [b_s, observations, e_s], axis=0 ) # Score from tags real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() # Score from transitions b_id = theano.shared(value=np.array([n_tags], dtype=np.int32)) e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32)) padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0) real_path_score += transitions[ padded_tags_ids[T.arange(s_len + 1)], padded_tags_ids[T.arange(s_len + 1) + 1] ].sum() all_paths_scores = forward(observations, transitions) cost = - (real_path_score - all_paths_scores) # Network parameters params = [] if word_dim: self.add_component(word_layer) params.extend(word_layer.params) if char_dim: self.add_component(char_layer) self.add_component(char_lstm_for) params.extend(char_layer.params) params.extend(char_lstm_for.params) if char_bidirect: self.add_component(char_lstm_rev) params.extend(char_lstm_rev.params) self.add_component(word_lstm_for) params.extend(word_lstm_for.params) if word_bidirect: self.add_component(word_lstm_rev) 
params.extend(word_lstm_rev.params) if cap_dim: self.add_component(cap_layer) params.extend(cap_layer.params) self.add_component(final_layer) params.extend(final_layer.params) if crf: self.add_component(transitions) params.append(transitions) if word_bidirect: self.add_component(tanh_layer) params.extend(tanh_layer.params) # Prepare train and eval inputs eval_inputs = [] if word_dim: eval_inputs.append(word_ids) if char_dim: eval_inputs.append(char_for_ids) if char_bidirect: eval_inputs.append(char_rev_ids) eval_inputs.append(char_pos_ids) if cap_dim: eval_inputs.append(cap_ids) train_inputs = eval_inputs + [tag_ids] conf_inputs = eval_inputs + [alpha_mask] # Parse optimization method parameters if "-" in lr_method: lr_method_name = lr_method[:lr_method.find('-')] lr_method_parameters = {} for x in lr_method[lr_method.find('-') + 1:].split('-'): split = x.split('_') assert len(split) == 2 lr_method_parameters[split[0]] = float(split[1]) else: lr_method_name = lr_method lr_method_parameters = {} # Compile training function print 'Compiling...' if training: updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params, **lr_method_parameters) f_train = theano.function( inputs=train_inputs, outputs=cost, updates=updates, givens=({is_train: np.cast['int32'](1)} if dropout else {}) ) else: f_train = None # Compile evaluation function if not crf: f_eval = theano.function( inputs=eval_inputs, outputs=tags_scores, givens=({is_train: np.cast['int32'](0)} if dropout else {}) ) else: f_eval = theano.function( inputs=eval_inputs, outputs=forward(observations, transitions, viterbi=True, return_alpha=False, return_best_sequence=True), givens=({is_train: np.cast['int32'](0)} if dropout else {}) ) f_conf = theano.function( inputs=conf_inputs, outputs=conf(observations, transitions, alpha_mask), givens=({is_train: np.cast['int32'](0)} if dropout else {}), on_unused_input='ignore' ) return f_train, f_eval, f_conf
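# The lr_method string parsed above has the form "name-key_value-key_value";
# for example:
lr_method = 'sgd-lr_0.005-momentum_0.9'
name = lr_method[:lr_method.find('-')]
parameters = {}
for x in lr_method[lr_method.find('-') + 1:].split('-'):
    key, value = x.split('_')
    parameters[key] = float(value)
print name, parameters  # sgd {'lr': 0.005, 'momentum': 0.9}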
if type_mod == "alexnet":
    dim_in = 9216
if type_mod == "vgg_16":
    dim_in = 25088
if type_mod == "vgg_19":
    dim_in = 9216
if type_mod == "googlenet":
    dim_in = 9216

faceset = "lfpw"
fd_data = "../../inout/data/face/" + faceset + "_data/"
path_valid = fd_data + type_mod + "valid.pkl"
w, h = 50, 50
if type_mod is not None and type_mod != "":
    w, h = dim_in, 1
input = T.tensor4("x_input")
output = T.fmatrix("y_output")
# Create mixed data
nbr_sup, nbr_xx, nbr_yy = 676, 0, 0
id_data = type_mod + "ch_tr_" + str(nbr_sup) + '_' + str(nbr_xx) + '_' +\
    str(nbr_yy)
# List train chunks
l_ch_tr = [
    fd_data + id_data + "_" + str(i) + ".pkl" for i in range(0, 1)]
time_exp = DT.datetime.now().strftime('%m_%d_%Y_%H_%M_%s')
fold_exp = "../../exps/" + faceset + "_deep_convaeIN_" + time_exp
if not os.path.exists(fold_exp):
    os.makedirs(fold_exp)
nbr_layers = 5
init_w_path = "../../inout/init_weights/deep_conv_ae_IN_" +\
def sgd(cost, params, lr=0.05):
    # Function header restored from context; the default lr is an assumption
    # based on the two-argument call `sgd(cost, params)` below.
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        updates.append([p, p - g * lr])
    return updates

def model(X, w_h, w_o):
    h = T.nnet.sigmoid(T.dot(X, w_h))
    pyx = T.nnet.softmax(T.dot(h, w_o))
    return pyx

trX, teX, trY, teY = mnist(onehot=True)

X = T.fmatrix()
Y = T.fmatrix()

w_h = init_weights((784, 625))
w_o = init_weights((625, 10))

py_x = model(X, w_h, w_o)
y_x = T.argmax(py_x, axis=1)

cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y))
params = [w_h, w_o]
updates = sgd(cost, params)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates,
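# A minimal, self-contained sketch of the same SGD update rule on a toy
# problem, assuming only theano is available; it minimizes (w - 3)^2.
import numpy as np
import theano
import theano.tensor as T

w = theano.shared(0.0, name='w')
cost_toy = (w - 3.0) ** 2
step = theano.function([], cost_toy,
                       updates=[(w, w - 0.1 * T.grad(cost_toy, w))])
for _ in range(100):
    step()
print(w.get_value())  # approaches 3.0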
def test_GpuCrossentropySoftmaxArgmax1HotWithBias():
    """
    This is a basic test for GpuCrossentropySoftmaxArgmax1HotWithBias.

    We check that the kernel loops when there are too many threads.
    """
    n_in = 1000
    batch_size = 4097
    n_out = 1250

    if not isinstance(mode_with_gpu, theano.compile.DebugMode):
        n_in = 4098
        n_out = 4099

    y = T.lvector('y')
    b = T.fvector('b')

    # We precompute the dot product with a big shape beforehand so that the
    # test of GpuCrossentropySoftmax1HotWithBiasDx does not fail with the
    # error "the launch timed out and was terminated" on GPU cards that are
    # not powerful enough. We need the big shape to check the corner case.
    dot_result = T.fmatrix('dot_result')

    # Seed numpy.random with config.unittests.rseed
    utt.seed_rng()

    xx = np.asarray(np.random.rand(batch_size, n_in), dtype=np.float32)
    yy = np.ones((batch_size, ), dtype='int32')
    b_values = np.zeros((n_out, ), dtype='float32')
    W_values = np.asarray(np.random.rand(n_in, n_out), dtype='float32')

    dot_value = np.asarray(np.dot(xx, W_values), dtype='float32')
    del W_values

    p_y_given_x = T.nnet.softmax(dot_result + b)
    y_pred = T.argmax(p_y_given_x, axis=-1)
    loss = -T.mean(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
    dW = T.grad(loss, dot_result)
    classify = theano.function(inputs=[y, b, dot_result],
                               outputs=[loss, y_pred, dW],
                               mode=mode_without_gpu)
    classify_gpu = theano.function(inputs=[y, b, dot_result],
                                   outputs=[loss, y_pred, dW],
                                   mode=mode_with_gpu)

    assert any([
        isinstance(node.op, T.nnet.CrossentropySoftmaxArgmax1HotWithBias)
        for node in classify.maker.fgraph.toposort()
    ])
    assert any([
        isinstance(node.op, GpuCrossentropySoftmaxArgmax1HotWithBias)
        for node in classify_gpu.maker.fgraph.toposort()
    ])

    out = classify(yy, b_values, dot_value)
    gout = classify_gpu(yy, b_values, dot_value)
    assert len(out) == len(gout) == 3
    utt.assert_allclose(out[0], gout[0])
    utt.assert_allclose(out[2], gout[2], atol=3e-6)
    utt.assert_allclose(out[1], gout[1])
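# For reference, a minimal numpy sketch of what the fused op above computes
# (softmax over `dot_result + b`, the NLL of the true labels, and the
# argmax); the function name is illustrative only.
import numpy as np

def softmax_xent_argmax(dot_result, b, y):
    z = dot_result + b                                # (batch, n_out)
    z = z - z.max(axis=1, keepdims=True)              # numerical stability
    p = np.exp(z) / np.exp(z).sum(axis=1, keepdims=True)
    loss = -np.mean(np.log(p[np.arange(len(y)), y]))
    return loss, p.argmax(axis=1)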
unsup_weight_var = T.scalar('unsup_weight')
learning_rate_var = T.scalar('learning_rate')
adam_beta1_var = T.scalar('adam_beta1')

# # Left SDP length
# left_sdp_length = T.imatrix('left_sdp_length')
# # Sentence lengths
# sen_length = T.imatrix('sen_length')

# negative loss
negative_loss_alpha = T.fvector("negative_loss_alpha")
negative_loss_lamda = T.fscalar("negative_loss_lamda")

# input attention: entity and root
input_root = T.fmatrix("input_root")
input_e1 = T.fmatrix("input_e1")
input_e2 = T.fmatrix("input_e2")
epoch_att = T.iscalar("epoch_att")

"""
2. Build the GRU network (trained with ADAM)
"""
gru_network, l_in, l_mask, l_gru_forward, l_split_cnn = model.bulit_gru(input_var, mask_var)

# mask_train_input: entries with "1" are kept, entries with "0" are masked out.
mask_train_input = kbp_data.mask_train_input(training_label, num_labels=model.num_labels)

# Create a loss expression for training, i.e., a scalar objective we want
def make_predict_next(net):
    out_prev = T.imatrix()
    rep_prev = T.fmatrix()
    rep = net.LM(rep_prev, net.Embed(out_prev))
    out = softmax3d(net.Embed.unembed(net.ToTxt(rep)))
    return theano.function([rep_prev, out_prev], [last(rep), out])
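# A hedged sketch of how a step function like the one returned above could
# drive greedy decoding; `predict_next`, the initial state `rep0`, and the
# start/stop token ids are hypothetical placeholders, not part of the
# original code.
def greedy_decode(predict_next, rep0, bos_id, eos_id, max_len=50):
    rep, tokens = rep0, [bos_id]
    for _ in range(max_len):
        rep, out = predict_next(rep, [[tokens[-1]]])
        tokens.append(int(out[-1, 0].argmax()))  # most likely next token
        if tokens[-1] == eos_id:
            break
    return tokens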
if __name__ == '__main__':
    import os
    os.environ['THEANO_FLAGS'] = "floatX=float32, mode=FAST_RUN, lib.cnmem=0, warn_float64='raise'"
    import numpy as np, time
    import theano
    from lasagne_ext.objectives import CTC_Logscale
    from theano import tensor
    from torch.autograd import Variable
    # from ctc import best_path_decode
    # np.random.seed(33)

    B = 10
    C = 50
    L = 10
    T = 500
    x1, x2, x3, x4, x5 = tensor.fmatrix(name='queryseq'), \
        tensor.tensor3(dtype='float32', name='scorematrix'), \
        tensor.fmatrix(name='queryseq_mask'), \
        tensor.fmatrix(name='scorematrix_mask'), \
        tensor.fscalar(name='blank_symbol')

    scorematrix = np.random.rand(T, C + 1, B).astype(np.float32)
    query = np.random.randint(0, C, (L, B)).astype(np.float32)
    query_mask = np.random.rand(L, B) > 0.1
    sm_mask = np.random.rand(T, B) > 0.1
    result = CTC_Logscale.cost(x1, x2, x3, x4, x5, align='pre')
    f2 = theano.function([x1, x2, x3, x4, x5], result, on_unused_input='warn')
    time2 = time.time()
    result = f2(query, scorematrix, query_mask.astype(np.float32),
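# The boolean masks above are built at random for benchmarking; in practice
# a (time, batch) mask usually comes from per-sequence lengths. A minimal
# numpy sketch, assuming sequences are padded along the first axis:
import numpy as np

def length_mask(lengths, max_len):
    # mask[t, b] == 1.0 while t < lengths[b], else 0.0
    return (np.arange(max_len)[:, None] < np.asarray(lengths)[None, :]).astype('float32')

print(length_mask([3, 1], 4))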
def __init__(self, We_initial, char_embedd_table_initial, params): self.textfile = open(params.outfile, 'w') We = theano.shared(We_initial) We_inf = theano.shared(We_initial) embsize = We_initial.shape[1] hidden = params.hidden hidden_inf = params.hidden_inf input_var = T.imatrix(name='inputs') target_var = T.imatrix(name='targets') mask_var = T.fmatrix(name='masks') mask_var1 = T.fmatrix(name='masks1') length = T.iscalar() t_t = T.fscalar() Wyy0 = np.random.uniform( -0.02, 0.02, (params.num_labels + 1, params.num_labels)).astype('float32') Wyy = theano.shared(Wyy0) char_input_var = T.itensor3() char_embedd_dim = params.char_embedd_dim char_dic_size = len(params.char_dic) char_embedd_table = theano.shared(char_embedd_table_initial) char_embedd_table_inf = theano.shared(char_embedd_table_initial) l_in_word = lasagne.layers.InputLayer((None, None)) l_mask_word = lasagne.layers.InputLayer(shape=(None, None)) if params.emb == 1: l_emb_word = lasagne.layers.EmbeddingLayer( l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We) else: l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We) layer_char_input = lasagne.layers.InputLayer(shape=(None, None, Max_Char_Length), input_var=char_input_var, name='char-input') layer_char = lasagne.layers.reshape(layer_char_input, (-1, [2])) layer_char_embedding = lasagne.layers.EmbeddingLayer( layer_char, input_size=char_dic_size, output_size=char_embedd_dim, W=char_embedd_table, name='char_embedding') layer_char = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1)) # first get some necessary dimensions or parameters conv_window = 3 num_filters = params.num_filters # construct convolution layer cnn_layer = lasagne.layers.Conv1DLayer( layer_char, num_filters=num_filters, filter_size=conv_window, pad='full', nonlinearity=lasagne.nonlinearities.tanh, name='cnn') # infer the pool size for pooling (pool size should go through all time step of cnn) _, _, pool_size = cnn_layer.output_shape # construct max pool layer pool_layer = lasagne.layers.MaxPool1DLayer(cnn_layer, pool_size=pool_size) # reshape the layer to match lstm incoming layer [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters] output_cnn_layer = lasagne.layers.reshape(pool_layer, (-1, length, [1])) # finally, concatenate the two incoming layers together. 
l_emb_word = lasagne.layers.concat([output_cnn_layer, l_emb_word], axis=2) l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word) l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word, backwards=True) concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2) l_reshape_concat = lasagne.layers.ReshapeLayer(concat, (-1, 2 * hidden)) l_local = lasagne.layers.DenseLayer( l_reshape_concat, num_units=params.num_labels, nonlinearity=lasagne.nonlinearities.linear) network_params = lasagne.layers.get_all_params(l_local, trainable=True) network_params.append(Wyy) print len(network_params) f = open( 'ccctag_BiLSTM_CNN_CRF_num_filters_30_dropout_1_LearningRate_0.01_0.0_400_emb_1_tagversoin_2.pickle', 'r') data = pickle.load(f) f.close() for idx, p in enumerate(network_params): p.set_value(data[idx]) l_in_word_a = lasagne.layers.InputLayer((None, None)) l_mask_word_a = lasagne.layers.InputLayer(shape=(None, None)) l_emb_word_a = lasagne.layers.EmbeddingLayer( l_in_word_a, input_size=We_initial.shape[0], output_size=embsize, W=We_inf, name='inf_word_embedding') layer_char_input_a = lasagne.layers.InputLayer( shape=(None, None, Max_Char_Length), input_var=char_input_var, name='char-input') layer_char_a = lasagne.layers.reshape(layer_char_input_a, (-1, [2])) layer_char_embedding_a = lasagne.layers.EmbeddingLayer( layer_char_a, input_size=char_dic_size, output_size=char_embedd_dim, W=char_embedd_table_inf, name='char_embedding') layer_char_a = lasagne.layers.DimshuffleLayer(layer_char_embedding_a, pattern=(0, 2, 1)) # first get some necessary dimensions or parameters conv_window = 3 num_filters = params.num_filters #_, sent_length, _ = incoming2.output_shape # dropout before cnn? if params.dropout: layer_char_a = lasagne.layers.DropoutLayer(layer_char_a, p=0.5) # construct convolution layer cnn_layer_a = lasagne.layers.Conv1DLayer( layer_char_a, num_filters=num_filters, filter_size=conv_window, pad='full', nonlinearity=lasagne.nonlinearities.tanh, name='cnn') # infer the pool size for pooling (pool size should go through all time step of cnn) #_, _, pool_size = cnn_layer.output_shape # construct max pool layer pool_layer_a = lasagne.layers.MaxPool1DLayer(cnn_layer_a, pool_size=pool_size) # reshape the layer to match lstm incoming layer [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters] output_cnn_layer_a = lasagne.layers.reshape(pool_layer_a, (-1, length, [1])) # finally, concatenate the two incoming layers together. 
l_emb_word_a = lasagne.layers.concat( [output_cnn_layer_a, l_emb_word_a], axis=2) if params.dropout: l_emb_word_a = lasagne.layers.DropoutLayer(l_emb_word_a, p=0.5) if (params.inf == 0): l_lstm_wordf_a = lasagne.layers.LSTMLayer(l_emb_word_a, hidden_inf, mask_input=l_mask_word_a) l_lstm_wordb_a = lasagne.layers.LSTMLayer(l_emb_word_a, hidden_inf, mask_input=l_mask_word_a, backwards=True) l_reshapef_a = lasagne.layers.ReshapeLayer(l_lstm_wordf_a, (-1, hidden_inf)) l_reshapeb_a = lasagne.layers.ReshapeLayer(l_lstm_wordb_a, (-1, hidden_inf)) concat2_a = lasagne.layers.ConcatLayer( [l_reshapef_a, l_reshapeb_a]) else: """ ### unigram l_cnn_input_a = lasagne.layers.DimshuffleLayer(l_emb_word_a, (0, 2, 1)) #l_cnn_1_a = lasagne.layers.Conv1DLayer(l_cnn_input_a, hidden, 3, 1, pad = 'same') #l_cnn_3_a = lasagne.layers.Conv1DLayer(l_cnn_input_a, hidden, 1, 1, pad = 'same') #l_cnn_a = lasagne.layers.ConcatLayer([l_cnn_1_a, l_cnn_3_a], axis=1) l_cnn_a = lasagne.layers.Conv1DLayer(l_cnn_input_a, hidden, 1, 1, pad = 'same') concat2_a = lasagne.layers.DimshuffleLayer(l_cnn_a, (0, 2, 1)) #concat2_a = lasagne.layers.ConcatLayer([l_emb_word, concat2], axis =2) concat2_a = lasagne.layers.ReshapeLayer(concat2_a ,(-1, hidden)) """ """ #### unigram + trigram l_cnn_input_a = lasagne.layers.DimshuffleLayer(l_emb_word_a, (0, 2, 1)) l_cnn_1_a = lasagne.layers.Conv1DLayer(l_cnn_input_a, hidden, 3, 1, pad = 'same') l_cnn_3_a = lasagne.layers.Conv1DLayer(l_cnn_input_a, hidden, 1, 1, pad = 'same') l_cnn_a = lasagne.layers.ConcatLayer([l_cnn_1_a, l_cnn_3_a], axis=1) concat2_a = lasagne.layers.DimshuffleLayer(l_cnn_a, (0, 2, 1)) concat2_a = lasagne.layers.ReshapeLayer(concat2_a ,(-1, 2*hidden)) """ #### unigram + 5-gram l_cnn_input_a = lasagne.layers.DimshuffleLayer( l_emb_word_a, (0, 2, 1)) l_cnn_1_a = lasagne.layers.Conv1DLayer(l_cnn_input_a, hidden_inf, 3, 1, pad='same') l_cnn_3_a = lasagne.layers.Conv1DLayer(l_cnn_input_a, hidden_inf, 1, 1, pad='same') l_cnn_a = lasagne.layers.ConcatLayer([l_cnn_1_a, l_cnn_3_a], axis=1) concat2_a = lasagne.layers.DimshuffleLayer(l_cnn_a, (0, 2, 1)) concat2_a = lasagne.layers.ReshapeLayer(concat2_a, (-1, 2 * hidden_inf)) if params.dropout: concat2_a = lasagne.layers.DropoutLayer(concat2_a, p=0.5) l_local_a = lasagne.layers.DenseLayer( concat2_a, num_units=params.num_labels, nonlinearity=lasagne.nonlinearities.softmax) a_params = lasagne.layers.get_all_params(l_local_a, trainable=True) self.a_params = a_params def inner_function(targets_one_step, mask_one_step, prev_label, tg_energy): """ :param targets_one_step: [batch_size, t] :param prev_label: [batch_size, t] :param tg_energy: [batch_size] :return: """ new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1]) new_ta_energy_t = tg_energy + T.sum( new_ta_energy * targets_one_step, axis=1) tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy) return [targets_one_step, tg_energy_t] local_energy = lasagne.layers.get_output( l_local, { l_in_word: input_var, l_mask_word: mask_var, layer_char_input_a: char_input_var }) local_energy = local_energy.reshape((-1, length, params.num_labels)) local_energy = local_energy * mask_var[:, :, None] ##################### # for the end symbole of a sequence #################### end_term = Wyy[:-1, -1] local_energy = local_energy + end_term.dimshuffle( 'x', 'x', 0) * mask_var1[:, :, None] predy0 = lasagne.layers.get_output( l_local_a, { l_in_word_a: input_var, l_mask_word_a: mask_var, layer_char_input_a: char_input_var }) predy_inf = lasagne.layers.get_output( l_local_a, { l_in_word_a: input_var, 
l_mask_word_a: mask_var, layer_char_input_a: char_input_var }, deterministic=True) predy_inf = predy_inf.reshape((-1, length, params.num_labels)) predy_in = T.argmax(predy0, axis=1) A = T.extra_ops.to_one_hot(predy_in, params.num_labels) A = A.reshape((-1, length, params.num_labels)) predy = predy0.reshape((-1, length, params.num_labels)) predy = predy * mask_var[:, :, None] targets_shuffled = predy.dimshuffle(1, 0, 2) target_time0 = targets_shuffled[0] masks_shuffled = mask_var.dimshuffle(1, 0) initial_energy0 = T.dot(target_time0, Wyy[-1, :-1]) initials = [target_time0, initial_energy0] [_, target_energies], _ = theano.scan( fn=inner_function, outputs_info=initials, sequences=[targets_shuffled[1:], masks_shuffled[1:]]) cost11 = target_energies[-1] + T.sum( T.sum(local_energy * predy, axis=2) * mask_var, axis=1) # compute the ground-truth energy targets_shuffled0 = A.dimshuffle(1, 0, 2) target_time00 = targets_shuffled0[0] initial_energy00 = T.dot(target_time00, Wyy[-1, :-1]) initials0 = [target_time00, initial_energy00] [_, target_energies0], _ = theano.scan( fn=inner_function, outputs_info=initials0, sequences=[targets_shuffled0[1:], masks_shuffled[1:]]) cost110 = target_energies0[-1] + T.sum( T.sum(local_energy * A, axis=2) * mask_var, axis=1) predy_f = predy.reshape((-1, params.num_labels)) y_f = target_var.flatten() if (params.annealing == 0): lamb = params.L3 elif (params.annealing == 1): lamb = params.L3 * (1 - 0.01 * t_t) if (params.regutype == 0): ce_hinge = lasagne.objectives.categorical_crossentropy( predy_f + eps, y_f) ce_hinge = ce_hinge.reshape((-1, length)) ce_hinge = T.sum(ce_hinge * mask_var, axis=1) cost = T.mean(-cost11) + lamb * T.mean(ce_hinge) else: entropy_term = -T.sum(predy_f * T.log(predy_f + eps), axis=1) entropy_term = entropy_term.reshape((-1, length)) entropy_term = T.sum(entropy_term * mask_var, axis=1) cost = T.mean(-cost11) - lamb * T.mean(entropy_term) #from adam import adam #updates_a = adam(cost, a_params, params.eta) updates_a = lasagne.updates.sgd(cost, a_params, params.eta) updates_a = lasagne.updates.apply_momentum(updates_a, a_params, momentum=0.9) if (params.regutype == 0): self.train_fn = theano.function([ input_var, char_input_var, target_var, mask_var, mask_var1, length, t_t ], [cost, ce_hinge], updates=updates_a, on_unused_input='ignore') else: self.train_fn = theano.function([ input_var, char_input_var, target_var, mask_var, mask_var1, length, t_t ], [cost, entropy_term], updates=updates_a, on_unused_input='ignore') prediction = T.argmax(predy_inf, axis=2) corr = T.eq(prediction, target_var) corr_train = (corr * mask_var).sum(dtype=theano.config.floatX) num_tokens = mask_var.sum(dtype=theano.config.floatX) self.eval_fn = theano.function([ input_var, char_input_var, target_var, mask_var, mask_var1, length ], [corr_train, num_tokens, prediction], on_unused_input='ignore')
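# A small numpy sketch of the energy recurrence that `inner_function` above
# implements with theano.scan: given one-hot (or soft) targets, a transition
# matrix, and per-step emission scores, it accumulates transition energies
# along the sequence. Shapes and names are illustrative only.
import numpy as np

def sequence_energy(targets, transitions, local_energy, mask):
    # targets: (T, B, L); transitions: (L, L)
    # local_energy: (T, B, L); mask: (T, B)
    energy = np.einsum('tbl,tbl->tb', local_energy, targets) * mask
    for t in range(1, targets.shape[0]):
        trans = (targets[t - 1].dot(transitions) * targets[t]).sum(axis=1)
        energy[t] += trans * mask[t]
    return energy.sum(axis=0)  # (B,)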
def test_theano_grad(self): class AttentionLayer(object): def __init__(self, u, mask=None): self.u = theano.shared(value=u) self.mask = mask def get_output_expr(self, input_expr): input_expr = input_expr.dimshuffle(0, 2, 1) pre_a = T.dot(input_expr, self.u)[:, :, 0] if self.mask: pre_a = self.mask * pre_a - \ (1 - self.mask) * 3.402823466e+38 a = T.nnet.softmax(pre_a)[:, :, np.newaxis] return T.sum(a * input_expr, axis=1) class LogisticRegressionLayer(object): def __init__(self, W, b): self.W = theano.shared(value=W) if b is not None: self.b = theano.shared(value=b[0]) def get_output_expr(self, input_expr): if hasattr(self, 'b'): return T.nnet.sigmoid(T.dot(input_expr, self.W) + self.b) else: return T.nnet.sigmoid(T.dot(input_expr, self.W)) r = [] for i in xrange(self.N): batch_size = self.rng.random_integers(500) x_dim = self.rng.random_integers(3000) n_ts = self.rng.random_integers(100) x = [ self.rng.rand(batch_size, x_dim).astype(np.float32) for _ in xrange(n_ts) ] u = self.get_orthogonal_matrix(x_dim, 1) lr_dot_W = self.get_orthogonal_matrix(x_dim, 1) lr_dot_b = self.rng.rand(1, 1).astype( np.float32) if self.rng.randint(2) else None true_labels = self.rng.randint(2, size=(batch_size, 1)).astype(np.float32) mask = self.rng.randint(2, size=(batch_size, n_ts)).astype( np.float32) if self.rng.randint(2) else None device_id = 0 # Theano model state = self.rng.get_state() th_x = T.ftensor3() th_mask = T.fmatrix() if mask is not None else None th_true_labels = T.fmatrix() attnt_layer = AttentionLayer(u, th_mask) lr_layer = LogisticRegressionLayer(lr_dot_W, lr_dot_b) probs = th_x for layer in [attnt_layer, lr_layer]: probs = layer.get_output_expr(probs) loss = T.mean(T.nnet.binary_crossentropy(probs, th_true_labels)) params = [lr_layer.W, attnt_layer.u, th_x] if hasattr(lr_layer, 'b'): params.append(lr_layer.b) th_grads = T.grad(loss, wrt=params) get_theano_grads = theano.function( [th_x, th_true_labels] + ([th_mask] if mask is not None else []), th_grads) th_grads = get_theano_grads( *([np.dstack(x), true_labels] + ([mask] if mask is not None else []))) # quagga model self.rng.set_state(state) x = List([Connector(Matrix.from_npa(e), device_id) for e in x]) u = Connector(Matrix.from_npa(u), device_id) lr_dot_W = Connector(Matrix.from_npa(lr_dot_W), device_id) lr_dot_b = Connector( Matrix.from_npa(lr_dot_b), device_id) if lr_dot_b is not None else lr_dot_b true_labels = Connector(Matrix.from_npa(true_labels)) if mask is not None: mask = Connector(Matrix.from_npa(mask)) attnt_block = AttentionBlock(x, u, mask) lrdot_block = DotBlock(lr_dot_W, lr_dot_b, attnt_block.output) sce_block = SigmoidCeBlock(lrdot_block.output, true_labels) x.fprop() true_labels.fprop() u.fprop() lr_dot_W.fprop() if lr_dot_b: lr_dot_b.fprop() attnt_block.fprop() lrdot_block.fprop() sce_block.fprop() sce_block.bprop() lrdot_block.bprop() attnt_block.bprop() q_grads = [ lr_dot_W.backward_matrix.to_host(), u.backward_matrix.to_host(), np.dstack([e.backward_matrix.to_host() for e in x]) ] if lr_dot_b: q_grads.append(lr_dot_b.backward_matrix.to_host()) for th_grad, q_grad in izip(th_grads, q_grads): r.append(np.allclose(th_grad, q_grad, atol=1.e-7)) print r[-1] self.assertEqual(sum(r), len(r))
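# The test above cross-checks theano gradients against another framework; a
# lighter-weight sanity check is a finite-difference probe. A minimal numpy
# sketch for a scalar loss f(x), with an example against the analytic
# gradient of mean(sigmoid(x)):
import numpy as np

def finite_diff_grad(f, x, eps=1e-4):
    g = np.zeros_like(x)
    for i in np.ndindex(x.shape):
        x[i] += eps
        fp = f(x)
        x[i] -= 2 * eps
        fm = f(x)
        x[i] += eps  # restore
        g[i] = (fp - fm) / (2 * eps)
    return g

x = np.random.randn(3, 2)
s = 1.0 / (1.0 + np.exp(-x))
num = finite_diff_grad(lambda z: (1.0 / (1.0 + np.exp(-z))).mean(), x)
assert np.allclose(num, s * (1 - s) / x.size, atol=1e-6)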
def train(): global logfile_path global trainfile global train0file global test1file batch_size = int(256) embedding_size = 300 learning_rate = 0.005 n_epochs = 20000 words_num_dim = 1200 validation_freq = 10 filter_sizes = [1, 2, 3, 5] num_filters = 500 margin_size = 0.05 logfile_path = os.path.join(logfile_path, 'LSTM-' + GetNowTime() + '-' \ + 'batch_size-' + str(batch_size) + '-' \ + 'num_filters-' + str(num_filters) + '-' \ + 'embedding_size-' + str(embedding_size) + '-' \ + 'n_epochs-' + str(n_epochs) + '-' \ + 'freq-' + str(validation_freq) + '-' \ + '-log.txt') log("New start ...", logfile_path) log(str(time.asctime(time.localtime(time.time()))), logfile_path) log("batch_size = " + str(batch_size), logfile_path) log("filter_sizes = " + str(filter_sizes), logfile_path) log("num_filters = " + str(num_filters), logfile_path) log("embedding_size = " + str(embedding_size), logfile_path) log("learning_rate = " + str(learning_rate), logfile_path) log("words_num_dim = " + str(words_num_dim), logfile_path) log("n_epochs = " + str(n_epochs), logfile_path) log("margin_size = " + str(margin_size), logfile_path) log("validation_freq = " + str(validation_freq), logfile_path) log("train_1_file = " + str(trainfile.split('/')[-1]), logfile_path) log("train_0_file = " + str(train0file.split('/')[-1]), logfile_path) log("test_file = " + str(test1file.split('/')[-1]), logfile_path) log("vector_file = " + str(vectorsfile.split('/')[-1]), logfile_path) vocab = build_vocab() word_embeddings = load_word_embeddings(vocab, embedding_size) trainList = load_train_list() testList = load_test_list() train0Dict = load_train0_dict() train_x1, train_x2, train_x3, mask1, mask2, mask3 = load_train_data_from_2files(train0Dict, trainList, vocab, batch_size, words_num_dim) x1, x2, x3 = T.fmatrix('x1'), T.fmatrix('x2'), T.fmatrix('x3') m1, m2, m3 = T.fmatrix('m1'), T.fmatrix('m2'), T.fmatrix('m3') model = LSTM( input1=x1, input2=x2, input3=x3, mask1=m1, mask2=m2, mask3=m3, word_embeddings=word_embeddings, batch_size=batch_size, sequence_len=train_x1.shape[0], #row is sequence_len embedding_size=embedding_size, filter_sizes=filter_sizes, num_filters=num_filters, margin_size = margin_size) cost, cos12, cos13 = model.cost, model.cos12, model.cos13 params, accuracy = model.params, model.accuracy grads = T.grad(cost, params) updates = [ (param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads) ] p1, p2, p3 = T.fmatrix('p1'), T.fmatrix('p2'), T.fmatrix('p3') q1, q2, q3 = T.fmatrix('q1'), T.fmatrix('q2'), T.fmatrix('q3') train_model = theano.function( [p1, p2, p3, q1, q2, q3], [cost, accuracy], updates=updates, givens={ x1: p1, x2: p2, x3: p3, m1: q1, m2: q2, m3: q3 } ) v1, v2, v3 = T.matrix('v1'), T.matrix('v2'), T.matrix('v3') u1, u2, u3 = T.matrix('u1'), T.matrix('u2'), T.matrix('u3') validate_model = theano.function( inputs=[v1, v2, v3, u1, u2, u3], outputs=[cos12, cos13], #updates=updates, givens={ x1: v1, x2: v2, x3: v3, m1: u1, m2: u2, m3: u3 } ) epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch += 1 train_x1, train_x2, train_x3, mask1, mask2, mask3 = load_train_data_from_2files(train0Dict, trainList, vocab, batch_size, words_num_dim) #print('train_x1, train_x2, train_x3') #print(train_x1.shape, train_x2.shape, train_x3.shape) cost_ij, acc = train_model(train_x1, train_x2, train_x3, mask1, mask2, mask3) log('load data done ...... 
epoch:' + str(epoch) + ' cost:' + str(cost_ij) + ', acc:' + str(acc), logfile_path) if epoch % validation_freq == 0: log('Evaluation ......', logfile_path) validation(validate_model, testList, vocab, batch_size, words_num_dim)
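# The LSTM model above is trained with a pairwise ranking objective; a
# minimal numpy sketch of the margin-based loss (cosine similarity of the
# question with a positive vs. a negative answer), with illustrative names:
import numpy as np

def cosine(a, b):
    return (a * b).sum(-1) / (np.linalg.norm(a, axis=-1) * np.linalg.norm(b, axis=-1))

def margin_rank_loss(q, a_pos, a_neg, margin=0.05):
    # hinge on the similarity gap, averaged over the batch
    return np.maximum(0.0, margin - cosine(q, a_pos) + cosine(q, a_neg)).mean()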
def _prepare_networks(self, n_items):
    '''
    Prepares the building blocks of the RNN, but does not compile them:
    self.l_in : input layer
    self.l_mask : mask of the input layer
    self.target : target of the network
    self.l_out : output of the network
    self.cost : cost function
    '''
    self.n_items = n_items

    # Theano tensor for the targets
    input_var = theano.sparse.csr_matrix('input_var')
    self.target = T.ivector('target_output')
    self.exclude = T.fmatrix('excluded_items')
    self.samples = T.ivector('samples')
    self.cluster_samples = T.ivector('cluster_samples')

    # The input is composed of two parts: the one-hot encoding of the movie, and the features of the movie
    self.l_in = lasagne.layers.InputLayer(shape=(self.batch_size, self.n_items), input_var=input_var)

    l_user_rep = SparseLayer(self.l_in, num_units=self.n_hidden, nonlinearity=None, b=None)
    self.user_representation_layer = l_user_rep

    # The sliced output is then passed through a linear layer to obtain the right output size
    self.l_out = BlackoutLayer(l_user_rep,
                               num_units=self.n_items,
                               num_outputs=self.n_samples,
                               nonlinearity=None,
                               W=lasagne.init.GlorotUniform())

    # lasagne.layers.get_output produces a variable for the output of the net
    network_output = lasagne.layers.get_output(self.l_out,
                                               targets=self.target,
                                               samples=self.samples)

    # loss function
    self.cost = self._loss(network_output, self.batch_size).mean()
    if self.reg > 0.:
        self.cost += self.reg * lasagne.regularization.regularize_network_params(
            self.l_out, lasagne.regularization.l2)
    elif self.reg < 0.:
        self.cost -= self.reg * lasagne.regularization.regularize_network_params(
            self.l_out, lasagne.regularization.l1)

    # Cluster learning
    self.T_scale = theano.shared(self.effective_scale)
    scaled_softmax = lambda x: lasagne.nonlinearities.softmax(x * self.T_scale)

    self.cluster_selection_layer = lasagne.layers.DenseLayer(
        l_user_rep, b=None, num_units=self.n_clusters, nonlinearity=None)
    cluster_selection = lasagne.layers.get_output(self.cluster_selection_layer)
    if self.cluster_selection_noise > 0.:
        cluster_selection = cluster_selection + self._srng.normal(
            cluster_selection.shape, avg=0.0,
            std=self.cluster_selection_noise)
    cluster_selection = scaled_softmax(cluster_selection)

    self.cluster_repartition = theano.shared(
        (0.1 * np.random.randn(self.n_items, self.n_clusters)).astype(
            theano.config.floatX))
    if self.cluster_type == 'softmax':
        target_and_samples_clusters = scaled_softmax(
            self.cluster_repartition[
                T.concatenate([self.target, self.cluster_samples]), :])
    elif self.cluster_type == 'mix':
        target_and_samples_clusters = scaled_softmax(self.cluster_repartition[T.concatenate([self.target, self.cluster_samples]), :]) + \
            T.nnet.sigmoid(self.T_scale * self.cluster_repartition[T.concatenate([self.target, self.cluster_samples]), :])
    else:
        target_and_samples_clusters = T.nnet.sigmoid(
            self.T_scale * self.cluster_repartition[
                T.concatenate([self.target, self.cluster_samples]), :])
    cluster_score = cluster_selection.dot(target_and_samples_clusters.T)
    self.cost_clusters = self._loss(cluster_score, self.batch_size).mean()
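# The cluster-selection layer above applies a temperature-scaled softmax
# (softmax(x * T_scale)); a minimal numpy sketch showing how the scale
# sharpens or flattens the distribution:
import numpy as np

def scaled_softmax(x, scale):
    z = x * scale
    z = z - z.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

x = np.array([1.0, 2.0, 3.0])
print(scaled_softmax(x, 0.1))   # near-uniform
print(scaled_softmax(x, 10.0))  # close to one-hot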
def main(options): print 'Build and compile network' input_data = T.ftensor3('input_data') input_mask = T.fmatrix('input_mask') target_data = T.imatrix('target_data') target_mask = T.fmatrix('target_mask') network = build_network( input_data=input_data, input_mask=input_mask, num_inputs=options['num_inputs'], num_inner_units_list=options['num_inner_units_list'], num_factor_units_list=options['num_factor_units_list'], num_outer_units_list=options['num_outer_units_list'], num_outputs=options['num_outputs'], gating_nonlinearity=options['gating_nonlinearity'], dropout_ratio=options['dropout_ratio'], weight_noise=options['weight_noise'], use_layer_norm=options['use_layer_norm'], peepholes=options['peepholes'], learn_init=options['learn_init'], grad_clipping=options['grad_clipping']) network_params = get_all_params(network, trainable=True) if options['reload_model']: print('Loading Parameters...') pretrain_network_params_val, pretrain_update_params_val, pretrain_total_batch_cnt = pickle.load( open(options['reload_model'], 'rb')) print('Applying Parameters...') set_model_param_value(network_params, pretrain_network_params_val) else: pretrain_update_params_val = None pretrain_total_batch_cnt = 0 print 'Build network trainer' training_fn, trainer_params = set_network_trainer( input_data=input_data, input_mask=input_mask, target_data=target_data, target_mask=target_mask, num_outputs=options['num_outputs'], network=network, updater=options['updater'], learning_rate=options['lr'], grad_max_norm=options['grad_norm'], l2_lambda=options['l2_lambda'], load_updater_params=pretrain_update_params_val) print 'Build network predictor' predict_fn = set_network_predictor(input_data=input_data, input_mask=input_mask, target_data=target_data, target_mask=target_mask, num_outputs=options['num_outputs'], network=network) print 'Load data stream' train_datastream = get_datastream(path=options['data_path'], which_set='train_si84', batch_size=options['batch_size']) print 'Start training' if os.path.exists(options['save_path'] + '_eval_history.npz'): evaluation_history = numpy.load( options['save_path'] + '_eval_history.npz')['eval_history'].tolist() else: evaluation_history = [[[10.0, 10.0, 1.0], [10.0, 10.0, 1.0]]] early_stop_flag = False early_stop_cnt = 0 total_batch_cnt = 0 try: # for each epoch for e_idx in range(options['num_epochs']): # for each batch for b_idx, data in enumerate( train_datastream.get_epoch_iterator()): total_batch_cnt += 1 if pretrain_total_batch_cnt >= total_batch_cnt: continue # get input, target data input_data = data[0].astype(floatX) input_mask = data[1].astype(floatX) # get target data target_data = data[2] target_mask = data[3].astype(floatX) # get output train_output = training_fn(input_data, input_mask, target_data, target_mask) train_predict_cost = train_output[0] network_grads_norm = train_output[1] # show intermediate result if total_batch_cnt % options[ 'train_disp_freq'] == 0 and total_batch_cnt != 0: best_idx = numpy.asarray(evaluation_history)[:, 1, 2].argmin() print '============================================================================================' print 'Model Name: ', options['save_path'].split('/')[-1] print '============================================================================================' print 'Epoch: ', str(e_idx), ', Update: ', str( total_batch_cnt) print '--------------------------------------------------------------------------------------------' print 'Prediction Cost: ', str(train_predict_cost) print 'Gradient Norm: ', str(network_grads_norm) 
print '--------------------------------------------------------------------------------------------' print 'Train NLL: ', str( evaluation_history[-1][0][0]), ', BPC: ', str( evaluation_history[-1][0][1]), ', FER: ', str( evaluation_history[-1][0][2]) print 'Valid NLL: ', str( evaluation_history[-1][1][0]), ', BPC: ', str( evaluation_history[-1][1][1]), ', FER: ', str( evaluation_history[-1][1][2]) print '--------------------------------------------------------------------------------------------' print 'Best NLL: ', str( evaluation_history[best_idx][1][0]), ', BPC: ', str( evaluation_history[best_idx][1] [1]), ', FER: ', str( evaluation_history[best_idx][1][2]) # evaluation if total_batch_cnt % options[ 'train_eval_freq'] == 0 and total_batch_cnt != 0: train_eval_datastream = get_datastream( path=options['data_path'], which_set='train_si84', batch_size=options['eval_batch_size']) valid_eval_datastream = get_datastream( path=options['data_path'], which_set='test_dev93', batch_size=options['eval_batch_size']) train_nll, train_bpc, train_fer = network_evaluation( predict_fn, train_eval_datastream) valid_nll, valid_bpc, valid_fer = network_evaluation( predict_fn, valid_eval_datastream) # check over-fitting if valid_fer > numpy.asarray(evaluation_history)[:, 1, 2].min(): early_stop_cnt += 1. else: early_stop_cnt = 0. best_network_params_vals = get_model_param_values( network_params) pickle.dump( best_network_params_vals, open(options['save_path'] + '_best_model.pkl', 'wb')) if early_stop_cnt > 10: early_stop_flag = True break # save results evaluation_history.append( [[train_nll, train_bpc, train_fer], [valid_nll, valid_bpc, valid_fer]]) numpy.savez(options['save_path'] + '_eval_history', eval_history=evaluation_history) # save network if total_batch_cnt % options[ 'train_save_freq'] == 0 and total_batch_cnt != 0: cur_network_params_val = get_model_param_values( network_params) cur_trainer_params_val = get_update_params_values( trainer_params) cur_total_batch_cnt = total_batch_cnt pickle.dump([ cur_network_params_val, cur_trainer_params_val, cur_total_batch_cnt ], open(options['save_path'] + '_last_model.pkl', 'wb')) if early_stop_flag: break except KeyboardInterrupt: print 'Training Interrupted' cur_network_params_val = get_model_param_values(network_params) cur_trainer_params_val = get_update_params_values(trainer_params) cur_total_batch_cnt = total_batch_cnt pickle.dump([ cur_network_params_val, cur_trainer_params_val, cur_total_batch_cnt ], open(options['save_path'] + '_last_model.pkl', 'wb'))
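# The training loop above hand-rolls patience-based early stopping; a
# minimal standalone sketch of the same bookkeeping (names illustrative):
def should_stop(history, patience=10):
    # history: list of validation errors, one per evaluation
    best = min(history)
    # evaluations since the best score was first achieved
    since_best = len(history) - 1 - history.index(best)
    return since_best > patience

assert not should_stop([1.0, 0.9, 0.8])
assert should_stop([0.5] + [0.6] * 12)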
def orig_model(filters_list, outdim, cost, input_dims = (1, 23, 23), activation="rectify", **kwargs): #Emean, Estd, max_mol_size, num_dist_basis, c_len, num_species, # num_interaction_passes, num_hidden_neurons, values_to_predict,cost): # path to targets_file is not NONE # sym_coulomb = T.imatrix() sym_coulomb = T.ftensor4() sym_y = T.fmatrix() sym_learn_rate = T.scalar() try: nonlinearity = getattr(lasagne.nonlinearities, activation) except AttributeError as e: print(e) raise RuntimeError("Activation {} missing in lasagne.nonlinearities.".format(activation)) # layer_input_dims = (None, *input_dims) # (None, 1, 23, 23) if input_dims == (1, 23, 23) layer_input_dims = [None] layer_input_dims.extend(input_dims) layers = [] layers.append(lasagne.layers.InputLayer(layer_input_dims, name="layer_input")) for idx, num_filters in enumerate(filters_list): layers.append(Conv2DLayer(layers[-1], num_filters = num_filters, filter_size = 3, pad = "same", flip_filters = False, nonlinearity = nonlinearity, name="layer_conv_{}_1".format(idx) )) layers.append(Conv2DLayer(layers[-1], num_filters = num_filters, filter_size = 3, pad = "same", flip_filters = False, nonlinearity = nonlinearity, name="layer_conv_{}_2".format(idx) )) layers.append(Conv2DLayer(layers[-1], num_filters = num_filters, filter_size = 3, pad = "same", flip_filters = False, nonlinearity = nonlinearity, name="layer_conv_{}_3".format(idx) )) layers.append(MaxPool2DLayer(layers[-1], pool_size = 2, name="layer_maxpool_1" )) layers.append(FlattenLayer(layers[-1])) layers.append(DenseLayer(layers[-1], num_units = outdim, nonlinearity=lasagne.nonlinearities.linear)) l_out = layers[-1] # l_in_Z = lasagne.layers.InputLayer((None, max_mol_size)) # l_in_D = lasagne.layers.InputLayer((None, max_mol_size, max_mol_size, num_dist_basis)) # l_mask = MaskLayer(l_in_Z) # l_c0 = SwitchLayer(l_in_Z, num_species, c_len, W=lasagne.init.Uniform(1.0/np.sqrt(c_len))) # l_cT = RecurrentLayer(l_c0, l_in_D, l_mask, num_passes=num_interaction_passes, num_hidden=num_hidden_neurons) # # Compute energy contribution from each atom # l_atom1 = lasagne.layers.DenseLayer(l_cT, 15, nonlinearity=lasagne.nonlinearities.tanh, num_leading_axes=2) # outdim (-1, 23, 15) # l_atom2 = lasagne.layers.DenseLayer(l_atom1, values_to_predict, nonlinearity=None, num_leading_axes=2) # outdim (-1, 23, values_to_predict) # l_atomE = lasagne.layers.ExpressionLayer(l_atom2, lambda x: (x*Estd+Emean)) # Scale and shift by mean and std deviation # l_mask = lasagne.layers.ReshapeLayer(l_mask, ([0], [1], 1)) # add an extra dimension so that l_atomE (-1, 23, 16) l_mask "after reshape" (-1, 23, 1) can be multiplied # l_out = SumMaskedLayer(l_atomE, l_mask) params = lasagne.layers.get_all_params(l_out, trainable=True) for p in params: logger.debug("%s, %s" % (p, p.get_value().shape)) # out_train = lasagne.layers.get_output(l_out, {l_in_Z: sym_Z, l_in_D: sym_D}, deterministic=False) # out_test = lasagne.layers.get_output(l_out, {l_in_Z: sym_Z, l_in_D: sym_D}, deterministic=True) out_train = lasagne.layers.get_output(l_out, {layers[0] : sym_coulomb}, deterministic=False) out_test = lasagne.layers.get_output(l_out, {layers[0] : sym_coulomb}, deterministic=True) if cost == "mae": cost_train = T.mean(np.abs(out_train-sym_y)) cost_test = T.mean(np.abs(out_test-sym_y)) logger.info("Used MAE cost") elif cost == "rmse": cost_train = T.mean(lasagne.objectives.squared_error(out_train, sym_y)) cost_test = T.mean(lasagne.objectives.squared_error(out_test, sym_y)) logger.info("Used MSE cost") else: raise 
ValueError("unknown cost function {}".format(cost)) updates = lasagne.updates.adam(cost_train, params, learning_rate=sym_learn_rate) f_train = theano.function( inputs = [sym_coulomb, sym_y, sym_learn_rate], outputs = cost_train, updates = updates ) f_eval_test = theano.function( inputs = [sym_coulomb], outputs = out_test ) f_test = theano.function( inputs = [sym_coulomb, sym_y], outputs = cost_test, ) # f_train = theano.function( # inputs = [sym_Z, sym_D, sym_y, sym_learn_rate], # outputs = cost_train, # updates = updates # ) # f_eval_test = theano.function( # inputs = [sym_Z, sym_D], # outputs = out_test # ) # f_test = theano.function( # inputs = [sym_Z, sym_D, sym_y], # outputs = cost_test, # ) return f_train, f_eval_test, f_test, l_out