def ADAM(classifier, cost, lr, updates):
    # Adam optimizer (Kingma & Ba), including the beta_1 decay term
    # lambda from Algorithm 1 of the paper.
    t = theano.shared(numpy.int64(1))
    alpha = lr
    beta_1 = 0.9
    beta_2 = 0.999
    epsilon = 1e-8
    lam = 1.0 - 1e-8
    g_model_params = []
    models_m = []
    models_v = []
    for param in classifier.params:
        gparam = T.grad(cost, wrt=param)
        g_model_params.append(gparam)
        m = theano.shared(numpy.zeros(param.get_value(borrow=True).shape,
                                      dtype=theano.config.floatX))
        v = theano.shared(numpy.zeros(param.get_value(borrow=True).shape,
                                      dtype=theano.config.floatX))
        models_m.append(m)
        models_v.append(v)
    for param, gparam, m, v in zip(classifier.params, g_model_params,
                                   models_m, models_v):
        beta_1_t = T.cast(beta_1 * lam ** (t - 1), theano.config.floatX)
        updates[m] = beta_1_t * m + (1.0 - beta_1_t) * gparam
        updates[v] = beta_2 * v + (1 - beta_2) * (gparam * gparam)
        m_hat = updates[m] / (1.0 - T.cast(beta_1 ** t, theano.config.floatX))
        v_hat = updates[v] / (1.0 - T.cast(beta_2 ** t, theano.config.floatX))
        updates[param] = param - alpha * m_hat / (T.sqrt(v_hat) + epsilon)
    updates[t] = t + 1
    return updates
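# A minimal usage sketch for the optimizer above. The names `classifier`,
# `cost`, `x`, and `y` are placeholders (not defined in this snippet): any
# object exposing a `params` list of shared variables, and a scalar cost
# graph built from symbolic inputs, would do.
from collections import OrderedDict
updates = ADAM(classifier, cost, lr=0.001, updates=OrderedDict())
train_fn = theano.function([x, y], cost, updates=updates)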
def local_gpua_advanced_incsubtensor(node, context_name):
    # This is disabled on non-cuda contexts
    if get_context(context_name).kind != "cuda":
        return None

    x, y, ilist = node.inputs

    # Gpu Ops needs both inputs to have the same dtype
    if x.type.dtype != y.type.dtype:
        dtype = scalar.upcast(x.type.dtype, y.type.dtype)
        if x.type.dtype != dtype:
            x = tensor.cast(x, dtype)
        if y.type.dtype != dtype:
            y = tensor.cast(y, dtype)

    set_instead_of_inc = node.op.set_instead_of_inc
    active_device_no = theano.sandbox.cuda.active_device_number()
    device_properties = theano.sandbox.cuda.device_properties
    compute_capability = device_properties(active_device_no)["major"]

    if compute_capability < 2 or x.ndim != 2 or y.ndim != 2:
        return GpuAdvancedIncSubtensor1(set_instead_of_inc=set_instead_of_inc)
    else:
        return GpuAdvancedIncSubtensor1_dev20(set_instead_of_inc=set_instead_of_inc)
def _linspace(start, stop, num):
    # Theano linspace. Behaves like np.linspace.
    start = T.cast(start, theano.config.floatX)
    stop = T.cast(stop, theano.config.floatX)
    num = T.cast(num, theano.config.floatX)
    step = (stop - start) / (num - 1)
    return T.arange(num, dtype=theano.config.floatX) * step + start
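# A quick sanity check of _linspace against NumPy, assuming the
# module-level `import numpy as np`, `import theano`, and
# `import theano.tensor as T` that these snippets rely on.
start, stop = T.scalars('start', 'stop')
f = theano.function([start, stop], _linspace(start, stop, 5))
print(f(0.0, 1.0))           # [0.  0.25  0.5  0.75  1.]
print(np.linspace(0, 1, 5))  # matches, up to floatX rounding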
def load_data(random_state=1066, n=1000, max_phrase_length=100):
    data = utils.load_data(random_state=random_state, n=n,
                           max_phrase_length=max_phrase_length)
    X_train, y_train = data[0]
    X_valid, y_valid = data[1]
    X_test, y_test = data[2]

    X_train = X_train.reshape((-1, max_phrase_length, 67)).transpose(0, 2, 1)
    X_valid = X_valid.reshape((-1, max_phrase_length, 67)).transpose(0, 2, 1)
    X_test = X_test.reshape((-1, max_phrase_length, 67)).transpose(0, 2, 1)
    # Robert: what about reshaping this data for 1D convs?
    # hstack() instead of vstack() when creating X in utils?

    return dict(
        X_train=theano.shared(lasagne.utils.floatX(X_train)),
        y_train=T.cast(theano.shared(y_train), 'int32'),
        X_valid=theano.shared(lasagne.utils.floatX(X_valid)),
        y_valid=T.cast(theano.shared(y_valid), 'int32'),
        X_test=theano.shared(lasagne.utils.floatX(X_test)),
        y_test=T.cast(theano.shared(y_test), 'int32'),
        num_examples_train=X_train.shape[0],
        num_examples_valid=X_valid.shape[0],
        num_examples_test=X_test.shape[0],
        # input_height=X_train.shape[2],  # what's the equivalent in our vectors?
        # input_width=X_train.shape[3],
        output_dim=5,  # five sentiment classes
    )
def load_data(self):
    data = self._load_data()
    X_train, y_train = data[0]
    X_valid, y_valid = data[1]
    X_test, y_test = data[2]

    # reshape for convolutions
    X_train = X_train.reshape((X_train.shape[0], 1, 28, 28))
    X_valid = X_valid.reshape((X_valid.shape[0], 1, 28, 28))
    X_test = X_test.reshape((X_test.shape[0], 1, 28, 28))

    return dict(
        X_train=theano.shared(lasagne.utils.floatX(X_train)),
        y_train=T.cast(theano.shared(y_train), 'int32'),
        X_valid=theano.shared(lasagne.utils.floatX(X_valid)),
        y_valid=T.cast(theano.shared(y_valid), 'int32'),
        valid_set=X_valid,
        y_valid_raw=y_valid,
        X_test=theano.shared(lasagne.utils.floatX(X_test)),
        y_test=T.cast(theano.shared(y_test), 'int32'),
        num_examples_train=X_train.shape[0],
        num_examples_valid=X_valid.shape[0],
        num_examples_test=X_test.shape[0],
        input_height=X_train.shape[2],
        input_width=X_train.shape[3],
        input_dim=[X_train.shape[2], X_train.shape[3]],
        output_dim=10,
    )
def get_multi_loss(self, out1, out2, y1_batch, y2_batch):
    # TODO needs downsample for y
    # loss1 = pydnn.expr2d.logloss_2d(out1, T.cast(y1_batch, 'int32'))
    # loss2 = pydnn.expr2d.logloss_2d(out2, T.cast(y2_batch, 'int32'))
    loss1 = pydnn.expr2d.masked_logloss_2d(out1, T.cast(y1_batch, 'int32'))
    loss2 = pydnn.expr2d.masked_logloss_2d(out2, T.cast(y2_batch, 'int32'))
    return loss1 + loss2
def _transform_affine(theta, input, downsample_factor):
    num_batch, num_channels, height, width = input.shape
    theta = T.reshape(theta, (-1, 2, 3))

    # grid of (x_t, y_t, 1), eq (1) in ref [1]
    out_height = T.cast(height / downsample_factor[0], 'int64')
    out_width = T.cast(width / downsample_factor[1], 'int64')
    grid = _meshgrid(out_height, out_width)

    # Transform A x (x_t, y_t, 1)^T -> (x_s, y_s)
    T_g = T.dot(theta, grid)
    x_s = T_g[:, 0]
    y_s = T_g[:, 1]
    x_s_flat = x_s.flatten()
    y_s_flat = y_s.flatten()

    # dimshuffle input to (bs, height, width, channels)
    input_dim = input.dimshuffle(0, 2, 3, 1)
    input_transformed = _interpolate(
        input_dim, x_s_flat, y_s_flat, out_height, out_width)

    output = T.reshape(
        input_transformed, (num_batch, out_height, out_width, num_channels))
    output = output.dimshuffle(0, 3, 1, 2)  # dimshuffle to conv format
    return output
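# Hedged usage sketch: an identity affine transform (theta rows of
# [1, 0, 0, 0, 1, 0]) should return the input unchanged up to
# interpolation. Assumes the _meshgrid and _interpolate helpers from the
# same spatial-transformer module are in scope; shapes are illustrative.
imgs = T.tensor4('imgs')
theta = T.matrix('theta')  # (num_batch, 6), one row per image
out = _transform_affine(theta, imgs, downsample_factor=(1, 1))
f = theano.function([theta, imgs], out)
identity = np.tile(np.array([1, 0, 0, 0, 1, 0], dtype=theano.config.floatX), (8, 1))
batch = np.random.rand(8, 3, 32, 32).astype(theano.config.floatX)
print(f(identity, batch).shape)  # (8, 3, 32, 32)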
def _step(self, y_tm1, yz_t, yr_t, yh_t, y_m, s_tm1, h, x_m):
    # attention
    pctx__ = T.dot(h, self.W_ha) + T.dot(s_tm1, self.W_sa)[None, :, :]
    pctx__ = self.activation(pctx__)
    e = T.dot(pctx__, self.U_att) + self.b_att
    e = T.exp(e.reshape((e.shape[0], e.shape[1])))
    e = e / e.sum(0, keepdims=True)
    e = e * x_m
    c = (h * e[:, :, None]).sum(0)

    z = hard_sigmoid(yz_t + T.dot(s_tm1, self.U_z) + T.dot(c, self.W_cs))
    r = hard_sigmoid(yr_t + T.dot(s_tm1, self.U_r) + T.dot(c, self.W_cs))
    hh_t = self.activation(yh_t + T.dot(r * s_tm1, self.U_h) + T.dot(c, self.W_cy))
    s_t = z * s_tm1 + (1 - z) * hh_t
    s_t = (1. - y_m)[:, None] * s_tm1 + y_m[:, None] * s_t
    logit = self.activation(T.dot(s_t, self.W_hl) + T.dot(y_tm1, self.W_yl)
                            + T.dot(c, self.W_cl))
    return (T.cast(s_t, dtype=theano.config.floatX),
            T.cast(logit, dtype=theano.config.floatX))
def get_monitoring_channels(self, model, X, Y=None):
    rval = OrderedDict()

    history = model.mf(X, return_history=True)
    q = history[-1]

    if self.supervised:
        assert Y is not None
        Y_hat = q[-1]
        true = T.argmax(Y, axis=1)
        pred = T.argmax(Y_hat, axis=1)
        #true = Print('true')(true)
        #pred = Print('pred')(pred)
        wrong = T.neq(true, pred)
        err = T.cast(wrong.mean(), X.dtype)
        rval['misclass'] = err

        if len(model.hidden_layers) > 1:
            q = model.mf(X, Y=Y)
            pen = model.hidden_layers[-2].upward_state(q[-2])
            Y_recons = model.hidden_layers[-1].mf_update(state_below=pen)
            pred = T.argmax(Y_recons, axis=1)
            wrong = T.neq(true, pred)
            rval['recons_misclass'] = T.cast(wrong.mean(), X.dtype)

    return rval
def __init__(self, dataset_path, batch_size=500, instance_weights_path=None):
    L.info("Initializing dataset from: " + os.path.abspath(dataset_path))

    # Reading parameters from the mmap file
    fp = np.memmap(dataset_path, dtype='int32', mode='r')
    self.num_samples = fp[0]
    self.ngram = fp[1]
    fp = fp.reshape((self.num_samples + 3, self.ngram))
    self.vocab_size = fp[1, 0]
    self.num_classes = fp[2, 0]

    # Setting minibatch size and number of minibatches
    self.batch_size = batch_size
    self.num_batches = int(M.ceil(self.num_samples / float(self.batch_size)))

    # Reading the matrix of samples
    x = fp[3:, 0:self.ngram - 1]  # the context indices
    y = fp[3:, self.ngram - 1]    # the output word index
    self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
    self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')

    self.is_weighted = False
    if instance_weights_path:
        instance_weights = np.loadtxt(instance_weights_path)
        U.xassert(instance_weights.shape == (self.num_samples,),
                  "The number of lines in weights file must be the same as the number of samples.")
        self.shared_w = T.cast(theano.shared(instance_weights, borrow=True),
                               theano.config.floatX)
        self.is_weighted = True

    L.info(' #samples: %s, ngram size: %s, vocab size: %s, #classes: %s, batch size: %s, #batches: %s' % (
        U.red(self.num_samples), U.red(self.ngram), U.red(self.vocab_size),
        U.red(self.num_classes), U.red(self.batch_size), U.red(self.num_batches)))
def binarization(W, H, binary=True, deterministic=False, stochastic=False, srng=None):
    # (deterministic == True) <-> test-time <-> inference-time
    if not binary or (deterministic and stochastic):
        # print("not binary")
        Wb = W
    else:
        # [-1,1] -> [0,1]
        Wb = hard_sigmoid(W / H)
        # Wb = T.clip(W/H,-1,1)

        # Stochastic BinaryConnect
        if stochastic:
            # print("stoch")
            Wb = T.cast(srng.binomial(n=1, p=Wb, size=T.shape(Wb)),
                        theano.config.floatX)
        # Deterministic BinaryConnect (round to nearest)
        else:
            # print("det")
            Wb = T.round(Wb)

        # 0 or 1 -> -1 or 1
        Wb = T.cast(T.switch(Wb, H, -H), theano.config.floatX)

    return Wb
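# Illustrative call, assuming the MRG random streams used in
# BinaryConnect-style code; the weight shape is made up.
from theano.sandbox.rng_mrg import MRG_RandomStreams
srng = MRG_RandomStreams(seed=1234)
W = theano.shared(np.random.uniform(-1, 1, (784, 512)).astype(theano.config.floatX))
Wb = binarization(W, H=1.0, binary=True, stochastic=True, srng=srng)
sample = theano.function([], Wb)()  # entries are +1.0 or -1.0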
def shared_dataset(data_xy, borrow=True):
    """ Function that loads the dataset into shared variables

    The reason we store our dataset in shared variables is to allow
    Theano to copy it into the GPU memory (when code is run on GPU).
    Since copying data into the GPU is slow, copying a minibatch every
    time it is needed (the default behaviour if the data is not in a
    shared variable) would lead to a large decrease in performance.
    """
    data_x, data_y = data_xy
    shared_x = theano.shared(np.asarray(data_x, dtype=theano.config.floatX),
                             borrow=borrow)
    shared_y = theano.shared(np.asarray(data_y, dtype=theano.config.floatX),
                             borrow=borrow)

    # one-hot encoded labels as {-1, 1}
    n_classes = len(np.unique(data_y))  # dangerous?
    y1 = -1 * np.ones((data_y.shape[0], n_classes))
    y1[np.arange(data_y.shape[0]), data_y] = 1
    shared_y1 = theano.shared(np.asarray(y1, dtype=theano.config.floatX),
                              borrow=borrow)

    # When storing data on the GPU it has to be stored as floats,
    # therefore we will store the labels as ``floatX`` as well
    # (``shared_y`` does exactly that). But during our computations
    # we need them as ints (we use labels as indices, and if they are
    # floats it doesn't make sense), therefore instead of returning
    # ``shared_y`` we will have to cast it to int. This little hack
    # lets us get around this issue.
    return shared_x, T.cast(shared_y, 'int32'), T.cast(shared_y1, 'int32')
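# Toy usage with made-up data; shapes and the number of classes are
# illustrative only.
data_x = np.random.rand(10, 5)             # ten 5-dimensional examples
data_y = np.random.randint(0, 3, size=10)  # integer labels in {0, 1, 2}
shared_x, shared_y, shared_y1 = shared_dataset((data_x, data_y))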
def compute_hard_windows(self, image_shape, location, scale):
    # find top-left (front) and bottom-right (back) corners for each patch
    a = location - 0.5 * (T.cast(self.patch_shape, theano.config.floatX) / scale)
    b = location + 0.5 * (T.cast(self.patch_shape, theano.config.floatX) / scale)

    # grow by three patch pixels
    a -= self.kernel.k_sigma_radius(self.cutoff, scale)
    b += self.kernel.k_sigma_radius(self.cutoff, scale)

    # clip to fit inside image and have nonempty window
    a = T.clip(a, 0, image_shape - 1)
    b = T.clip(b, a + 1, image_shape)

    if self.batched_window:
        # take the bounding box of all windows; now the slices
        # will have the same length for each sample and scan can
        # be avoided. comes at the cost of typically selecting
        # more of the input.
        a = a.min(axis=0, keepdims=True)
        b = b.max(axis=0, keepdims=True)

    # make integer
    a = T.cast(T.floor(a), 'int16')
    b = T.cast(T.ceil(b), 'int16')

    return a, b
def forward(self, input_org, train=True, update_batch_stat=True, finetune=False):
    print "Layer/BatchNormalization"
    ldim, cdim, rdim = self._internal_shape(input_org)
    input = input_org.reshape((ldim, cdim, rdim))
    if train:
        mean = T.mean(input, axis=(0, 2), keepdims=True)
        var = T.mean((input - mean) ** 2, axis=(0, 2), keepdims=True)

        if update_batch_stat:
            finetune_N = theano.clone(self.finetune_N, share_inputs=False)
            if finetune:
                finetune_N.default_update = finetune_N + 1
                ratio = T.cast(1 - 1.0 / (finetune_N + 1), theano.config.floatX)
            else:
                finetune_N.default_update = 0
                ratio = self.moving_avg_ratio
            m = ldim * rdim
            scale = T.cast(m / (m - 1.0), theano.config.floatX)
            est_mean = theano.clone(self.est_mean, share_inputs=False)
            est_var = theano.clone(self.est_var, share_inputs=False)
            est_mean.default_update = T.cast(
                ratio * self.est_mean + (1 - ratio) * mean, theano.config.floatX)
            est_var.default_update = T.cast(
                ratio * self.est_var + (1 - ratio) * scale * var, theano.config.floatX)
            mean += 0 * est_mean
            var += 0 * est_var

        output = self._pbc(self.gamma) * (input - self._pbc(mean)) \
            / T.sqrt(1e-6 + self._pbc(var)) + self._pbc(self.beta)
    else:
        output = self._pbc(self.gamma) * (input - self._pbc(self.est_mean)) \
            / T.sqrt(1e-6 + self._pbc(self.est_var)) + self._pbc(self.beta)
    return output.reshape(input_org.shape)
def get_cost_grads_updates(self, x):
    ha, h = self.network.propup(x, noisestd=self.train_hypers['noise_std'])
    q = 0.9 * self.q + 0.1 * h.mean(axis=0)

    ### get correlation matrix for examples
    # C = T.dot(x.T, h) / x.shape[0]
    x_std = x.std(axis=0)
    h_std = h.std(axis=0)
    xz = (x - x.mean(0)) / (x.std(0) + 1e-2)
    hz = (h - h.mean(0)) / (h.std(0) + 1e-2)
    # C = T.dot(xz.T, hz) / x.shape[0]
    C = T.dot(xz.T, hz)

    lamb = T.cast(self.train_hypers['lamb'], self.dtype)
    rho = T.cast(self.train_hypers['rho'], self.dtype)
    # cost = (C**2).sum() + lamb*(T.abs_(q - rho)).sum()
    # cost = (C**2).sum() / x.shape[0]**2 + lamb*(T.abs_(q - rho)).sum()
    cost = (C ** 2).sum() / x.shape[0] ** 2 + lamb * ((q - rho) ** 2).sum()

    # lamb = T.cast(self.train_hypers['lamb'], self.dtype)
    # rho = T.cast(self.train_hypers['rho'], self.dtype)
    # cost = ((x - y)**2).mean(axis=0).sum() + lamb*(T.abs_(q - rho)).sum()

    updates = {self.q: q}
    return cost, self.grads(cost), updates
def local_gpua_advanced_incsubtensor(node, context_name):
    context = get_context(context_name)
    # This is disabled on non-cuda contexts
    if context.kind != 'cuda':
        return None

    x, y, ilist = node.inputs

    # Gpu Ops needs both inputs to have the same dtype
    if x.type.dtype != y.type.dtype:
        dtype = scalar.upcast(x.type.dtype, y.type.dtype)
        if x.type.dtype != dtype:
            x = tensor.cast(x, dtype)
        if y.type.dtype != dtype:
            y = tensor.cast(y, dtype)

    set_instead_of_inc = node.op.set_instead_of_inc
    compute_capability = int(context.bin_id[-2])

    if compute_capability < 2 or x.ndim != 2 or y.ndim != 2:
        return GpuAdvancedIncSubtensor1(
            set_instead_of_inc=set_instead_of_inc)
    else:
        return GpuAdvancedIncSubtensor1_dev20(
            set_instead_of_inc=set_instead_of_inc)
def test_elemwise_composite_float64():
    # test that we don't fuse composite elemwise with float64 somewhere inside
    # nvcc by default downcasts float64 to float32. We would need to tell it
    # not to do so, but that is possible only on some devices.
    a = tensor.fmatrix()
    b = tensor.fmatrix()
    av = theano._asarray(numpy.random.rand(4, 4), dtype='float32')
    bv = numpy.ones((4, 4), dtype='float32')

    def get_all_basic_scalar(composite_op):
        l = []
        for i in composite_op.env.toposort():
            if isinstance(i, theano.scalar.Composite):
                l += get_all_basic_scalar(i)
            else:
                l.append(i)
        return l

    for mode in [mode_with_gpu,
                 mode_with_gpu.excluding('gpu_after_fusion'),
                 mode_with_gpu.excluding('elemwise_fusion')]:
        f = pfunc([a, b],
                  tensor.cast(tensor.lt(tensor.cast(a, 'float64') ** 2, b),
                              'float32'),
                  mode=mode)
        out = f(av, bv)
        assert numpy.all(out == ((av ** 2) < bv))
        for node in f.maker.env.toposort():
            if isinstance(node.op, cuda.GpuElemwise):
                if isinstance(node.op.scalar_op, theano.scalar.Composite):
                    scals = get_all_basic_scalar(node.op.scalar_op)
                    for s in scals:
                        assert not any([i.type.dtype == 'float64'
                                        for i in s.inputs + s.outputs])
def f1_score(self, y, labels=[0, 2]):
    """
    Mean F1 score between two classes (positive and negative, as
    specified by the labels array).
    """
    y_tr = y
    y_pr = self.y_pred

    correct = T.eq(y_tr, y_pr)
    wrong = T.neq(y_tr, y_pr)

    label = labels[0]
    tp_neg = T.sum(correct * T.eq(y_tr, label))
    fp_neg = T.sum(wrong * T.eq(y_pr, label))
    fn_neg = T.sum(T.eq(y_tr, label) * T.neq(y_pr, label))
    tp_neg = T.cast(tp_neg, theano.config.floatX)
    prec_neg = tp_neg / T.maximum(1, tp_neg + fp_neg)
    recall_neg = tp_neg / T.maximum(1, tp_neg + fn_neg)
    f1_neg = 2. * prec_neg * recall_neg / T.maximum(1, prec_neg + recall_neg)

    label = labels[1]
    tp_pos = T.sum(correct * T.eq(y_tr, label))
    fp_pos = T.sum(wrong * T.eq(y_pr, label))
    fn_pos = T.sum(T.eq(y_tr, label) * T.neq(y_pr, label))
    tp_pos = T.cast(tp_pos, theano.config.floatX)
    prec_pos = tp_pos / T.maximum(1, tp_pos + fp_pos)
    recall_pos = tp_pos / T.maximum(1, tp_pos + fn_pos)
    f1_pos = 2. * prec_pos * recall_pos / T.maximum(1, prec_pos + recall_pos)

    return 0.5 * (f1_pos + f1_neg) * 100
def __init__(self, input, image_shape, cropsize, rand, mirror, flag_rand):
    '''
    The random mirroring and cropping in this function is done for the
    whole batch.
    '''
    # trick for random mirroring
    mirror = input[:, :, ::-1, :]
    input = T.concatenate([input, mirror], axis=0)

    # crop images
    center_margin = (image_shape[2] - cropsize) / 2

    if flag_rand:
        mirror_rand = T.cast(rand[2], 'int32')
        crop_xs = T.cast(rand[0] * center_margin * 2, 'int32')
        crop_ys = T.cast(rand[1] * center_margin * 2, 'int32')
    else:
        mirror_rand = 0
        crop_xs = center_margin
        crop_ys = center_margin

    self.output = input[mirror_rand * 3:(mirror_rand + 1) * 3, :, :, :]
    self.output = self.output[
        :, crop_xs:crop_xs + cropsize, crop_ys:crop_ys + cropsize, :]

    print "data layer with shape_in: " + str(image_shape)
def cost(self):
    known_grads = None
    xd = self.z.reshape((self.z.shape[0] * self.z.shape[1], self.z.shape[2]))
    epsilon = numpy.float32(1e-10)

    # cross-entropy
    nll, _ = T.nnet.crossentropy_softmax_1hot(x=xd[self.i],
                                              y_idx=self.y_data_flat[self.i])
    ce = T.sum(nll)

    # entropy
    def entropy(p, axis=None):
        if self.use_max and axis is not None:
            q = p.dimshuffle(axis, *(range(axis) + range(axis + 1, p.ndim)))
            #return -T.mean(T.log(T.maximum(T.max(q,axis=0),epsilon)))
            return -T.mean(T.max(q, axis=0) + epsilon) \
                + T.log(T.cast(p.shape[axis], 'float32'))
        else:
            return -T.mean(p * T.log(p + epsilon)) \
                + T.log(T.cast(p.shape[axis], 'float32'))

    ez = T.exp(self.z) * T.cast(
        self.index.dimshuffle(0, 1, 'x').repeat(self.z.shape[2], axis=2), 'float32')
    et = entropy(ez / T.maximum(epsilon, T.sum(ez, axis=0, keepdims=True)), axis=0)
    eb = entropy(ez / T.maximum(epsilon, T.sum(ez, axis=1, keepdims=True)), axis=1)
    ed = entropy(ez / T.maximum(epsilon, T.sum(ez, axis=2, keepdims=True)), axis=2)

    # maximize entropy across T and B and minimize entropy across D
    e = self.e_d * ed \
        - (self.e_t * et + self.e_b * eb) / numpy.float32(self.e_t + self.e_b)

    import theano.ifelse
    if self.train_flag:
        return theano.ifelse.ifelse(T.cast(self.xflag, 'int8'), e, ce), known_grads
    else:
        return ce, known_grads
def compute_crop_matrices(self, locations, scales, Is):
    Ws = []
    for axis in xrange(self.n_spatial_dims):
        m = T.cast(self.image_shape[axis], 'float32')
        n = T.cast(self.patch_shape[axis], 'float32')

        I = Is[axis].dimshuffle('x', 0, 'x')     # (1, hardcrop_dim, 1)
        J = T.arange(n).dimshuffle('x', 'x', 0)  # (1, 1, patch_dim)

        location = locations[:, axis].dimshuffle(0, 'x', 'x')  # (batch_size, 1, 1)
        scale = scales[:, axis].dimshuffle(0, 'x', 'x')        # (batch_size, 1, 1)

        # map patch index into image index space
        J = (J - 0.5 * n) / scale + location  # (batch_size, 1, patch_dim)

        # compute squared distances between image index and patch
        # index in the current dimension:
        #   dx**2 = (i - j)*(i - j)
        #       where i is image index
        #             j is patch index mapped into image space
        #         = i**2 + j**2 - 2ij
        #         = I**2 + J**2 - 2IJ' for all i,j in one swoop
        IJ = I * J                      # (batch_size, hardcrop_dim, patch_dim)
        dx2 = I ** 2 + J ** 2 - 2 * IJ  # (batch_size, hardcrop_dim, patch_dim)

        Ws.append(self.kernel.density(dx2, scale))
    return Ws
def resample_step(self):
    idx = self.theano_rng.multinomial(
        pvals=T.reshape(self.weights_now, (1, self.npcl))).T
    s_samp = T.sum(self.s_now * T.addbroadcast(idx, 1), axis=0)
    h_samp = T.sum(self.h_now * T.addbroadcast(idx, 1), axis=0)
    return T.cast(s_samp, 'float32'), T.cast(h_samp, 'float32')
def _transform(theta, input, downsample_factor):
    num_batch, num_channels, height, width = input.shape
    theta = T.reshape(theta, (-1, 1))

    # grid of (x_t, y_t, 1), eq (1) in ref [1]
    out_height = T.cast(height / downsample_factor[0], 'int64')
    out_width = T.cast(width / downsample_factor[1], 'int64')
    grid = _meshgrid(out_height, out_width)

    zeros = T.zeros_like(theta)
    padded_theta = T.concatenate([theta, zeros], axis=1)
    T_g = padded_theta.dimshuffle(0, 1, 'x') + grid.dimshuffle('x', 0, 1)
    x_s = T_g[:, 0]
    y_s = T_g[:, 1]
    x_s_flat = x_s.flatten()
    y_s_flat = y_s.flatten()

    # dimshuffle input to (bs, height, width, channels)
    input_dim = input.dimshuffle(0, 2, 3, 1)
    input_transformed = _interpolate(
        input_dim, x_s_flat, y_s_flat, out_height, out_width)

    output = T.reshape(
        input_transformed, (num_batch, out_height, out_width, num_channels))
    output = output.dimshuffle(0, 3, 1, 2)  # dimshuffle to conv format
    return output
def __build_backprop(self):
    y_init = self.outside_world.y_data_one_hot  # initialize y = y_data
    h_init = my_op(2 * (T.dot(rho(y_init), self.W2.T) + self.bh))  # initialize h by backward propagation
    x_init = my_op(T.dot(rho(h_init), self.W1.T) + self.bx)        # initialize x by backward propagation

    Delta_y = y_init - self.y
    Delta_h = h_init - self.h
    Delta_x = x_init - self.x

    by_dot = T.mean(Delta_y, axis=0)
    W2_dot = T.dot(self.rho_h.T, Delta_y) / T.cast(self.x.shape[0],
                                                   dtype=theano.config.floatX)
    bh_dot = T.mean(Delta_h, axis=0)
    W1_dot = T.dot(self.rho_x.T, Delta_h) / T.cast(self.x.shape[0],
                                                   dtype=theano.config.floatX)
    bx_dot = T.mean(Delta_x, axis=0)

    alpha = T.fscalar('alpha')
    by_new = self.by + alpha * by_dot
    W2_new = self.W2 + alpha * W2_dot
    bh_new = self.bh + alpha * bh_dot
    W1_new = self.W1 + alpha * W1_dot
    bx_new = self.bx + alpha * bx_dot

    updates_states = [(self.x, x_init), (self.h, h_init), (self.y, y_init)]
    updates_params = [(self.by, by_new), (self.W2, W2_new),
                      (self.bh, bh_new), (self.W1, W1_new)]

    backprop = theano.function(
        inputs=[alpha],
        outputs=[],
        updates=updates_states + updates_params
    )
    return backprop
def compute_f_mu(x, t, params):
    [centers, spreads, biases, M, b] = params
    diffs = x.dimshuffle(0, 1, 2, 'x') - centers.dimshuffle('x', 'x', 0, 1)
    scaled_diffs = (diffs ** 2) * T.exp(spreads).dimshuffle('x', 'x', 0, 1)
    exp_terms = T.sum(scaled_diffs, axis=2) + biases.dimshuffle('x', 'x', 0) * 0.0
    h = T.exp(-exp_terms)
    sumact = T.sum(h, axis=2)

    # Normalization
    hnorm = h / sumact.dimshuffle(0, 1, 'x')
    z = T.dot(hnorm, M)
    z = T.reshape(z, (t.shape[0], t.shape[1], ntgates, nx)) \
        + b.dimshuffle('x', 'x', 0, 1)  # nt by nb by ntgates by nx
    # z = z + T.reshape(x, (t.shape[0], t.shape[1], 1, nx))

    tpoints = T.cast(T.arange(ntgates), 'float32') / T.cast(ntgates - 1, 'float32')
    tpoints = T.reshape(tpoints, (1, 1, ntgates))
    # tgating = T.exp(T.dot(t, muWT) + mubT)  # nt by nb by ntgates
    tgating = T.exp(-kT * (tpoints - t) ** 2)
    tgating = tgating / T.reshape(T.sum(tgating, axis=2),
                                  (t.shape[0], t.shape[1], 1))
    tgating = T.reshape(tgating, (t.shape[0], t.shape[1], ntgates, 1))

    mult = z * tgating
    out = T.sum(mult, axis=2)
    # out = out + x

    return T.cast(out, 'float32')
def _step(self, x_tm1, u_tm1, inputs, x_prior, u_prior, *args):
    # x_prior are previous states
    # u_prior are causes from above
    outputs = self.activation(T.dot(x_tm1, self.W))
    rec_error = T.sqr(inputs - outputs).sum()
    causes = (1 + T.exp(-T.dot(u_tm1, self.V))) * .5

    if self.pool_flag:
        batch_size = inputs.shape[0]
        dim = causes.shape[1]
        imgs = T.cast(T.sqrt(dim), 'int64')
        causes_up = causes.reshape(
            (batch_size, 1, imgs, imgs)).repeat(
                self.pool_size, axis=2).repeat(self.pool_size, axis=3).flatten(ndim=2)
    else:
        causes_up = causes

    x = _IstaStep(rec_error, x_tm1, lambdav=self.gamma * causes_up,
                  x_prior=x_prior)

    if self.pool_flag:
        dim = T.cast(T.sqrt(x.shape[1]), 'int64')
        x_pool = x.reshape((batch_size, 1, dim, dim))
        x_pool = max_pool_2d(x_pool, ds=(self.pool_size, ) * 2).flatten(ndim=2)
    else:
        x_pool = x

    prev_u_cost = .01 * self.gamma * T.sqr(u_tm1 - u_prior).sum()
    u_cost = causes * abs(x_pool) * self.gamma + prev_u_cost
    u = _IstaStep(u_cost.sum(), u_tm1, lambdav=self.gamma)
    causes = (1 + T.exp(-T.dot(u, self.V))) * .5
    u_cost = causes * abs(x_pool) * self.gamma

    return (x, u, u_cost, outputs)
def get_monitoring_channels(self, model, data, **kwargs):
    rval = OrderedDict()

    space, sources = self.get_data_specs(model)
    X_data, X_condition = data
    m = X_data.shape[space.get_batch_axis()]

    G, D = model.generator, model.discriminator

    # Compute false negatives w/ empirical samples
    y_hat = D.fprop((X_data, X_condition))
    rval["false_negatives"] = T.cast((y_hat < 0.5).mean(), "float32")

    # Compute false positives w/ generated sample
    G_conditional_data = self.condition_distribution.sample(m)
    samples = G.sample(G_conditional_data)
    y_hat = D.fprop((samples, G_conditional_data))
    rval["false_positives"] = T.cast((y_hat > 0.5).mean(), "float32")

    # y = T.alloc(0., m, 1)
    cost = D.cost_from_X(((samples, G_conditional_data), y_hat))
    sample_grad = T.grad(-cost, samples)
    rval["sample_grad_norm"] = T.sqrt(T.sqr(sample_grad).sum())

    _S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data)
    if model.monitor_inference and i_obj != 0:
        rval["objective_i"] = i_obj
    if model.monitor_discriminator:
        rval["objective_d"] = d_obj
    if model.monitor_generator:
        rval["objective_g"] = g_obj

    rval["now_train_generator"] = self.now_train_generator
    return rval
def loss(x_0, n, t, params):
    muparams = params[:5]
    covparams = params[5:10]
    tpoints = T.cast(T.arange(nsteps), 'float32') / T.cast(nsteps, 'float32')
    betas = compute_betas(params[-1], tpoints)

    def step(nt, bt, xt):
        mean = xt * T.sqrt(1.0 - bt)
        xnew = T.cast(mean + T.sqrt(bt) * nt, 'float32')
        losst = T.cast(0.5 * T.mean(T.sum(
            (((mean - xnew) ** 2) / bt + T.log(np.pi * 2.0 * bt)), axis=1)), 'float32')
        return xnew, losst

    [xhist, fwdlosshist], updates = theano.scan(fn=step,
                                                outputs_info=[x_0, None],
                                                sequences=[n, betas],
                                                n_steps=nsteps)

    forward_loss = -T.mean(fwdlosshist) \
        + 0.5 * T.mean(T.sum((xhist[-1] ** 2 + T.log(np.pi * 2.0)), axis=1))

    # f_mu = compute_f_mu(xhist, t, muparams)
    # f_cov = compute_f_cov(xhist, t, covparams)
    # diffs = (f_mu[2:] - xhist[:-1])**2
    # gaussian_terms = T.sum(diffs*(1.0/f_cov[1:].dimshuffle(0,1,'x')), axis=2)
    # det_terms = T.sum(T.log(f_cov[1:].dimshuffle(0,1,'x')), axis=2)

    f_mu = compute_f_mu(xhist, t, muparams) \
        + xhist * (T.sqrt(1.0 - betas)).dimshuffle(0, 'x', 'x')
    f_cov = compute_f_cov(xhist, t, covparams) * betas.dimshuffle(0, 'x')

    xhist = T.concatenate([x_0.dimshuffle('x', 0, 1), xhist], axis=0)

    diffs = (f_mu - xhist[:-1]) ** 2
    gaussian_terms = T.sum(diffs * (1.0 / f_cov.dimshuffle(0, 1, 'x')), axis=2)
    det_terms = T.sum(T.log(f_cov.dimshuffle(0, 1, 'x')), axis=2)

    reverse_loss = T.mean(T.mean(gaussian_terms + det_terms))

    return reverse_loss + forward_loss
def cost(self):
    """
    :param y: shape (time*batch,) -> label
    :return: error scalar, known_grads dict
    """
    y_f = T.cast(T.reshape(self.y_data_flat,
                           (self.y_data_flat.shape[0] * self.y_data_flat.shape[1]),
                           ndim=1), 'int32')
    known_grads = None
    if self.loss == 'sprint':
        if not isinstance(self.sprint_opts, dict):
            import json
            self.sprint_opts = json.loads(self.sprint_opts)
        assert isinstance(self.sprint_opts, dict), "you need to specify sprint_opts in the output layer"
        if self.exp_normalize:
            log_probs = T.log(self.p_y_given_x)
        else:
            log_probs = self.z
        sprint_error_op = SprintErrorSigOp(self.attrs.get("target", "classes"),
                                           self.sprint_opts)
        err, grad = sprint_error_op(log_probs, T.sum(self.index, axis=0))
        err = err.sum()
        if self.loss_like_ce:
            y_ref = T.clip(self.p_y_given_x - grad, numpy.float32(0), numpy.float32(1))
            err = -T.sum(T.log(T.pow(self.p_y_given_x, y_ref))
                         * T.cast(self.index, "float32").dimshuffle(0, 1, 'x'))
        if self.ce_smoothing:
            err *= numpy.float32(1.0 - self.ce_smoothing)
            grad *= numpy.float32(1.0 - self.ce_smoothing)
            if not self.prior_scale:  # we kept the softmax bias as it was
                nll, pcx = T.nnet.crossentropy_softmax_1hot(
                    x=self.y_m[self.i], y_idx=self.y_data_flat[self.i])
            else:  # assume that we have subtracted the bias by the log priors beforehand
                assert self.log_prior is not None
                # In this case, for the CE calculation, we need to add the log priors again.
                y_m_prior = T.reshape(self.z + numpy.float32(self.prior_scale) * self.log_prior,
                                      (self.z.shape[0] * self.z.shape[1], self.z.shape[2]),
                                      ndim=2)
                nll, pcx = T.nnet.crossentropy_softmax_1hot(
                    x=y_m_prior[self.i], y_idx=self.y_data_flat[self.i])
            ce = numpy.float32(self.ce_smoothing) * T.sum(nll)
            err += ce
            grad += T.grad(ce, self.z)
        known_grads = {self.z: grad}
        return err, known_grads
    elif self.loss == 'ctc':
        from theano.tensor.extra_ops import cpu_contiguous
        err, grad, priors = CTCOp()(self.p_y_given_x,
                                    cpu_contiguous(self.y.dimshuffle(1, 0)),
                                    self.index_for_ctc())
        known_grads = {self.z: grad}
        return err.sum(), known_grads, priors.sum(axis=0)
    elif self.loss == 'ce_ctc':
        y_m = T.reshape(self.z, (self.z.shape[0] * self.z.shape[1], self.z.shape[2]),
                        ndim=2)
        p_y_given_x = T.nnet.softmax(y_m)
        #pcx = p_y_given_x[(self.i > 0).nonzero(), y_f[(self.i > 0).nonzero()]]
        pcx = p_y_given_x[self.i, self.y_data_flat[self.i]]
        ce = -T.sum(T.log(pcx))
        return ce, known_grads
    elif self.loss == 'ctc2':
        from NetworkCtcLayer import ctc_cost, uniq_with_lengths, log_sum
        max_time = self.z.shape[0]
        num_batches = self.z.shape[1]
        time_mask = self.index.reshape((max_time, num_batches))
        y_batches = self.y_data_flat.reshape((max_time, num_batches))
        targets, seq_lens = uniq_with_lengths(y_batches, time_mask)
        log_pcx = self.z - log_sum(self.z, axis=0, keepdims=True)
        err = ctc_cost(log_pcx, time_mask, targets, seq_lens)
        return err, known_grads
def __init__(self, d_v, d_e, d_t, optimizer, optimizer_args, np_rng, th_rng,
             n_classes=0, encoder_layers=1, generator_layers=0,
             generator_transform=None, use_interactions=False,
             clip_gradients=False, init_bias=None, train_bias=False,
             scale=6.0, encode_labels=False, l1_inter_factor=1.0,
             time_penalty=False, encoder_shortcut=False,
             generator_shortcut=False):
    self.d_v = d_v              # vocabulary size
    self.d_e = d_e              # dimensionality of encoder
    self.d_t = d_t              # number of topics
    self.n_classes = n_classes  # number of classes

    assert encoder_layers == 1 or encoder_layers == 2
    self.n_encoder_layers = encoder_layers
    assert generator_layers in (0, 1, 2, 4)
    self.n_generator_layers = generator_layers

    # set various options
    self.generator_transform = generator_transform  # transform to apply after the generator
    self.use_interactions = use_interactions        # use interactions between topics and labels
    self.encode_labels = encode_labels              # feed labels into the encoder
    self.l1_inter_factor = l1_inter_factor          # factor by which to multiply L1 penalty on interactions
    self.encoder_shortcut = encoder_shortcut
    self.generator_shortcut = generator_shortcut

    # create parameter matrices and biases
    self.W_encoder_1 = common_theano.init_param('W_encoder_1', (d_e, d_v), np_rng, scale=scale)
    self.b_encoder_1 = common_theano.init_param('b_encoder_1', (d_e,), np_rng, scale=0.0)
    if n_classes > 1:
        self.W_encoder_label = common_theano.init_param('W_encoder_label', (d_e, n_classes), np_rng, scale=scale)
    else:
        self.W_encoder_label = common_theano.init_param(
            'W_encoder_label', (d_e, n_classes), np_rng,
            values=np.zeros((d_e, n_classes), dtype=np.float32))
    self.W_encoder_2 = common_theano.init_param('W_encoder_2', (d_e, d_e), np_rng, scale=scale)
    self.b_encoder_2 = common_theano.init_param('b_encoder_2', (d_e,), np_rng, scale=0.0)
    self.W_encoder_shortcut = common_theano.init_param('W_encoder_shortcut', (d_e, d_v), np_rng, scale=scale)
    self.W_mu = common_theano.init_param('W_mu', (d_t, d_e), np_rng, scale=scale)
    self.b_mu = common_theano.init_param('b_mu', (d_t,), np_rng, scale=0.0)
    self.W_sigma = common_theano.init_param('W_sigma', (d_t, d_e), np_rng, scale=scale, values=np.zeros((d_t, d_e)))
    self.b_sigma = common_theano.init_param('b_sigma', (d_t,), np_rng, scale=0.0, values=np.array([-4] * d_t))
    self.W_generator_1 = common_theano.init_param('W_generator_1', (d_t, d_t), np_rng, scale=scale)
    self.b_generator_1 = common_theano.init_param('b_generator_1', (d_t,), np_rng, scale=0.0)
    self.W_generator_2 = common_theano.init_param('W_generator_2', (d_t, d_t), np_rng, scale=scale)
    self.b_generator_2 = common_theano.init_param('b_generator_2', (d_t,), np_rng, scale=0.0)
    self.W_generator_3 = common_theano.init_param('W_generator_3', (d_t, d_t), np_rng, scale=scale)
    self.b_generator_3 = common_theano.init_param('b_generator_3', (d_t,), np_rng, scale=0.0)
    self.W_generator_4 = common_theano.init_param('W_generator_4', (d_t, d_t), np_rng, scale=scale)
    self.b_generator_4 = common_theano.init_param('b_generator_4', (d_t,), np_rng, scale=0.0)
    self.W_decoder = common_theano.init_param('W_decoder', (d_v, d_t), np_rng, scale=scale)
    self.b_decoder = common_theano.init_param('b_decoder', (d_v,), np_rng, scale=0.0)
    self.W_decoder_label = common_theano.init_param('W_decoder_label', (d_v, n_classes), np_rng, scale=scale)
    self.W_decoder_inter = common_theano.init_param('W_decoder_inter', (d_v, d_t * n_classes), np_rng, scale=scale)

    # set the decoder bias to the background frequency
    if init_bias is not None:
        self.b_decoder = common_theano.init_param('b_decoder', (d_v,), np_rng,
                                                  values=init_bias)

    # create basic sets of parameters which we will use to tell the model what to update
    self.params = [self.W_encoder_1, self.b_encoder_1,
                   self.W_mu, self.b_mu,
                   self.W_sigma, self.b_sigma,
                   self.W_decoder]
    self.param_shapes = [(d_e, d_v), (d_e,), (d_t, d_e), (d_t,),
                         (d_t, d_e), (d_t,), (d_v, d_t)]
    self.encoder_params = [self.W_encoder_1, self.b_encoder_1,
                           self.W_mu, self.b_mu,
                           self.W_sigma, self.b_sigma]
    self.encoder_param_shapes = [(d_e, d_v), (d_e,), (d_t, d_e), (d_t,),
                                 (d_t, d_e), (d_t,)]
    self.generator_params = []
    self.generator_param_shapes = []

    # add additional parameters to sets, depending on configuration
    if train_bias:
        self.params.append(self.b_decoder)
        self.param_shapes.append((d_v,))
        self.decoder_params = [self.W_decoder, self.b_decoder]
        self.decoder_param_shapes = [(d_v, d_t), (d_v,)]
    else:
        self.decoder_params = [self.W_decoder]
        self.decoder_param_shapes = [(d_v, d_t)]

    # add parameters for labels (covariates)
    if self.n_classes > 1:
        self.params.append(self.W_decoder_label)
        self.param_shapes.append((d_v, n_classes))
        self.decoder_params.extend([self.W_decoder_label])
        self.decoder_param_shapes.extend([(d_v, n_classes)])
    if use_interactions:
        self.params.append(self.W_decoder_inter)
        self.param_shapes.append((d_v, d_t * n_classes))
        self.decoder_params.extend([self.W_decoder_inter])
        self.decoder_param_shapes.extend([(d_v, d_t * n_classes)])
    if encode_labels:
        self.params.append(self.W_encoder_label)
        self.param_shapes.append((d_e, n_classes))
        self.encoder_params.extend([self.W_encoder_label])
        self.encoder_param_shapes.extend([(d_e, n_classes)])

    self.label_only_params = [self.W_decoder_label]
    self.label_only_param_shapes = [(d_v, n_classes)]

    # add encoder parameters depending on number of layers
    if self.n_encoder_layers > 1:
        self.params.extend([self.W_encoder_2, self.b_encoder_2])
        self.param_shapes.extend([(d_e, d_e), (d_e,)])
        self.encoder_params.extend([self.W_encoder_2, self.b_encoder_2])
        self.encoder_param_shapes.extend([(d_e, d_e), (d_e,)])
    if self.encoder_shortcut:
        self.params.extend([self.W_encoder_shortcut])
        self.param_shapes.extend([(d_e, d_v)])
        self.encoder_params.extend([self.W_encoder_shortcut])
        self.encoder_param_shapes.extend([(d_e, d_v)])

    # add generator parameters depending on number of layers
    if self.n_generator_layers > 0:
        self.params.extend([self.W_generator_1, self.b_generator_1])
        self.param_shapes.extend([(d_t, d_t), (d_t,)])
        self.generator_params.extend([self.W_generator_1, self.b_generator_1])
        self.generator_param_shapes.extend([(d_t, d_t), (d_t,)])
    if self.n_generator_layers > 1:
        self.params.extend([self.W_generator_2, self.b_generator_2])
        self.param_shapes.extend([(d_t, d_t), (d_t,)])
        self.generator_params.extend([self.W_generator_2, self.b_generator_2])
        self.generator_param_shapes.extend([(d_t, d_t), (d_t,)])
    if self.n_generator_layers > 2:
        self.params.extend([self.W_generator_3, self.b_generator_3,
                            self.W_generator_4, self.b_generator_4])
        self.param_shapes.extend([(d_t, d_t), (d_t,), (d_t, d_t), (d_t,)])
        self.generator_params.extend([self.W_generator_3, self.b_generator_3,
                                      self.W_generator_4, self.b_generator_4])
        self.generator_param_shapes.extend([(d_t, d_t), (d_t,), (d_t, d_t), (d_t,)])

    # declare variables that will be given as inputs to functions to be declared below
    x = T.vector('x', dtype=theano.config.floatX)  # normalized vector of counts for one item
    y = T.vector('y', dtype=theano.config.floatX)  # vector of labels for one item
    indices = T.ivector('x')  # vector of vocab indices (easier to evaluate log prob)
    lr = T.fscalar('lr')                    # learning rate
    l1_strength = T.fscalar('l1_strength')  # L1 penalty strength
    kl_strength = T.fscalar('kl_strength')  # KL penalty strength
    n_words = T.shape(indices)

    # the two variables below are just for debugging
    n_words_print = theano.printing.Print('n_words')(T.shape(indices)[0])
    x_sum = theano.printing.Print('x_sum')(T.sum(x))

    # encode one item to mean and variance vectors
    mu, log_sigma_sq = self.encoder(x, y)

    # take a random sample from the corresponding multivariate normal
    h = self.sampler(mu, log_sigma_sq, th_rng)

    # compute the KL divergence from the prior
    KLD = -0.5 * T.sum(1 + log_sigma_sq - T.square(mu) - T.exp(log_sigma_sq))

    # generate a document representation of dimensionality == n_topics
    r = self.generator(h)

    # decode back into a distribution over the vocabulary
    p_x_given_h = self.decoder(r, y)

    # evaluate the likelihood
    nll_term = -T.sum(
        T.log(p_x_given_h[T.zeros(n_words, dtype='int32'), indices]) + 1e-32)

    # compute the loss
    loss = nll_term + KLD * kl_strength

    # add an L1 penalty to the decoder terms
    if time_penalty and n_classes > 1:
        penalty = common_theano.col_diff_L1(l1_strength, self.W_decoder_label,
                                            n_classes)
    else:
        penalty = common_theano.L1(l1_strength, self.W_decoder)
        if n_classes > 1:
            penalty += common_theano.L1(l1_strength, self.W_decoder_label)
        if use_interactions:
            penalty += common_theano.L1(l1_strength * self.l1_inter_factor,
                                        self.W_decoder_inter)

    # declare some alternate functions for decoding from the mean
    r_mu = self.generator(mu)
    p_x_given_x = self.decoder(r_mu, y)
    nll_term_mu = -T.sum(
        T.log(p_x_given_x[T.zeros(n_words, dtype='int32'), indices]) + 1e-32)

    # declare some alternate functions for pretraining from a fixed document representation (r)
    pretrain_r = T.vector('pretrain_r', dtype=theano.config.floatX)
    p_x_given_pretrain_h = self.decoder(pretrain_r, y)
    pretrain_loss = -T.sum(
        T.log(p_x_given_pretrain_h[T.zeros(n_words, dtype='int32'), indices]) + 1e-32)

    # declare some alternate functions for only using labels
    p_x_given_y_only = self.decoder_label_only(y)
    nll_term_y_only = -T.sum(
        T.log(p_x_given_y_only[T.zeros(n_words, dtype='int32'), indices]) + 1e-32)

    # compute gradients
    gradients = [T.cast(T.grad(loss + penalty, param, disconnected_inputs='warn'),
                        dtype=theano.config.floatX) for param in self.params]
    encoder_gradients = [T.cast(T.grad(loss, param, disconnected_inputs='warn'),
                                dtype=theano.config.floatX)
                         for param in self.encoder_params]
    generator_gradients = [T.cast(T.grad(loss, param, disconnected_inputs='warn'),
                                  dtype=theano.config.floatX)
                           for param in self.generator_params]
    decoder_gradients = [T.cast(T.grad(loss + penalty, param, disconnected_inputs='warn'),
                                dtype=theano.config.floatX)
                         for param in self.decoder_params]
    pretrain_gradients = [T.cast(T.grad(pretrain_loss + penalty, param,
                                        disconnected_inputs='warn'),
                                 dtype=theano.config.floatX)
                          for param in self.decoder_params]
    label_only_gradients = [T.cast(T.grad(nll_term_y_only + penalty, param,
                                          disconnected_inputs='warn'),
                                   dtype=theano.config.floatX)
                            for param in self.label_only_params]

    # optionally clip gradients
    if clip_gradients:
        gradients = common_theano.clip_gradients(gradients, 5)
        encoder_gradients = common_theano.clip_gradients(encoder_gradients, 5)
        generator_gradients = common_theano.clip_gradients(generator_gradients, 5)
        decoder_gradients = common_theano.clip_gradients(decoder_gradients, 5)
        pretrain_gradients = common_theano.clip_gradients(pretrain_gradients, 5)
        label_only_gradients = common_theano.clip_gradients(label_only_gradients, 5)

    # create the updates for various sets of parameters
    updates = optimizer(self.params, self.param_shapes, gradients, lr,
                        optimizer_args)
    encoder_updates = optimizer(self.encoder_params, self.encoder_param_shapes,
                                encoder_gradients, lr, optimizer_args)
    generator_updates = optimizer(self.generator_params, self.generator_param_shapes,
                                  generator_gradients, lr, optimizer_args)
    decoder_updates = optimizer(self.decoder_params, self.decoder_param_shapes,
                                decoder_gradients, lr, optimizer_args)
    other_updates = optimizer(self.encoder_params + self.generator_params,
                              self.encoder_param_shapes + self.generator_param_shapes,
                              encoder_gradients + generator_gradients, lr,
                              optimizer_args)
    pretrain_updates = optimizer(self.decoder_params, self.decoder_param_shapes,
                                 pretrain_gradients, lr, optimizer_args)
    label_only_updates = optimizer(self.label_only_params, self.label_only_param_shapes,
                                   label_only_gradients, lr, optimizer_args)

    # declare the available methods for this class
    self.test_input = theano.function(inputs=[x, indices],
                                      outputs=[n_words_print, x_sum])
    self.train = theano.function(
        inputs=[x, indices, y, lr, l1_strength, kl_strength],
        outputs=[nll_term, KLD, penalty],
        updates=updates, on_unused_input='ignore')
    self.train_encoder = theano.function(
        inputs=[x, indices, y, lr, l1_strength, kl_strength],
        outputs=[nll_term, KLD, penalty],
        updates=encoder_updates, on_unused_input='ignore')
    self.train_generator = theano.function(
        inputs=[x, indices, y, lr, l1_strength, kl_strength],
        outputs=[nll_term, KLD, penalty],
        updates=generator_updates, on_unused_input='ignore')
    self.train_decoder = theano.function(
        inputs=[x, indices, y, lr, l1_strength, kl_strength],
        outputs=[nll_term, KLD, penalty],
        updates=decoder_updates, on_unused_input='ignore')
    self.train_not_decoder = theano.function(
        inputs=[x, indices, y, lr, l1_strength, kl_strength],
        outputs=[nll_term, KLD, penalty],
        updates=other_updates, on_unused_input='ignore')
    self.pretrain_decoder = theano.function(
        inputs=[indices, y, pretrain_r, lr, l1_strength, kl_strength],
        outputs=[pretrain_loss],
        updates=pretrain_updates, on_unused_input='ignore')
    self.encode = theano.function(inputs=[x, y],
                                  outputs=[mu, log_sigma_sq],
                                  on_unused_input='ignore')
    self.decode = theano.function(inputs=[pretrain_r, y],
                                  outputs=[p_x_given_pretrain_h],
                                  on_unused_input='ignore')
    self.sample = theano.function(inputs=[x, y], outputs=h,
                                  on_unused_input='ignore')
    self.get_mean_doc_rep = theano.function(inputs=[x, y], outputs=r_mu,
                                            on_unused_input='ignore')
    self.encode_and_decode = theano.function(inputs=[x, y],
                                             outputs=p_x_given_x,
                                             on_unused_input='ignore')
    self.neg_log_likelihood = theano.function(inputs=[x, indices, y],
                                              outputs=[nll_term, KLD],
                                              on_unused_input='ignore')
    self.neg_log_likelihood_mu = theano.function(
        inputs=[x, indices, y],
        outputs=[nll_term_mu, KLD],
        on_unused_input='ignore')
    self.train_label_only = theano.function(
        inputs=[indices, y, lr, l1_strength],
        outputs=[nll_term_y_only, penalty],
        updates=label_only_updates)
    self.neg_log_likelihood_label_only = theano.function(
        inputs=[indices, y], outputs=nll_term_y_only)
def dropout_layer(layer, p_dropout):
    srng = shared_randomstreams.RandomStreams(
        np.random.RandomState(0).randint(999999))
    mask = srng.binomial(n=1, p=1 - p_dropout, size=layer.shape)
    return layer * T.cast(mask, theano.config.floatX)
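# A minimal sketch of applying the layer above, assuming the module-level
# `import numpy as np`, `import theano`, `import theano.tensor as T`, and
# `from theano.tensor import shared_randomstreams`.
x = T.matrix('x')
f = theano.function([x], dropout_layer(x, p_dropout=0.5))
out = f(np.ones((2, 4), dtype=theano.config.floatX))  # roughly half the entries zeroed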
def max_pool_3d(input, ds, ignore_border=False):
    """
    Takes as input an N-D tensor, where N >= 3. It downscales the input
    video by the specified factor, by keeping only the maximum value of
    non-overlapping patches of size (ds[0], ds[1], ds[2]) (time, height,
    width).

    :type input: N-D theano tensor of input images.
    :param input: input images. Max pooling will be done over the 3 last
        dimensions.
    :type ds: tuple of length 3
    :param ds: factor by which to downscale. (2,2,2) will halve the video
        in each dimension.
    :param ignore_border: boolean value. When True, a (5,5,5) input with
        ds=(2,2,2) will generate a (2,2,2) output; (3,3,3) otherwise.
    """
    if input.ndim < 3:
        raise NotImplementedError('max_pool_3d requires a dimension >= 3')

    # extract nr dimensions
    vid_dim = input.ndim
    # max pool in two different steps, so we can use the 2d implementation of
    # downsamplefactormax. First maxpool frames as usual.
    # Then maxpool the time dimension. Shift the time dimension to the third
    # position, so rows and cols are in the back

    # extract dimensions
    frame_shape = input.shape[-2:]

    # count the number of "leading" dimensions, store as dmatrix
    batch_size = T.prod(input.shape[:-2])
    batch_size = T.shape_padright(batch_size, 1)

    # store as 4D tensor with shape: (batch_size,1,height,width)
    new_shape = T.cast(T.join(0, batch_size, T.as_tensor([1, ]), frame_shape),
                       'int32')
    input_4D = T.reshape(input, new_shape, ndim=4)

    # downsample mini-batch of videos in rows and cols
    output = T.signal.pool.pool_2d(input_4D, (ds[1], ds[2]), ignore_border)

    # restore to original shape
    outshape = T.join(0, input.shape[:-2], output.shape[-2:])
    out = T.reshape(output, outshape, ndim=input.ndim)

    # now maxpool time
    # output (time, rows, cols), reshape so that time is in the back
    shufl = (list(range(vid_dim - 3)) + [vid_dim - 2] + [vid_dim - 1]
             + [vid_dim - 3])
    input_time = out.dimshuffle(shufl)

    # reset dimensions
    vid_shape = input_time.shape[-2:]

    # count the number of "leading" dimensions, store as dmatrix
    batch_size = T.prod(input_time.shape[:-2])
    batch_size = T.shape_padright(batch_size, 1)

    # store as 4D tensor with shape: (batch_size,1,width,time)
    new_shape = T.cast(T.join(0, batch_size, T.as_tensor([1, ]), vid_shape),
                       'int32')
    input_4D_time = T.reshape(input_time, new_shape, ndim=4)

    # downsample mini-batch of videos in time
    outtime = T.signal.pool.pool_2d(input_4D_time, (1, ds[0]), ignore_border)

    # restore to original shape (xxx, rows, cols, time)
    outshape = T.join(0, input_time.shape[:-2], outtime.shape[-2:])
    shufl = (list(range(vid_dim - 3)) + [vid_dim - 1] + [vid_dim - 3]
             + [vid_dim - 2])
    return T.reshape(outtime, outshape, ndim=input.ndim).dimshuffle(shufl)
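# Usage sketch on a 5D video batch (batch, channels, time, height, width);
# the shapes are assumptions for illustration.
import theano.tensor.signal.pool  # makes T.signal.pool available
vid = T.TensorType(theano.config.floatX, (False,) * 5)('vid')
pooled = max_pool_3d(vid, ds=(2, 2, 2), ignore_border=True)
f = theano.function([vid], pooled)
clip = np.random.rand(4, 3, 8, 16, 16).astype(theano.config.floatX)
print(f(clip).shape)  # (4, 3, 4, 8, 8): time, height, width each halved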
update = []

# shared variables
learning_rate = shared(float32(lr.init))
if use.mom:
    momentum = shared(float32(mom.momentum))
drop.p_vid = shared(float32(drop.p_vid_val))
drop.p_hidden = shared(float32(drop.p_hidden_val))

idx_mini = T.lscalar(name="idx_mini")    # minibatch index
idx_micro = T.lscalar(name="idx_micro")  # microbatch index

x = ndtensor(len(tr.in_shape))(name='x')  # video input
y = T.ivector(name='y')                   # labels
x_ = _shared(empty(tr.in_shape))
y_ = _shared(empty(tr.batch_size))
y_int32 = T.cast(y_, 'int32')

# in shape: #frames * gray/depth * body/hand * 4 maps
import cPickle
f = open(os.path.join(load_path, 'SK_normalization.pkl'), 'rb')
SK_normalization = cPickle.load(f)
Mean1 = SK_normalization['Mean1']
Std1 = SK_normalization['Std1']

f = open('CNN_normalization.pkl', 'rb')
CNN_normalization = cPickle.load(f)
Mean_CNN = CNN_normalization['Mean_CNN']
Std_CNN = CNN_normalization['Std_CNN']

# customized data loader for both video module and skeleton module
loader = DataLoader_with_skeleton_normalisation(
def __init__(self, rng=None, Xd=None, prior_sigma=None,
             params=None, shared_param_dicts=None):
    # Setup a shared random generator for this network
    self.rng = RandStream(rng.randint(1000000))
    # Grab the symbolic input matrix
    self.Xd = Xd
    self.prior_sigma = prior_sigma
    #####################################################
    # Process user-supplied parameters for this network #
    #####################################################
    self.params = params
    self.lam_l2a = params['lam_l2a']
    if 'build_theano_funcs' in params:
        self.build_theano_funcs = params['build_theano_funcs']
    else:
        self.build_theano_funcs = True
    if 'vis_drop' in params:
        self.vis_drop = params['vis_drop']
    else:
        self.vis_drop = 0.0
    if 'hid_drop' in params:
        self.hid_drop = params['hid_drop']
    else:
        self.hid_drop = 0.0
    if 'input_noise' in params:
        self.input_noise = params['input_noise']
    else:
        self.input_noise = 0.0
    if 'bias_noise' in params:
        self.bias_noise = params['bias_noise']
    else:
        self.bias_noise = 0.0
    if 'init_scale' in params:
        self.init_scale = params['init_scale']
    else:
        self.init_scale = 1.0
    if 'encoder' in params:
        self.encoder = params['encoder']
        self.decoder = params['decoder']
        self.use_encoder = True
        self.Xd_encoded = self.encoder(self.Xd)
    else:
        self.encoder = lambda x: x
        self.decoder = lambda x: x
        self.use_encoder = False
        self.Xd_encoded = self.encoder(self.Xd)
    if 'kld2_scale' in params:
        self.kld2_scale = params['kld2_scale']
    else:
        self.kld2_scale = 0.0
    if 'sigma_init_scale' in params:
        self.sigma_init_scale = params['sigma_init_scale']
    else:
        self.sigma_init_scale = 1.0

    # Check if the params for this net were given a priori. This option
    # will be used for creating "clones" of an inference network, with all
    # of the network parameters shared between clones.
    if shared_param_dicts is None:
        # This is not a clone, and we will need to make a dict for
        # referring to the parameters of each network layer
        self.shared_param_dicts = {'shared': [], 'mu': [], 'sigma': []}
        self.is_clone = False
    else:
        # This is a clone, and its layer parameters can be found by
        # referring to the given param dict (i.e. shared_param_dicts).
        self.shared_param_dicts = shared_param_dicts
        self.is_clone = True

    # Get the configuration/prototype for this network. The config is a
    # list of layer descriptions, including a description for the input
    # layer, which is typically just the dimension of the inputs. So, the
    # depth of the mlp is one less than the number of layer configs.
    self.shared_config = params['shared_config']
    self.mu_config = params['mu_config']
    self.sigma_config = params['sigma_config']
    if 'activation' in params:
        self.activation = params['activation']
    else:
        self.activation = relu_actfun

    #########################################
    # Initialize the shared part of network #
    #########################################
    self.shared_layers = []
    layer_def_pairs = zip(self.shared_config[:-1], self.shared_config[1:])
    layer_num = 0
    # Construct input to the inference network
    if self.use_encoder:
        next_input = self.encoder(self.Xd)
    else:
        next_input = self.Xd
    for in_def, out_def in layer_def_pairs:
        first_layer = (layer_num == 0)
        last_layer = (layer_num == (len(layer_def_pairs) - 1))
        l_name = "share_layer_{0:d}".format(layer_num)
        if (type(in_def) is list) or (type(in_def) is tuple):
            # Receiving input from a poolish layer...
            in_dim = in_def[0]
        else:
            # Receiving input from a normal layer...
            in_dim = in_def
        if (type(out_def) is list) or (type(out_def) is tuple):
            # Applying some sort of pooling in this layer...
            out_dim = out_def[0]
            pool_size = out_def[1]
        else:
            # Not applying any pooling in this layer...
            out_dim = out_def
            pool_size = 0

        # Select the appropriate noise to add to this layer
        if first_layer:
            d_rate = self.vis_drop
        else:
            d_rate = self.hid_drop
        if first_layer:
            i_noise = self.input_noise
            b_noise = 0.0
        else:
            i_noise = 0.0
            b_noise = self.bias_noise
        # set in-bound weights to have norm self.init_scale
        i_scale = self.init_scale
        if not self.is_clone:
            ##########################################
            # Initialize a layer with new parameters #
            ##########################################
            new_layer = HiddenLayer(rng=rng, input=next_input,
                                    activation=self.activation,
                                    pool_size=pool_size,
                                    drop_rate=d_rate, input_noise=i_noise,
                                    bias_noise=b_noise,
                                    in_dim=in_dim, out_dim=out_dim,
                                    name=l_name, W_scale=i_scale)
            self.shared_layers.append(new_layer)
            self.shared_param_dicts['shared'].append(
                {'W': new_layer.W, 'b': new_layer.b,
                 'b_in': new_layer.b_in, 's_in': new_layer.s_in})
        else:
            ##################################################
            # Initialize a layer with some shared parameters #
            ##################################################
            init_params = self.shared_param_dicts['shared'][layer_num]
            if not (('b_in' in init_params) and ('s_in' in init_params)):
                init_params['b_in'] = None
                init_params['s_in'] = None
            new_layer = HiddenLayer(rng=rng, input=next_input,
                                    activation=self.activation,
                                    pool_size=pool_size,
                                    drop_rate=d_rate, input_noise=i_noise,
                                    bias_noise=b_noise,
                                    in_dim=in_dim, out_dim=out_dim,
                                    W=init_params['W'], b=init_params['b'],
                                    b_in=init_params['b_in'],
                                    s_in=init_params['s_in'],
                                    name=l_name, W_scale=i_scale)
            self.shared_layers.append(new_layer)
            if ((init_params['b_in'] is None) or (init_params['s_in'] is None)):
                init_params['b_in'] = new_layer.b_in
                init_params['s_in'] = new_layer.s_in
        next_input = self.shared_layers[-1].output
        # Acknowledge layer completion
        layer_num = layer_num + 1

    #####################################
    # Initialize the mu part of network #
    #####################################
    self.mu_layers = []
    layer_def_pairs = zip(self.mu_config[:-1], self.mu_config[1:])
    layer_num = 0
    # Take input from the output of the shared network
    next_input = self.shared_layers[-1].output
    for in_def, out_def in layer_def_pairs:
        first_layer = (layer_num == 0)
        last_layer = (layer_num == (len(layer_def_pairs) - 1))
        l_name = "mu_layer_{0:d}".format(layer_num)
        if (type(in_def) is list) or (type(in_def) is tuple):
            # Receiving input from a poolish layer...
            in_dim = in_def[0]
        else:
            # Receiving input from a normal layer...
            in_dim = in_def
        if (type(out_def) is list) or (type(out_def) is tuple):
            # Applying some sort of pooling in this layer...
            out_dim = out_def[0]
            pool_size = out_def[1]
        else:
            # Not applying any pooling in this layer...
            out_dim = out_def
            pool_size = 0

        # Select the appropriate noise to add to this layer
        d_rate = self.hid_drop
        i_noise = 0.0
        b_noise = self.bias_noise
        # set in-bound weights to have norm self.init_scale
        i_scale = self.init_scale
        if not self.is_clone:
            ##########################################
            # Initialize a layer with new parameters #
            ##########################################
            new_layer = HiddenLayer(rng=rng, input=next_input,
                                    activation=self.activation,
                                    pool_size=pool_size,
                                    drop_rate=d_rate, input_noise=i_noise,
                                    bias_noise=b_noise,
                                    in_dim=in_dim, out_dim=out_dim,
                                    name=l_name, W_scale=i_scale)
            self.mu_layers.append(new_layer)
            self.shared_param_dicts['mu'].append(
                {'W': new_layer.W, 'b': new_layer.b,
                 'b_in': new_layer.b_in, 's_in': new_layer.s_in})
        else:
            ##################################################
            # Initialize a layer with some shared parameters #
            ##################################################
            init_params = self.shared_param_dicts['mu'][layer_num]
            if not (('b_in' in init_params) and ('s_in' in init_params)):
                init_params['b_in'] = None
                init_params['s_in'] = None
            new_layer = HiddenLayer(rng=rng, input=next_input,
                                    activation=self.activation,
                                    pool_size=pool_size,
                                    drop_rate=d_rate, input_noise=i_noise,
                                    bias_noise=b_noise,
                                    in_dim=in_dim, out_dim=out_dim,
                                    W=init_params['W'], b=init_params['b'],
                                    b_in=init_params['b_in'],
                                    s_in=init_params['s_in'],
                                    name=l_name, W_scale=i_scale)
            self.mu_layers.append(new_layer)
            if ((init_params['b_in'] is None) or (init_params['s_in'] is None)):
                init_params['b_in'] = new_layer.b_in
                init_params['s_in'] = new_layer.s_in
        next_input = self.mu_layers[-1].output
        # Acknowledge layer completion
        layer_num = layer_num + 1

    ########################################
    # Initialize the sigma part of network #
    ########################################
    self.sigma_layers = []
    layer_def_pairs = zip(self.sigma_config[:-1], self.sigma_config[1:])
    layer_num = 0
    # Take input from the output of the shared network
    next_input = self.shared_layers[-1].output
    for in_def, out_def in layer_def_pairs:
        first_layer = (layer_num == 0)
        last_layer = (layer_num == (len(layer_def_pairs) - 1))
        l_name = "sigma_layer_{0:d}".format(layer_num)
        if (type(in_def) is list) or (type(in_def) is tuple):
            # Receiving input from a poolish layer...
            in_dim = in_def[0]
        else:
            # Receiving input from a normal layer...
            in_dim = in_def
        if (type(out_def) is list) or (type(out_def) is tuple):
            # Applying some sort of pooling in this layer...
            out_dim = out_def[0]
            pool_size = out_def[1]
        else:
            # Not applying any pooling in this layer...
            out_dim = out_def
            pool_size = 0

        # Select the appropriate noise to add to this layer
        d_rate = self.hid_drop
        i_noise = 0.0
        b_noise = self.bias_noise
        # set in-bound weights to have norm self.init_scale
        i_scale = self.init_scale
        if last_layer:
            # set in-bound weights for logvar predictions to 0
            i_scale = 0.0 * i_scale
        if not self.is_clone:
            ##########################################
            # Initialize a layer with new parameters #
            ##########################################
            new_layer = HiddenLayer(rng=rng, input=next_input,
                                    activation=self.activation,
                                    pool_size=pool_size,
                                    drop_rate=d_rate, input_noise=i_noise,
                                    bias_noise=b_noise,
                                    in_dim=in_dim, out_dim=out_dim,
                                    name=l_name, W_scale=i_scale)
            self.sigma_layers.append(new_layer)
            self.shared_param_dicts['sigma'].append(
                {'W': new_layer.W, 'b': new_layer.b,
                 'b_in': new_layer.b_in, 's_in': new_layer.s_in})
        else:
            ##################################################
            # Initialize a layer with some shared parameters #
            ##################################################
            init_params = self.shared_param_dicts['sigma'][layer_num]
            if not (('b_in' in init_params) and ('s_in' in init_params)):
                init_params['b_in'] = None
                init_params['s_in'] = None
            new_layer = HiddenLayer(rng=rng, input=next_input,
                                    activation=self.activation,
                                    pool_size=pool_size,
                                    drop_rate=d_rate, input_noise=i_noise,
                                    bias_noise=b_noise,
                                    in_dim=in_dim, out_dim=out_dim,
                                    W=init_params['W'], b=init_params['b'],
                                    b_in=init_params['b_in'],
                                    s_in=init_params['s_in'],
                                    name=l_name, W_scale=i_scale)
            self.sigma_layers.append(new_layer)
            if ((init_params['b_in'] is None) or (init_params['s_in'] is None)):
                init_params['b_in'] = new_layer.b_in
                init_params['s_in'] = new_layer.s_in
        next_input = self.sigma_layers[-1].output
        # Acknowledge layer completion
        layer_num = layer_num + 1

    # Create a shared parameter for rescaling posterior "sigmas" to allow
    # control over the velocity of the markov chain generated by repeated
    # cycling through the INF -> GEN loop.
    if not ('sigma_scale' in self.shared_param_dicts['sigma'][-1]):
        # we use a hack-ish check to remain compatible with loading models
        # that were saved before the addition of the sigma_scale param.
        zero_ary = np.zeros((1,)).astype(theano.config.floatX)
        self.sigma_scale = theano.shared(value=zero_ary)
        new_dict = {'sigma_scale': self.sigma_scale}
        self.shared_param_dicts['sigma'].append(new_dict)
        self.set_sigma_scale(1.0)
    else:
        # this is a clone of some other InfNet, and that InfNet was made
        # after adding the sigma_scale param, so use its sigma_scale
        self.sigma_scale = \
            self.shared_param_dicts['sigma'][-1]['sigma_scale']

    # Create a shared parameter for maintaining an exponentially decaying
    # estimate of the population mean of posterior KL divergence.
    if not ('kld_mean' in self.shared_param_dicts['sigma'][-1]):
        # add a kld_mean if none was already present
        zero_ary = np.zeros((1,)).astype(theano.config.floatX) + 100.0
        self.kld_mean = theano.shared(value=zero_ary)
        self.shared_param_dicts['sigma'][-1]['kld_mean'] = self.kld_mean
    else:
        # use a kld_mean that's already present
        self.kld_mean = self.shared_param_dicts['sigma'][-1]['kld_mean']

    # Mash all the parameters together, into a list.
    self.mlp_params = []
    for layer in self.shared_layers:
        self.mlp_params.extend(layer.params)
    for layer in self.mu_layers:
        self.mlp_params.extend(layer.params)
    for layer in self.sigma_layers:
        self.mlp_params.extend(layer.params)

    # The output of this inference network is given by the noisy output
    # of the final layers of its mu and sigma networks.
        self.output_mean = self.mu_layers[-1].linear_output
        self.output_logvar = self.sigma_layers[-1].linear_output
        self.output_sigma = self.sigma_init_scale * self.sigma_scale[0] * \
                T.exp(0.5 * self.output_logvar)
        # We'll also construct an output containing a single sample from each
        # of the distributions represented by the rows of self.output_mean
        # and self.output_sigma.
        self.output = self._construct_post_samples()
        self.out_dim = self.sigma_layers[-1].out_dim
        # Get simple regularization penalty to moderate activation dynamics
        self.act_reg_cost = self.lam_l2a * self._act_reg_cost()
        # Construct a function for penalizing KL divergence between the
        # approximate posteriors produced by this model and an isotropic
        # Gaussian distribution.
        self.kld_cost = self._construct_kld_cost()
        self.kld_mean_update = T.cast((0.98 * self.kld_mean) + \
                (0.02 * T.mean(self.kld_cost)), 'floatX')
        # Construct a theano function for sampling from the approximate
        # posteriors inferred by this model for some collection of points
        # in the "data space".
        if self.build_theano_funcs:
            self.sample_posterior = self._construct_sample_posterior()
            self.mean_posterior = theano.function([self.Xd], \
                    outputs=self.output_mean)
        else:
            self.sample_posterior = None
            self.mean_posterior = None
        return
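# A minimal sketch of what the posterior sampler above is assumed to do:
# draw eps ~ N(0, I) from a RandomStreams object and form the standard
# reparameterized sample mu + sigma * eps. The helper name and the use of a
# `theano_rng` argument are assumptions for illustration; the actual
# _construct_post_samples may differ.
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

def reparameterized_samples(output_mean, output_sigma, theano_rng):
    # eps has the same (symbolic) shape as the matrix of posterior means
    eps = theano_rng.normal(size=output_mean.shape,
                            avg=0.0, std=1.0,
                            dtype=theano.config.floatX)
    # one sample per row of (mean, sigma)
    return output_mean + (output_sigma * eps)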
def __init__(self, n_dim, n_out, n_chan=1, n_superbatch=12800, opt_alg='adam',
             opt_params={'lr': 1e-3, 'b1': 0.9, 'b2': 0.99}):
    self.numpy_rng = np.random.RandomState(1234)
    self.theano_rng = RandomStreams(self.numpy_rng.randint(2**30))
    self.n_dim = n_dim
    self.n_out = n_out
    self.n_superbatch = n_superbatch
    self.alg = opt_alg
    self.n_class = 10
    lr = opt_params.get('lr')
    n_batch = opt_params.get('nb')
    # superbatch buffers that hold a chunk of the dataset on the device
    train_set_x = theano.shared(
        np.empty((n_superbatch, n_chan, n_dim, n_dim),
                 dtype=theano.config.floatX),
        borrow=False,
    )
    val_set_x = theano.shared(
        np.empty((n_superbatch, n_chan, n_dim, n_dim),
                 dtype=theano.config.floatX),
        borrow=False,
    )
    train_set_y = theano.shared(
        np.empty((n_superbatch,), dtype=theano.config.floatX),
        borrow=False,
    )
    val_set_y = theano.shared(
        np.empty((n_superbatch,), dtype=theano.config.floatX),
        borrow=False,
    )
    train_set_y_int = T.cast(train_set_y, 'int32')
    val_set_y_int = T.cast(val_set_y, 'int32')
    # self.n_aux (like self.n_hidden below) is assumed to be set on the
    # class before this point, e.g. as a class attribute
    train_rbm_px_mu = theano.shared(
        np.empty((n_superbatch, self.n_aux), dtype=theano.config.floatX),
        borrow=False,
    )
    X = T.tensor4(dtype=theano.config.floatX)
    S = T.tensor3(dtype=theano.config.floatX)
    Y = T.ivector()
    # RBM-inferred means fed to the DADGM: a floatX matrix, matching the
    # train_rbm_px_mu shared variable it is bound to in the givens below
    px_mu = T.matrix(dtype=theano.config.floatX)
    idx1, idx2 = T.lscalar(), T.lscalar()
    alpha = T.scalar(dtype=theano.config.floatX)  # learning rate
    self.inputs = (X, Y, idx1, idx2, S, px_mu)
    # ----------------------------
    # Begin RBM-only
    self.rbm_network = self.create_rbm_model(n_dim, n_out, n_chan)
    persistent_chain = theano.shared(
        np.zeros((n_batch, self.n_hidden), dtype=theano.config.floatX),
        borrow=True,
    )
    rbm_cost, rbm_acc, rbm_updates = self.get_rbm_objective_and_updates(
        alpha, lr=lr, persistent=persistent_chain,
    )
    self.rbm_objectives = (rbm_cost, rbm_acc)
    self.rbm_train = theano.function(
        [idx1, idx2, alpha], [rbm_cost, rbm_acc],
        updates=rbm_updates,
        givens={X: train_set_x[idx1:idx2], Y: train_set_y_int[idx1:idx2]},
        on_unused_input='warn',
    )
    # End RBM-only
    # ----------------------------
    # Begin DADGM-only
    tau = theano.shared(
        np.float32(5.0), name='temperature',
        allow_downcast=True, borrow=False,
    )
    self.tau = tau
    self.dadgm_network = self.create_dadgm_model(
        X, Y, n_dim, n_out, n_chan,
    )
    dadgm_loss, dadgm_acc = self.create_dadgm_objectives(False)
    self.dadgm_objectives = (dadgm_loss, dadgm_acc)
    dadgm_params = self.get_dadgm_params()
    dadgm_grads = self.create_dadgm_gradients(dadgm_loss, False)
    dadgm_updates = self.create_dadgm_updates(
        dadgm_grads, dadgm_params, alpha, opt_alg, opt_params,
    )
    self.dadgm_train = theano.function(
        [idx1, idx2, alpha], [dadgm_loss, dadgm_acc],
        updates=dadgm_updates,
        givens={
            X: train_set_x[idx1:idx2],
            Y: train_set_y_int[idx1:idx2],
            px_mu: train_rbm_px_mu,
        },
        on_unused_input='warn',
    )
    self.dadgm_loss = theano.function(
        [X, Y], [dadgm_loss, dadgm_acc],
        on_unused_input='warn',
    )
    # End DADGM-only
    # ----------------------------
    self.n_batch = n_batch
    # parameters for sampling
    self.n_chain = 100
    # save data variables
    self.train_set_x = train_set_x
    self.train_set_y = train_set_y
    self.val_set_x = val_set_x
    self.val_set_y = val_set_y
    self.train_rbm_px_mu = train_rbm_px_mu
    self.data_loaded = False
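# A usage sketch for the constructor above (assumptions: a `model` instance
# of this class with data already copied into its superbatch shared
# variables, and n_batch dividing n_superbatch). It shows the calling
# convention of the compiled rbm_train / dadgm_train functions: slice
# indices into the superbatch plus a learning rate.
def train_one_superbatch(model, alpha=1e-3):
    n_batch = model.n_batch
    for i in range(model.n_superbatch // n_batch):
        idx1, idx2 = i * n_batch, (i + 1) * n_batch
        rbm_cost, rbm_acc = model.rbm_train(idx1, idx2, alpha)
        dadgm_loss, dadgm_acc = model.dadgm_train(idx1, idx2, alpha)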
def evaluate_lenet5(learning_rate=0.0001, n_epochs=2000, nkerns=[256, 256], batch_size=1, window_width=[4, 4], maxSentLength=64, emb_size=300, hidden_size=200, margin=0.5, L2_weight=0.0006, Div_reg=0.06, update_freq=1, norm_threshold=5.0, max_truncate=40): maxSentLength = max_truncate + 2 * (window_width[0] - 1) model_options = locals().copy() print "model options", model_options rootPath = '/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/' rng = numpy.random.RandomState(23455) datasets, vocab_size = load_wikiQA_corpus( rootPath + 'vocab.txt', rootPath + 'WikiQA-train.txt', rootPath + 'test_filtered.txt', max_truncate, maxSentLength) #vocab_size contain train, dev and test #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test mtPath = '/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/' mt_train, mt_test = load_mts_wikiQA( mtPath + 'result_train/concate_2mt_train.txt', mtPath + 'result_test/concate_2mt_test.txt') wm_train, wm_test = load_wmf_wikiQA( rootPath + 'train_word_matching_scores.txt', rootPath + 'test_word_matching_scores.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt') indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad = datasets[ 0] indices_train_l = indices_train[::2, :] indices_train_r = indices_train[1::2, :] trainLengths_l = trainLengths[::2] trainLengths_r = trainLengths[1::2] normalized_train_length_l = normalized_train_length[::2] normalized_train_length_r = normalized_train_length[1::2] trainLeftPad_l = trainLeftPad[::2] trainLeftPad_r = trainLeftPad[1::2] trainRightPad_l = trainRightPad[::2] trainRightPad_r = trainRightPad[1::2] indices_test, testY, testLengths, normalized_test_length, testLeftPad, testRightPad = datasets[ 1] indices_test_l = indices_test[::2, :] indices_test_r = indices_test[1::2, :] testLengths_l = testLengths[::2] testLengths_r = testLengths[1::2] normalized_test_length_l = normalized_test_length[::2] normalized_test_length_r = normalized_test_length[1::2] testLeftPad_l = testLeftPad[::2] testLeftPad_r = testLeftPad[1::2] testRightPad_l = testRightPad[::2] testRightPad_r = testRightPad[1::2] n_train_batches = indices_train_l.shape[0] / batch_size n_test_batches = indices_test_l.shape[0] / batch_size train_batch_start = list(numpy.arange(n_train_batches) * batch_size) test_batch_start = list(numpy.arange(n_test_batches) * batch_size) indices_train_l = theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) indices_train_r = theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) indices_test_l = theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) indices_test_r = theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) indices_train_l = T.cast(indices_train_l, 'int64') indices_train_r = T.cast(indices_train_r, 'int64') indices_test_l = T.cast(indices_test_l, 'int64') indices_test_r = T.cast(indices_test_r, 'int64') rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values = load_word2vec_to_init(rand_values, rootPath + 'vocab_embs_300d.txt') embeddings = 
theano.shared(value=rand_values, borrow=True) #cost_tmp=0 error_sum = 0 # allocate symbolic variables for the data index = T.lscalar() x_index_l = T.lmatrix( 'x_index_l') # now, x is the index matrix, must be integer x_index_r = T.lmatrix('x_index_r') y = T.lvector('y') left_l = T.lscalar() right_l = T.lscalar() left_r = T.lscalar() right_r = T.lscalar() length_l = T.lscalar() length_r = T.lscalar() norm_length_l = T.dscalar() norm_length_r = T.dscalar() mts = T.dmatrix() wmf = T.dmatrix() cost_tmp = T.dscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # this is the size of MNIST images filter_size = (emb_size, window_width[0]) filter_size_2 = (nkerns[0], window_width[1]) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? length_after_wideConv = ishape[1] + filter_size[1] - 1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_l_input = embeddings[x_index_l.flatten()].reshape( (maxSentLength, emb_size)).transpose() layer0_r_input = embeddings[x_index_r.flatten()].reshape( (maxSentLength, emb_size)).transpose() l_input_tensor = debug_print( Matrix_Bit_Shift(layer0_l_input[:, left_l:-right_l]), 'l_input_tensor') r_input_tensor = debug_print( Matrix_Bit_Shift(layer0_r_input[:, left_r:-right_r]), 'r_input_tensor') addition_l = T.sum(layer0_l_input[:, left_l:-right_l], axis=1) addition_r = T.sum(layer0_r_input[:, left_r:-right_r], axis=1) cosine_addition = cosine(addition_l, addition_r) eucli_addition = 1.0 / (1.0 + EUCLID(addition_l, addition_r)) #25.2% U, W, b = create_GRU_para(rng, emb_size, nkerns[0]) layer0_para = [U, W, b] layer0_A1 = GRU_Batch_Tensor_Input(X=l_input_tensor, hidden_dim=nkerns[0], U=U, W=W, b=b, bptt_truncate=-1) layer0_A2 = GRU_Batch_Tensor_Input(X=r_input_tensor, hidden_dim=nkerns[0], U=U, W=W, b=b, bptt_truncate=-1) cosine_sent = cosine(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep) eucli_sent = 1.0 / (1.0 + EUCLID(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep)) #25.2% #ibm attentive pooling at extended sentence level attention_matrix = compute_simi_feature_matrix_with_matrix( layer0_A1.output_matrix, layer0_A2.output_matrix, layer0_A1.dim, layer0_A2.dim, maxSentLength * (maxSentLength + 1) / 2) # attention_vec_l_extended=T.nnet.softmax(T.max(attention_matrix, axis=1)).transpose() # ibm_l_extended=layer0_A1.output_matrix.dot(attention_vec_l_extended).transpose() # attention_vec_r_extended=T.nnet.softmax(T.max(attention_matrix, axis=0)).transpose() # ibm_r_extended=layer0_A2.output_matrix.dot(attention_vec_r_extended).transpose() # cosine_ibm_extended=cosine(ibm_l_extended, ibm_r_extended) # eucli_ibm_extended=1.0/(1.0+EUCLID(ibm_l_extended, ibm_r_extended))#25.2% #ibm attentive pooling at original sentence level simi_matrix_sent = compute_simi_feature_matrix_with_matrix( layer0_A1.output_sent_hiddenstates, layer0_A2.output_sent_hiddenstates, length_l, length_r, maxSentLength) attention_vec_l = T.nnet.softmax(T.max(simi_matrix_sent, axis=1)).transpose() ibm_l = layer0_A1.output_sent_hiddenstates.dot(attention_vec_l).transpose() attention_vec_r = T.nnet.softmax(T.max(simi_matrix_sent, axis=0)).transpose() ibm_r = layer0_A2.output_sent_hiddenstates.dot(attention_vec_r).transpose() 
cosine_ibm = cosine(ibm_l, ibm_r) eucli_ibm = 1.0 / (1.0 + EUCLID(ibm_l, ibm_r)) #25.2% l_max_attention = T.max(attention_matrix, axis=1) neighborsArgSorted = T.argsort(l_max_attention) kNeighborsArg = neighborsArgSorted[-3:] #only average the max 3 vectors ll = T.sort(kNeighborsArg).flatten() # make y indices in acending lie r_max_attention = T.max(attention_matrix, axis=0) neighborsArgSorted_r = T.argsort(r_max_attention) kNeighborsArg_r = neighborsArgSorted_r[ -3:] #only average the max 3 vectors rr = T.sort(kNeighborsArg_r).flatten() # make y indices in acending lie l_max_min_attention = debug_print(layer0_A1.output_matrix[:, ll], 'l_max_min_attention') r_max_min_attention = debug_print(layer0_A2.output_matrix[:, rr], 'r_max_min_attention') U1, W1, b1 = create_GRU_para(rng, nkerns[0], nkerns[1]) layer1_para = [U1, W1, b1] layer1_A1 = GRU_Matrix_Input(X=l_max_min_attention, word_dim=nkerns[0], hidden_dim=nkerns[1], U=U1, W=W1, b=b1, bptt_truncate=-1) layer1_A2 = GRU_Matrix_Input(X=r_max_min_attention, word_dim=nkerns[0], hidden_dim=nkerns[1], U=U1, W=W1, b=b1, bptt_truncate=-1) vec_l = debug_print(layer1_A1.output_vector_last.reshape((1, nkerns[1])), 'vec_l') vec_r = debug_print(layer1_A2.output_vector_last.reshape((1, nkerns[1])), 'vec_r') # sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size)) # aver_uni_l=sum_uni_l/layer0_l_input.shape[3] # norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum()) # sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size)) # aver_uni_r=sum_uni_r/layer0_r_input.shape[3] # norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum()) # uni_cosine = cosine(vec_l, vec_r) # aver_uni_cosine=cosine(aver_uni_l, aver_uni_r) # uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi') # ''' # linear=Linear(sum_uni_l, sum_uni_r) # poly=Poly(sum_uni_l, sum_uni_r) # sigmoid=Sigmoid(sum_uni_l, sum_uni_r) # rbf=RBF(sum_uni_l, sum_uni_r) # gesd=GESD(sum_uni_l, sum_uni_r) # ''' eucli_1 = 1.0 / (1.0 + EUCLID(vec_l, vec_r)) #25.2% # #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r)) # len_l = norm_length_l.reshape((1, 1)) len_r = norm_length_r.reshape((1, 1)) # # ''' # len_l=length_l.reshape((1,1)) # len_r=length_r.reshape((1,1)) # ''' #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1)) #length_gap=T.sqrt((len_l-len_r)**2) #layer3_input=mts layer3_input = T.concatenate( [ vec_l, vec_r, uni_cosine, eucli_1, cosine_addition, eucli_addition, # cosine_sent, eucli_sent, ibm_l.reshape((1, nkerns[0])), ibm_r.reshape((1, nkerns[0])), #2*nkerns[0]+ cosine_ibm, eucli_ibm, len_l, len_r, wmf ], axis=1) #, layer2.output, layer1.output_cosine], axis=1) #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1) #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2) layer3 = LogisticRegression(rng, input=layer3_input, n_in=(2 * nkerns[1] + 2) + 2 + (2 * nkerns[0] + 2) + 2 + 2, n_out=2) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg = debug_print( (layer3.W**2).sum() + (U**2).sum() + (W**2).sum() + (U1**2).sum() + (W1**2).sum(), 'L2_reg' ) #+(conv_W** 2).sum()+(layer1.W** 2).sum()++(embeddings**2).sum() diversify_reg = Diversify_Reg(layer3.W.T) + Diversify_Reg( U[0]) + Diversify_Reg(W[0]) + Diversify_Reg(U1[0]) + Diversify_Reg( W1[0]) + Diversify_Reg(U[1]) + Diversify_Reg(W[1]) + Diversify_Reg( U1[1]) + Diversify_Reg(W1[1]) + Diversify_Reg( U[2]) + Diversify_Reg(W[2]) + Diversify_Reg( U1[2]) + 
Diversify_Reg(W1[2]) cost_this = debug_print(layer3.negative_log_likelihood(y), 'cost_this') #+L2_weight*L2_reg cost = debug_print((cost_this + cost_tmp) / update_freq + L2_weight * L2_reg + Div_reg * diversify_reg, 'cost') #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') test_model = theano.function( [index], [layer3.prop_for_posi, layer3_input, y], givens={ x_index_l: indices_test_l[index:index + batch_size], x_index_r: indices_test_r[index:index + batch_size], y: testY[index:index + batch_size], left_l: testLeftPad_l[index], right_l: testRightPad_l[index], left_r: testLeftPad_r[index], right_r: testRightPad_r[index], length_l: testLengths_l[index], length_r: testLengths_r[index], norm_length_l: normalized_test_length_l[index], norm_length_r: normalized_test_length_r[index], mts: mt_test[index:index + batch_size], wmf: wm_test[index:index + batch_size] }, on_unused_input='ignore') #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] params = layer3.params + layer1_para + layer0_para #+[embeddings]# + layer1.params # params_conv = [conv_W, conv_b] # accumulator=[] # for para_i in params: # eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) # accumulator.append(theano.shared(eps_p, borrow=True)) # # # create a list of gradients for all model parameters # grads = T.grad(cost, params) # # updates = [] # for param_i, grad_i, acc_i in zip(params, grads, accumulator): # grad_i=debug_print(grad_i,'grad_i') # acc = acc_i + T.sqr(grad_i) # updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad # updates.append((acc_i, acc)) def Adam(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8): updates = [] grads = T.grad(cost, params) i = theano.shared(numpy.float64(0.)) i_t = i + 1. fix1 = 1. - (1. - b1)**i_t fix2 = 1. - (1. - b2)**i_t lr_t = lr * (T.sqrt(fix2) / fix1) for p, g in zip(params, grads): m = theano.shared(p.get_value() * 0.) v = theano.shared(p.get_value() * 0.) m_t = (b1 * g) + ((1. - b1) * m) v_t = (b2 * T.sqr(g)) + ((1. 
- b2) * v) g_t = m_t / (T.sqrt(v_t) + e) p_t = p - (lr_t * g_t) updates.append((m, m_t)) updates.append((v, v_t)) updates.append((p, p_t)) updates.append((i, i_t)) return updates updates = Adam(cost=cost, params=params, lr=learning_rate) train_model = theano.function( [index, cost_tmp], cost, updates=updates, givens={ x_index_l: indices_train_l[index:index + batch_size], x_index_r: indices_train_r[index:index + batch_size], y: trainY[index:index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], mts: mt_train[index:index + batch_size], wmf: wm_train[index:index + batch_size] }, on_unused_input='ignore') train_model_predict = theano.function( [index], [cost_this, layer3.errors(y), layer3_input, y], givens={ x_index_l: indices_train_l[index:index + batch_size], x_index_r: indices_train_r[index:index + batch_size], y: trainY[index:index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], mts: mt_train[index:index + batch_size], wmf: wm_train[index:index + batch_size] }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.time() mid_time = start_time epoch = 0 done_looping = False svm_max = 0.0 best_epoch = 0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index = 0 #shuffle(train_batch_start)#shuffle training data cost_tmp = 0.0 for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index + 1 minibatch_index = minibatch_index + 1 #if epoch %2 ==0: # batch_start=batch_start+remain_train #time.sleep(0.5) # print batch_start if iter % update_freq != 0: cost_ij, error_ij, layer3_input, y = train_model_predict( batch_start) #print 'layer3_input', layer3_input cost_tmp += cost_ij error_sum += error_ij #print 'cost_acc ',cost_acc #print 'cost_ij ', cost_ij #print 'cost_tmp before update',cost_tmp else: cost_average = train_model(batch_start, cost_tmp) #print 'layer3_input', layer3_input error_sum = 0 cost_tmp = 0.0 #reset for the next batch #print 'cost_average ', cost_average #print 'cost_this ',cost_this #exit(0) #exit(0) if iter % n_train_batches == 0: print 'training @ iter = ' + str( iter) + ' average cost: ' + str( cost_average) + ' error: ' + str( error_sum) + '/' + str( update_freq) + ' error rate: ' + str( error_sum * 1.0 / update_freq) #if iter ==1: # exit(0) if iter % validation_frequency == 0: #write_file=open('log.txt', 'w') test_probs = [] test_y = [] test_features = [] for i in test_batch_start: prob_i, layer3_input, y = test_model(i) #test_losses = [test_model(i) for i in test_batch_start] test_probs.append(prob_i[0][0]) test_y.append(y[0]) test_features.append(layer3_input[0]) MAP, MRR = compute_map_mrr(rootPath + 'test_filtered.txt', test_probs) #now, check MAP and MRR print( ('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test MAP of best ' 'model %f, MRR %f') % (epoch, minibatch_index, n_train_batches, MAP, MRR)) #now, see the results of LR #write_feature=open(rootPath+'feature_check.txt', 'w') train_y = [] train_features = [] count = 0 for batch_start in train_batch_start: cost_ij, error_ij, layer3_input, y = train_model_predict( batch_start) train_y.append(y[0]) train_features.append(layer3_input[0]) #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n') #count+=1 #write_feature.close() clf = svm.SVC(C=1.0, kernel='linear') clf.fit(train_features, train_y) results_svm = clf.decision_function(test_features) MAP_svm, MRR_svm = compute_map_mrr( rootPath + 'test_filtered.txt', results_svm) lr = LinearRegression().fit(train_features, train_y) results_lr = lr.predict(test_features) MAP_lr, MRR_lr = compute_map_mrr( rootPath + 'test_filtered.txt', results_lr) print '\t\t\t\t\t\t\tSVM, MAP: ', MAP_svm, ' MRR: ', MRR_svm, ' LR: ', MAP_lr, ' MRR: ', MRR_lr if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
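# compute_map_mrr above is an external helper; this is a minimal sketch of
# the metrics it is assumed to report. Given candidate answers scored by
# the model and binary relevance labels grouped per question, MAP averages
# the precision at each relevant hit and MRR averages the reciprocal rank
# of the first relevant hit. The grouping-by-question layout here is an
# assumption for illustration.
def map_mrr(grouped):
    # grouped: list of (scores, labels) pairs, one pair per question
    aps, rrs = [], []
    for scores, labels in grouped:
        ranked = [l for _, l in sorted(zip(scores, labels),
                                       key=lambda p: -p[0])]
        hits, precisions = 0, []
        for rank, label in enumerate(ranked, start=1):
            if label == 1:
                hits += 1
                precisions.append(float(hits) / rank)
        if precisions:  # questions with no relevant answer are skipped
            aps.append(sum(precisions) / len(precisions))
            rrs.append(1.0 / (ranked.index(1) + 1))
    return sum(aps) / len(aps), sum(rrs) / len(rrs)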
def create_objectives(self, deterministic=False):
    # load network input
    X = self.inputs[0]
    x = X.flatten(2)
    # duplicate entries to take into account multiple mc samples
    n_sam = self.n_sample
    n_out = x.shape[1]
    x = x.dimshuffle(0, 'x', 1).repeat(n_sam, axis=1).reshape((-1, n_out))
    # load network
    l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \
        l_qz_mu, l_qz_logsigma, l_qa_mu, l_qa_logsigma, \
        l_qa, l_qz = self.network
    l_qa_in, l_px_in = self.input_layers
    # load network output
    qz_mu, qz_logsigma, qa_mu, qa_logsigma, a, z \
        = lasagne.layers.get_output(
            [l_qz_mu, l_qz_logsigma, l_qa_mu, l_qa_logsigma, l_qa, l_qz],
            deterministic=deterministic,
        )
    pa_mu, pa_logsigma = lasagne.layers.get_output(
        [l_pa_mu, l_pa_logsigma], {l_px_in: z},
        deterministic=deterministic,
    )
    if self.model == 'bernoulli':
        px_mu = lasagne.layers.get_output(l_px_mu, {l_px_in: z},
                                          deterministic=deterministic)
    elif self.model == 'gaussian':
        px_mu, px_logsigma = lasagne.layers.get_output(
            [l_px_mu, l_px_logsigma], {l_px_in: z},
            deterministic=deterministic,
        )
    # entropy term
    log_qa_given_x = log_normal2(a, qa_mu, qa_logsigma).sum(axis=1)
    log_qz_given_ax = log_normal2(z, qz_mu, qz_logsigma).sum(axis=1)
    log_qza_given_x = log_qz_given_ax + log_qa_given_x
    # log-probability term
    z_prior_sigma = T.cast(T.ones_like(qz_logsigma),
                           dtype=theano.config.floatX)
    z_prior_mu = T.cast(T.zeros_like(qz_mu), dtype=theano.config.floatX)
    log_pz = log_normal(z, z_prior_mu, z_prior_sigma).sum(axis=1)
    log_pa_given_z = log_normal2(a, pa_mu, pa_logsigma).sum(axis=1)
    if self.model == 'bernoulli':
        log_px_given_z = log_bernoulli(x, px_mu).sum(axis=1)
    elif self.model == 'gaussian':
        log_px_given_z = log_normal2(x, px_mu, px_logsigma).sum(axis=1)
    log_paxz = log_pa_given_z + log_px_given_z + log_pz
    # compute the evidence lower bound
    elbo = T.mean(log_paxz - log_qza_given_x)
    # we don't use a separate accuracy metric right now
    return -elbo, T.mean(qz_logsigma)
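# log_normal, log_normal2 and log_bernoulli above come from an external
# densities module; minimal sketches under the usual conventions follow
# (log_normal parameterized by sigma, log_normal2 by log variance, as in
# the parmesan library these names commonly come from). These are
# assumptions for illustration, not the project's exact implementations;
# the constant c here is local to this sketch.
import numpy as np
import theano.tensor as T

c = -0.5 * np.log(2 * np.pi)

def log_normal(x, mean, sd, eps=0.0):
    # elementwise Gaussian log-density with standard deviation sd
    return c - T.log(T.abs_(sd) + eps) - (x - mean) ** 2 / (2 * sd ** 2 + eps)

def log_normal2(x, mean, logvar, eps=0.0):
    # same density parameterized by log variance
    return c - logvar / 2 - (x - mean) ** 2 / (2 * T.exp(logvar) + eps)

def log_bernoulli(x, p, eps=1e-6):
    # elementwise Bernoulli log-density, with p clipped away from {0, 1}
    p = T.clip(p, eps, 1.0 - eps)
    return -T.nnet.binary_crossentropy(p, x)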
discriminator_params = {}
generator_params = {}
random_seed = 1234
rng = np.random.RandomState(random_seed)
srng = theano.tensor.shared_randomstreams.RandomStreams(rng.randint(999999))
#using 400/1200/10
num_hidden_discriminator = 400
num_hidden_generator = 1200
var_dimensionality = 200
#using 0.01
scale_disc = 0.05
scale_gen = 0.05
castx = lambda x: T.cast(x, theano.config.floatX)
discriminator_params["W1_d"] = theano.shared(np.asarray(
    1.0 * np.random.uniform(-1.0 * scale_disc, 1.0 * scale_disc,
                            (1, num_hidden_discriminator * 5)),
    dtype=theano.config.floatX))
discriminator_params["b1_d"] = theano.shared(np.asarray(
    0.0 + 0.0 * np.random.normal(0, 1, (5 * num_hidden_discriminator,)),
    dtype=theano.config.floatX))
discriminator_params["W2_d"] = theano.shared(np.asarray(
    1.0 * np.random.uniform(-1.0 * scale_disc, 1.0 * scale_disc,
                            (num_hidden_discriminator,
                             num_hidden_discriminator * 5)),
    dtype=theano.config.floatX))
discriminator_params["b2_d"] = theano.shared(np.asarray(
    0.0 + 0.0 * np.random.normal(0, 0.1, (5 * num_hidden_discriminator,)),
    dtype=theano.config.floatX))
discriminator_params["W3_d"] = theano.shared(np.asarray(
    1.0 * np.random.uniform(-1.0 * scale_disc, 1.0 * scale_disc,
                            (num_hidden_discriminator, 1)),
    dtype=theano.config.floatX))
discriminator_params["b3_d"] = theano.shared(np.asarray(
    0.0 * np.random.normal(0, 0.1, (1,)), dtype=theano.config.floatX))
generator_params["W1_g"] = theano.shared(np.asarray(
    1.0 * np.random.uniform(-1.0 * scale_gen, 1.0 * scale_gen,
                            (var_dimensionality, num_hidden_generator)),
    dtype=theano.config.floatX), name="W1_g")
generator_params["b1_g"] = theano.shared(np.asarray(
    0.0 + 0.0 * np.random.normal(0, 1, (num_hidden_generator,)),
    dtype=theano.config.floatX))
generator_params["W2_g"] = theano.shared(np.asarray(
    1.0 * np.random.uniform(-1.0 * scale_gen, 1.0 * scale_gen,
                            (num_hidden_generator, num_hidden_generator)),
    dtype=theano.config.floatX))
generator_params["b2_g"] = theano.shared(np.asarray(
    0.0 + 0.0 * np.random.normal(0, 1, (num_hidden_generator,)),
    dtype=theano.config.floatX))
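# The `* 5` factor on the discriminator's hidden weights suggests 5-piece
# maxout units, as in the original GAN discriminator. A sketch of the
# corresponding forward pass follows; the reshape-and-max layout is an
# assumption about how these parameters are consumed, and the shapes match
# the declarations above (note the declared input dimensionality is 1).
import theano.tensor as T

def maxout(h, num_pieces=5):
    # h: (batch, num_units * num_pieces) -> (batch, num_units)
    return T.max(h.reshape((h.shape[0], h.shape[1] // num_pieces,
                            num_pieces)), axis=2)

def discriminator(x, p):
    h1 = maxout(T.dot(x, p["W1_d"]) + p["b1_d"])
    h2 = maxout(T.dot(h1, p["W2_d"]) + p["b2_d"])
    return T.nnet.sigmoid(T.dot(h2, p["W3_d"]) + p["b3_d"])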
def recon_err_(self, v_in):
    # squared reconstruction error, averaged over the minibatch
    return T.sum((self.recon_(v_in) - v_in) ** 2) / T.cast(v_in.shape[0], fx)
def get_cost_updates(self, lr=0.1, persistent=None, k=1): """This functions implements one step of CD-k or PCD-k :param lr: learning rate used to train the RBM :param persistent: None for CD. For PCD, shared variable containing old state of Gibbs chain. This must be a shared variable of size (batch size, number of hidden units). :param k: number of Gibbs steps to do in CD-k/PCD-k Returns a proxy for the cost and the updates dictionary. The dictionary contains the update rules for weights and biases but also an update of the shared variable used to store the persistent chain, if one is used. """ # compute positive phase pre_sigmoid_ph, ph_mean, ph_sample = self.sample_h_given_v(self.input) # decide how to initialize persistent chain: # for CD, we use the newly generate hidden sample # for PCD, we initialize from the old state of the chain if persistent is None: chain_start = ph_sample else: chain_start = persistent # end-snippet-2 # perform actual negative phase # in order to implement CD-k/PCD-k we need to scan over the # function that implements one gibbs step k times. # Read Theano tutorial on scan for more information : # http://deeplearning.net/software/theano/library/scan.html # the scan will return the entire Gibbs chain ([ pre_sigmoid_nvs, nv_means, nv_samples, pre_sigmoid_nhs, nh_means, nh_samples ], updates) = theano.scan( self.gibbs_hvh, # the None are place holders, saying that # chain_start is the initial state corresponding to the # 6th output outputs_info=[None, None, None, None, None, chain_start], n_steps=k, name="gibbs_hvh") # start-snippet-3 # determine gradients on RBM parameters # note that we only need the sample at the end of the chain chain_end = nv_samples[-1] cost = T.mean(self.free_energy(self.input)) - T.mean( self.free_energy(chain_end)) # We must not compute the gradient through the gibbs sampling gparams = T.grad(cost, self.params, consider_constant=[chain_end]) # end-snippet-3 start-snippet-4 # constructs the update dictionary for gparam, param in zip(gparams, self.params): # make sure that the learning rate is of the right dtype updates[param] = param - gparam * T.cast( lr, dtype=theano.config.floatX) if persistent: # Note that this works only if persistent is a shared variable updates[persistent] = nh_samples[-1] # pseudo-likelihood is a better proxy for PCD monitoring_cost = self.get_pseudo_likelihood_cost(updates) else: # reconstruction cross-entropy is a better proxy for CD monitoring_cost = self.get_reconstruction_cost( updates, pre_sigmoid_nvs[-1]) return monitoring_cost, updates
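# sample_h_given_v and gibbs_hvh above follow the deeplearning.net RBM
# tutorial that the docstring cites; minimal sketches are reproduced here
# for reference. `theano_rng` is the RBM's RandomStreams instance, and the
# exact attribute names (W, hbias, vbias) are assumptions matching that
# tutorial.
import theano
import theano.tensor as T

def sample_h_given_v(self, v0_sample):
    pre_sigmoid_h1 = T.dot(v0_sample, self.W) + self.hbias
    h1_mean = T.nnet.sigmoid(pre_sigmoid_h1)
    h1_sample = self.theano_rng.binomial(size=h1_mean.shape, n=1,
                                         p=h1_mean,
                                         dtype=theano.config.floatX)
    return [pre_sigmoid_h1, h1_mean, h1_sample]

def sample_v_given_h(self, h0_sample):
    pre_sigmoid_v1 = T.dot(h0_sample, self.W.T) + self.vbias
    v1_mean = T.nnet.sigmoid(pre_sigmoid_v1)
    v1_sample = self.theano_rng.binomial(size=v1_mean.shape, n=1,
                                         p=v1_mean,
                                         dtype=theano.config.floatX)
    return [pre_sigmoid_v1, v1_mean, v1_sample]

def gibbs_hvh(self, h0_sample):
    # one Gibbs step starting from the hiddens, returning the 6 outputs
    # that the scan over CD-k/PCD-k expects
    pre_v1, v1_mean, v1_sample = self.sample_v_given_h(h0_sample)
    pre_h1, h1_mean, h1_sample = self.sample_h_given_v(v1_sample)
    return [pre_v1, v1_mean, v1_sample, pre_h1, h1_mean, h1_sample]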
def train_conv_net(datasets, U, lr_decay=0.95, img_w=300, filter_hs=[3, 4, 5], conv_non_linear="relu", hidden_units=[100, 3], shuffle_batch=True, n_epochs=25, sqr_norm_lim=9, non_static=True, batch_size=50, activations=[Iden], dropout_rate=[0.5]): """ Train a simple conv net img_h = sentence length (padded where necessary) img_w = word vector length (300 for word2vec) filter_hs = filter window sizes hidden_units = [x,y] x is the number of feature maps (per filter window), and y is the penultimate layer sqr_norm_lim = s^2 in the paper lr_decay = adadelta decay parameter """ rng = np.random.RandomState(3435) img_h = len(datasets[0][0]) - 1 filter_w = img_w feature_maps = hidden_units[0] filter_shapes = [] pool_sizes = [] for filter_h in filter_hs: filter_shapes.append((feature_maps, 1, filter_h, filter_w)) pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1)) parameters = [("image shape", img_h, img_w), ("filter shape", filter_shapes), ("hidden_units", hidden_units), ("dropout", dropout_rate), ("batch_size", batch_size), ("non_static", non_static), ("learn_decay", lr_decay), ("conv_non_linear", conv_non_linear), ("non_static", non_static), ("sqr_norm_lim", sqr_norm_lim), ("shuffle_batch", shuffle_batch)] print parameters #define model architecture index = T.lscalar() x = T.matrix('x') y = T.ivector('y') Words = theano.shared(value=U, name="Words") zero_vec_tensor = T.vector() zero_vec = np.zeros(img_w) set_zero = theano.function([zero_vec_tensor], updates=[ (Words, T.set_subtensor(Words[0, :], zero_vec_tensor)) ], allow_input_downcast=True) layer0_input = Words[T.cast(x.flatten(), dtype="int32")].reshape( (x.shape[0], 1, x.shape[1], Words.shape[1])) conv_layers = [] layer1_inputs = [] print 'starting loop' for i in xrange(len(filter_hs)): filter_shape = filter_shapes[i] pool_size = pool_sizes[i] conv_layer = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, img_h, img_w), filter_shape=filter_shape, poolsize=pool_size, non_linear=conv_non_linear) layer1_input = conv_layer.output.flatten(2) conv_layers.append(conv_layer) layer1_inputs.append(layer1_input) layer1_input = T.concatenate(layer1_inputs, 1) hidden_units[0] = feature_maps * len(filter_hs) classifier = MLPDropout(rng, input=layer1_input, layer_sizes=hidden_units, activations=activations, dropout_rates=dropout_rate) print 'defining params' #define parameters of the model and update functions using adadelta params = classifier.params for conv_layer in conv_layers: params += conv_layer.params if non_static: #if word vectors are allowed to change, add them as model parameters params += [Words] cost = classifier.negative_log_likelihood(y) dropout_cost = classifier.dropout_negative_log_likelihood(y) grad_updates = sgd_updates_adadelta(params, dropout_cost, lr_decay, 1e-6, sqr_norm_lim) #shuffle dataset and assign to mini batches. 
if dataset size is not a multiple of mini batches, replicate #extra data (at random) np.random.seed(3435) if datasets[0].shape[0] % batch_size > 0: extra_data_num = batch_size - datasets[0].shape[0] % batch_size train_set = np.random.permutation(datasets[0]) extra_data = train_set[:extra_data_num] new_data = np.append(datasets[0], extra_data, axis=0) else: new_data = datasets[0] new_data = np.random.permutation(new_data) n_batches = new_data.shape[0] / batch_size n_train_batches = int(np.round(n_batches * 0.9)) #divide train set into train/val sets test_set_x = datasets[1][:, :img_h] test_set_y = np.asarray(datasets[1][:, -1], "int32") train_set = new_data[:n_train_batches * batch_size, :] val_set = new_data[n_train_batches * batch_size:, :] train_set_x, train_set_y = shared_dataset( (train_set[:, :img_h], train_set[:, -1])) val_set_x, val_set_y = shared_dataset((val_set[:, :img_h], val_set[:, -1])) n_val_batches = n_batches - n_train_batches val_model = theano.function( [index], classifier.errors(y), givens={ x: val_set_x[index * batch_size:(index + 1) * batch_size], y: val_set_y[index * batch_size:(index + 1) * batch_size] }, allow_input_downcast=True) #compile theano functions to get train/val/test errors test_model = theano.function( [index], classifier.errors(y), givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }, allow_input_downcast=True) train_model = theano.function( [index], cost, updates=grad_updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }, allow_input_downcast=True) test_pred_layers = [] test_size = test_set_x.shape[0] test_layer0_input = Words[T.cast(x.flatten(), dtype="int32")].reshape( (test_size, 1, img_h, Words.shape[1])) for conv_layer in conv_layers: test_layer0_output = conv_layer.predict(test_layer0_input, test_size) test_pred_layers.append(test_layer0_output.flatten(2)) test_layer1_input = T.concatenate(test_pred_layers, 1) test_y_pred = classifier.predict(test_layer1_input) test_error = T.mean(T.neq(test_y_pred, y)) test_model_all = theano.function([x, y], test_error, allow_input_downcast=True) #start training over mini-batches print 'sizes: ' print 'test: ' print test_size print '... training' print 'n_train_batches: ' + str(n_train_batches) epoch = 0 best_val_perf = 0 val_perf = 0 test_perf = 0 cost_epoch = 0 while (epoch < n_epochs): print 'epoch: ' + str(epoch) start_time = time.time() epoch = epoch + 1 if shuffle_batch: for minibatch_index in np.random.permutation( range(n_train_batches)): if minibatch_index >= n_train_batches: minibatch_index -= 1 print 'if: minibatch_index: ' + str(minibatch_index) cost_epoch = train_model(minibatch_index) set_zero(zero_vec) else: for minibatch_index in xrange(n_train_batches): if minibatch_index >= n_train_batches: minibatch_index -= 1 print 'else: minibatch_index: ' + str(minibatch_index) cost_epoch = train_model(minibatch_index) set_zero(zero_vec) train_losses = [test_model(i) for i in xrange(n_train_batches)] train_perf = 1 - np.mean(train_losses) val_losses = [val_model(i) for i in xrange(n_val_batches)] val_perf = 1 - np.mean(val_losses) print( 'epoch: %i, training time: %.2f secs, train perf: %.2f %%, val perf: %.2f %%' % (epoch, time.time() - start_time, train_perf * 100., val_perf * 100.)) if val_perf >= best_val_perf: best_val_perf = val_perf test_loss = test_model_all(test_set_x, test_set_y) test_perf = 1 - test_loss return test_perf
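# sgd_updates_adadelta above is an external helper; a minimal sketch of the
# adadelta rule it is assumed to implement follows (keep decaying averages
# of squared gradients and squared updates, then rescale the step). The
# signature matches the positional call sites above; the max-norm clipping
# of weight columns controlled by norm_lim is omitted for brevity, so the
# real helper may differ in that respect.
import numpy as np
import theano
import theano.tensor as T

def sgd_updates_adadelta(params, cost, rho=0.95, epsilon=1e-6, norm_lim=9):
    # norm_lim is accepted for signature compatibility but unused here
    updates = []
    grads = T.grad(cost, params)
    for p, g in zip(params, grads):
        acc_g = theano.shared(np.zeros_like(p.get_value()))  # E[g^2]
        acc_u = theano.shared(np.zeros_like(p.get_value()))  # E[dx^2]
        new_acc_g = rho * acc_g + (1.0 - rho) * g ** 2
        step = -T.sqrt(acc_u + epsilon) / T.sqrt(new_acc_g + epsilon) * g
        new_acc_u = rho * acc_u + (1.0 - rho) * step ** 2
        updates.append((acc_g, new_acc_g))
        updates.append((acc_u, new_acc_u))
        updates.append((p, p + step))
    return updates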
def run_cnn(exp_name, dataset, embedding, log_fn, perf_fn, emb_dm=100, batch_size=100, filter_hs=[1, 2, 3], hidden_units=[200, 100, 11], dropout_rate=0.5, shuffle_batch=True, n_epochs=300, lr_decay=0.95, activation=ReLU, sqr_norm_lim=9, non_static=True): """ Train and Evaluate CNN event encoder model :dataset: list containing three elements[(train_x, train_y), (valid_x, valid_y), (test_x, test_y)] :embedding: word embedding with shape (|V| * emb_dm) :filter_hs: filter height for each paralle cnn layer :dropout_rate: dropout rate for full connected layers :n_epochs: the max number of iterations """ start_time = timeit.default_timer() rng = np.random.RandomState(1234) input_height = len(dataset[0][0][0][0]) num_sens = len(dataset[0][0][0]) print "--input height ", input_height input_width = emb_dm num_maps = hidden_units[0] ################### # start snippet 1 # ################### print "start to construct the model ...." x = T.tensor3("x") y = T.matrix("y") words = shared(value=np.asarray(embedding, dtype=theano.config.floatX), name="embedding", borrow=True) # define function to keep padding vector as zero zero_vector_tensor = T.vector() zero_vec = np.zeros(input_width, dtype=theano.config.floatX) set_zero = function([zero_vector_tensor], updates=[(words, T.set_subtensor(words[0, :], zero_vector_tensor))]) layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape( (x.shape[0] * x.shape[1], 1, x.shape[2], emb_dm)) conv_layers = [] layer1_inputs = [] for i in xrange(len(filter_hs)): filter_shape = (num_maps, 1, filter_hs[i], emb_dm) pool_size = (input_height - filter_hs[i] + 1, 1) conv_layer = nn.ConvPoolLayer(rng, input=layer0_input, input_shape=None, filter_shape=filter_shape, pool_size=pool_size, activation=activation) sen_vecs = conv_layer.output.reshape( (x.shape[0], x.shape[1], num_maps)) sen_vecs = sen_vecs.dimshuffle(0, 2, 1) doc_vec = T.sum(sen_vecs, axis=2).flatten(2) layer1_inputs.append(doc_vec) conv_layers.append(conv_layer) layer1_input = T.concatenate(layer1_inputs, 1) ############## # Task pop# ############## print "Construct classifier ...." 
hidden_units[0] = num_maps * len(filter_hs) pop_factor = nn.MLDropout( rng, input=layer1_input, layer_sizes=hidden_units, dropout_rates=[dropout_rate for i in range(len(hidden_units) - 1)], activations=[activation for i in range(len(hidden_units) - 1)]) pop_factor_output = pop_factor.output.dimshuffle(0, 1, 'x') pop_factor_dropout_output = pop_factor.dropout_output.dimshuffle(0, 1, 'x') ####################### # Task Type ##### ####################### type_hidden_units = [num for num in hidden_units] type_hidden_units[-1] = 5 type_factor = nn.MLDropout( rng, input=layer1_input, layer_sizes=type_hidden_units, dropout_rates=[ dropout_rate for i in range(len(type_hidden_units) - 1) ], activations=[activation for i in range(len(type_hidden_units) - 1)]) type_factor_output = type_factor.output.dimshuffle(0, 'x', 1) type_factor_dropout_output = type_factor.dropout_output.dimshuffle( 0, 'x', 1) ###################### ## Joint Y matrix ### ##################### # construct V matrix to model pop type dependency V_value = np.random.random((hidden_units[-1], type_hidden_units[-1])) V = theano.shared(value=np.asarray(V_value, dtype=theano.config.floatX), name="V", borrow=True) # compute the Joint propability joint_act = T.batched_dot(pop_factor_output, type_factor_output) + V joint_act_dropout = T.batched_dot(pop_factor_dropout_output, type_factor_dropout_output) + V joint_probs = T.nnet.softmax(joint_act.flatten(2)) joint_probs_dropout = T.nnet.softmax(joint_act_dropout.flatten(2)) neg_likelihood = -T.mean(T.log(T.sum(joint_probs * y, axis=1))) neg_likelihood_dropout = -T.mean( T.log(T.sum(joint_probs_dropout * y, axis=1))) joint_preds = T.argmax(joint_probs, axis=1) pop_preds = joint_preds // type_hidden_units[-1] type_preds = joint_preds % type_hidden_units[-1] y_index = T.argmax(y, axis=1) pop_y = y_index // type_hidden_units[-1] type_y = y_index % type_hidden_units[-1] pop_error = T.mean(T.neq(pop_preds, pop_y)) type_error = T.mean(T.neq(type_preds, type_y)) params = pop_factor.params params += type_factor.params params.append(V) for conv_layer in conv_layers: params += conv_layer.params if non_static: params.append(words) grad_updates = sgd_updates_adadelta(params, neg_likelihood_dropout, lr_decay, 1e-6, sqr_norm_lim) ##################### # Construct Dataset # ##################### print "Copy data to GPU and constrct train/valid/test func" np.random.seed(1234) train_x, train_y = shared_dataset(dataset[0]) test_x, test_y = shared_dataset(dataset[1]) n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size)) n_test_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size)) ##################### # Train model func # ##################### index = T.iscalar() train_func = function( [index], neg_likelihood_dropout, updates=grad_updates, givens={ x: train_x[index * batch_size:(index + 1) * batch_size], y: train_y[index * batch_size:(index + 1) * batch_size] }) test_pred = function( [index], [pop_error, type_error], givens={ x: test_x[index * batch_size:(index + 1) * batch_size], y: test_y[index * batch_size:(index + 1) * batch_size] }) # apply early stop strategy patience = 100 patience_increase = 2 improvement_threshold = 1.005 n_test = len(dataset[1][0]) epoch = 0 best_params = None best_validation_score = 0. 
test_perf = 0 done_loop = False log_file = open(log_fn, 'a') while (epoch < n_epochs) and not done_loop: start_time = timeit.default_timer() epoch += 1 costs = [] for minibatch_index in np.random.permutation(range(n_train_batches)): cost_epoch = train_func(minibatch_index) costs.append(cost_epoch) set_zero(zero_vec) if epoch % 5 == 0: # do test test_pop_errors = [] test_type_errors = [] for i in xrange(n_test_batches): test_pop_error, test_type_error = test_pred(i) test_pop_errors.append(test_pop_error) test_type_errors.append(test_type_error) test_pop_score = 1 - np.mean(test_pop_errors) test_type_score = 1 - np.mean(test_type_errors) message = "Epoch %d test pop perf %f, type perf %f" % ( epoch, test_pop_score, test_type_score) print message log_file.write(message + "\n") log_file.flush() end_time = timeit.default_timer() print "Finish one iteration using %f m" % ( (end_time - start_time) / 60.) log_file.flush() log_file.close()
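# The joint softmax in run_cnn flattens an (n_pop x n_type) score table, so
# a joint class index k encodes pop = k // n_type and type = k % n_type. A
# quick numeric check of that round trip, using the default sizes above
# (hidden_units[-1] = 11 pop classes, 5 type classes):
n_pop, n_type = 11, 5
for pop in range(n_pop):
    for typ in range(n_type):
        k = pop * n_type + typ
        assert (k // n_type, k % n_type) == (pop, typ)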
def shared_dataset(data_xy):
    # Store the data in shared variables so Theano can keep it on the GPU;
    # labels are stored as floatX and then cast to an int32 view for
    # indexing, since GPU shared variables must be floats.
    data_x, data_y = data_xy
    shared_x = theano.shared(numpy.asarray(data_x,
                                           dtype=theano.config.floatX))
    shared_y = theano.shared(numpy.asarray(data_y,
                                           dtype=theano.config.floatX))
    return shared_x, T.cast(shared_y, 'int32')
def log_likelihood_samplesImean_sigma2(samples, mean, logvar):
    # Diagonal-Gaussian log-density summed over the last axis. Despite the
    # name, logvar enters the algebra as log sigma (the summand 2 * logvar
    # is log sigma^2); c is assumed to be the module-level constant
    # -log(2 * pi).
    return c * T.cast(samples.shape[2], 'float32') / 2 - \
        T.sum(T.sqr((samples - mean) / T.exp(logvar)) + 2 * logvar,
              axis=2) / 2
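# A quick numeric sanity check of the density above against
# scipy.stats.norm, assuming c = -log(2 * pi) and logvar = log sigma: the
# two should agree up to floating-point error.
import numpy as np
from scipy.stats import norm

x, mu, log_sigma = 0.3, -0.1, -0.5
manual = (-np.log(2 * np.pi)) / 2 - \
    (((x - mu) / np.exp(log_sigma)) ** 2 + 2 * log_sigma) / 2
assert np.isclose(manual, norm.logpdf(x, loc=mu, scale=np.exp(log_sigma)))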
def train( dim_word=100, dim_word_src=200, enc_dim=1000, dec_dim=1000, # the number of LSTM units patience=-1, # early stopping patience max_epochs=5000, finish_after=-1, # finish after this many updates decay_c=0., # L2 regularization penalty alpha_c=0., # alignment regularization clip_c=-1., # gradient clipping threshold lrate=0.01, # learning rate n_words_src=100000, # source vocabulary size n_words=100000, # target vocabulary size maxlen=1000, # maximum length of the description maxlen_trg=1000, # maximum length of the description maxlen_sample=1000, optimizer='rmsprop', batch_size=[1, 2, 3, 4], valid_batch_size=16, sort_size=20, save_path=None, save_file_name='model', save_best_models=0, dispFreq=100, validFreq=100, saveFreq=1000, # save the parameters after every saveFreq updates sampleFreq=-1, pbatchFreq=-1, verboseFreq=10000, datasets=[ 'data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok', '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok' ], valid_datasets=[ '../data/dev/newstest2011.en.tok', '../data/dev/newstest2011.fr.tok' ], dictionaries=[ '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl', '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl' ], source_word_level=0, target_word_level=0, use_dropout=False, re_load=False, re_load_old_setting=False, uidx=None, eidx=None, cidx=None, layers=None, save_every_saveFreq=0, save_burn_in=20000, use_bpe=0, init_params=None, build_model=None, build_sampler=None, gen_sample=None, **kwargs): # Model options model_options = locals().copy() del model_options['init_params'] del model_options['build_model'] del model_options['build_sampler'] del model_options['gen_sample'] # load dictionaries and invert them # dictionaries[0] : src # dictionaries[1] : trg worddicts = [None] * len(dictionaries) worddicts_r = [None] * len(dictionaries) # ii, dd : 0 = source, 1 = target for ii, dd in enumerate(dictionaries): with open(dd, 'rb') as f: worddicts[ii] = cPickle.load(f) worddicts_r[ii] = dict() for kk, vv in worddicts[ii].iteritems(): worddicts_r[ii][vv] = kk print 'Building model' if not os.path.exists(save_path): os.makedirs(save_path) file_name = '%s%s.npz' % (save_path, save_file_name) best_file_name = '%s%s.best.npz' % (save_path, save_file_name) opt_file_name = '%s%s%s.npz' % (save_path, save_file_name, '.grads') best_opt_file_name = '%s%s%s.best.npz' % (save_path, save_file_name, '.grads') model_name = '%s%s.pkl' % (save_path, save_file_name) params = init_params(model_options) cPickle.dump(model_options, open(model_name, 'wb')) history_errs = [[], [], [], []] # reload options # reload : False if re_load and os.path.exists(file_name): print 'You are reloading your experiment.. do not panic dude..' 
if re_load_old_setting: with open(model_name, 'rb') as f: models_options = cPickle.load(f) params = load_params(file_name, params) # reload history model = numpy.load(file_name) history_errs = list(lst.tolist() for lst in model['history_errs']) if uidx is None: uidx = model['uidx'] if eidx is None: eidx = model['eidx'] if cidx is None: try: cidx = model['cidx'] except: cidx = 0 else: if uidx is None: uidx = 0 if eidx is None: eidx = 0 if cidx is None: cidx = 0 print 'Loading data' train = MultiTextIterator(source=datasets[0], target=datasets[1], source_dict=dictionaries[0], target_dict=dictionaries[1], n_words_source=n_words_src, n_words_target=n_words, source_word_level=source_word_level, target_word_level=target_word_level, batch_size=batch_size, sort_size=sort_size) valid = [ TextIterator(source=valid_dataset[0], target=valid_dataset[1], source_dict=dictionaries[0], target_dict=dictionaries[1], n_words_source=n_words_src, n_words_target=n_words, source_word_level=source_word_level, target_word_level=target_word_level, batch_size=valid_batch_size, sort_size=sort_size) for valid_dataset in valid_datasets ] # create shared variables for parameters tparams = init_tparams(params) trng, use_noise, \ x, x_mask, y, y_mask, \ opt_ret, \ cost = \ build_model(tparams, model_options) # NOTE : this is where we build the model inps = [x, x_mask, y, y_mask] print 'Building sampler...\n', f_init, f_next = build_sampler(tparams, model_options, trng, use_noise) #print 'Done' # before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=profile) # NOTE : f_log_probs : [x, x_mask, y, y_mask], cost print 'Done' if re_load: # NOTE : this whole thing is False use_noise.set_value(0.) valid_scores = [] for ii, vv in enumerate(valid): valid_errs = pred_probs(f_log_probs, prepare_data, model_options, vv, verboseFreq=verboseFreq) valid_err = valid_errs.mean() if numpy.isnan(valid_err): import ipdb ipdb.set_trace() print 'Reload sanity check: Valid ', valid_err cost = cost.mean() # apply L2 regularization on weights # decay_c : 0 if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv**2).sum() weight_decay *= decay_c cost += weight_decay # regularize the alpha weights # alpha_c : 0 if alpha_c > 0. and not model_options['decoder'].endswith('simple'): alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') alpha_reg = alpha_c * ( (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] - opt_ret['dec_alphas'].sum(0))**2).sum(1).mean() cost += alpha_reg # after all regularizers - compile the computational graph for cost print 'Building f_cost...', f_cost = theano.function(inps, cost, profile=profile) # NOTE : why is this not referenced somewhere later? 
print 'Done' print 'Computing gradient...', grads = tensor.grad(cost, wrt=itemlist(tparams)) print 'Done' if clip_c > 0: grads, not_finite, clipped = gradient_clipping(grads, tparams, clip_c) else: not_finite = 0 clipped = 0 # compile the optimizer, the actual computational graph is compiled here lr = tensor.scalar(name='lr') print 'Building optimizers...', if re_load and os.path.exists(file_name): if clip_c > 0: f_grad_shared, f_update, toptparams = eval(optimizer)( lr, tparams, grads, inps, cost=cost, not_finite=not_finite, clipped=clipped, file_name=opt_file_name) else: f_grad_shared, f_update, toptparams = eval(optimizer)( lr, tparams, grads, inps, cost=cost, file_name=opt_file_name) else: # re_load = False, clip_c = 1 if clip_c > 0: f_grad_shared, f_update, toptparams = eval(optimizer)( lr, tparams, grads, inps, cost=cost, not_finite=not_finite, clipped=clipped) else: f_grad_shared, f_update, toptparams = eval(optimizer)(lr, tparams, grads, inps, cost=cost) # f_grad_shared = theano.function(inp, [cost, not_finite, clipped], updates=gsup, profile=profile) # f_update = theano.function([lr], [], updates=updates, # on_unused_input='ignore', profile=profile) # toptparams print 'Done' print 'Optimization' best_p = None bad_counter = 0 # will never be true if validFreq == -1: validFreq = len(train[0]) / batch_size if saveFreq == -1: saveFreq = len(train[0]) / batch_size # Training loop ud_start = time.time() estop = False if re_load: # IndexError: index 14 is out of bounds for axis 1 with size 13 print "Checkpointed minibatch number: %d" % cidx for cc in xrange(cidx): if numpy.mod(cc, 1000) == 0: print "Jumping [%d / %d] examples" % (cc, cidx) train.next() for epoch in xrange(max_epochs): time0 = time.time() n_samples = 0 NaN_grad_cnt = 0 NaN_cost_cnt = 0 clipped_cnt = 0 update_idx = 0 if re_load: re_load = 0 else: cidx = 0 for x, y in train: # NOTE : x, y are [sen1, sen2, sen3 ...] where sen_i are of different length update_idx += 1 cidx += 1 uidx += 1 use_noise.set_value(1.) # NOTE : n_x <= batch_size x, x_mask, y, y_mask, n_x = prepare_data(x, y, maxlen=maxlen, maxlen_trg=maxlen_trg, n_words_src=n_words_src, n_words=n_words) n_samples += n_x if x is None: print 'Minibatch with zero sample under length ', maxlen uidx -= 1 uidx = max(uidx, 0) continue # compute cost, grads and copy grads to shared variables if clip_c > 0: cost, not_finite, clipped = f_grad_shared(x, x_mask, y, y_mask) else: cost = f_grad_shared(x, x_mask, y, y_mask) if clipped: clipped_cnt += 1 # check for bad numbers, usually we remove non-finite elements # and continue training - but not done here if numpy.isnan(cost) or numpy.isinf(cost): import ipdb ipdb.set_trace() NaN_cost_cnt += 1 if not_finite: import ipdb ipdb.set_trace() NaN_grad_cnt += 1 continue # do the update on parameters f_update(lrate) if numpy.isnan(cost) or numpy.isinf(cost): continue if float(NaN_grad_cnt) > max_epochs * 0.5 or float( NaN_cost_cnt) > max_epochs * 0.5: print 'Too many NaNs, abort training' return 1., 1., 1. 
# verbose if numpy.mod(uidx, dispFreq) == 0: ud = time.time() - ud_start wps = n_samples / float(time.time() - time0) print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'NaN_in_grad', NaN_grad_cnt,\ 'NaN_in_cost', NaN_cost_cnt, 'Gradient_clipped', clipped_cnt, 'UD ', ud, "%.2f sentence/s" % wps ud_start = time.time() if numpy.mod(uidx, pbatchFreq) == 0 and pbatchFreq != -1: pbatch(x, worddicts_r[0]) # generate some samples with the model and display them if numpy.mod(uidx, sampleFreq) == 0 and sampleFreq != -1: gen_list = [ 0, batch_size[0], batch_size[0] + batch_size[1], batch_size[0] + batch_size[1] + batch_size[2] ] gen_list = [ii for ii in gen_list if ii < n_x] for jj in gen_list: # jj = min(5, n_samples) stochastic = True use_noise.set_value(0.) # x : maxlen X n_samples sample, score = gen_sample(tparams, f_init, f_next, x[:, jj][:, None], model_options, trng=trng, k=1, maxlen=maxlen_sample, stochastic=stochastic, argmax=False) print print 'Source ', jj, ': ', if source_word_level: for vv in x[:, jj]: if vv == 0: break if vv in worddicts_r[0]: if use_bpe: print(worddicts_r[0][vv]).replace( '@@', ''), else: print worddicts_r[0][vv], else: print 'UNK', print else: source_ = [] for vv in x[:, jj]: if vv == 0: break if vv in worddicts_r[0]: source_.append(worddicts_r[0][vv]) else: source_.append('UNK') print "".join(source_) print 'Truth ', jj, ' : ', if target_word_level: for vv in y[:, jj]: if vv == 0: break if vv in worddicts_r[1]: if use_bpe: print(worddicts_r[1][vv]).replace( '@@', ''), else: print worddicts_r[1][vv], else: print 'UNK', print else: truth_ = [] for vv in y[:, jj]: if vv == 0: break if vv in worddicts_r[1]: truth_.append(worddicts_r[1][vv]) else: truth_.append('UNK') print "".join(truth_) print 'Sample ', jj, ': ', if stochastic: ss = sample else: score = score / numpy.array([len(s) for s in sample]) ss = sample[score.argmin()] if target_word_level: for vv in ss: if vv == 0: break if vv in worddicts_r[1]: if use_bpe: print(worddicts_r[1][vv]).replace( '@@', ''), else: print worddicts_r[1][vv], else: print 'UNK', print else: sample_ = [] for vv in ss: if vv == 0: break if vv in worddicts_r[1]: sample_.append(worddicts_r[1][vv]) else: sample_.append('UNK') print "".join(sample_) print # validate model on validation set and early stop if necessary if numpy.mod(uidx, validFreq) == 0: valid_scores = [] for ii, vv in enumerate(valid): use_noise.set_value(0.) # NOTE : when validation, don't pass maxlen, maxlen_trg # meaning, don't limit sentence lengths... # sort of makes sense i suppose? valid_errs = pred_probs( f_log_probs, prepare_data, model_options, vv, verboseFreq=verboseFreq, ) valid_err = valid_errs.mean() valid_scores.append(valid_err) history_errs[ii].append(valid_err) # patience == -1, never happens if len(history_errs[ii]) > patience and valid_err >= \ numpy.array(history_errs[ii])[:-patience].min() and patience != -1: bad_counter += 1 if bad_counter > patience: print 'Early Stop!' 
estop = True break if numpy.isnan(valid_err): import ipdb ipdb.set_trace() cnt = 0 for ii in xrange(4): if uidx == 0 or valid_scores[ii] <= numpy.array( history_errs[ii]).min(): cnt += 1 if len(history_errs[0]) > 1: if numpy.sum(valid_scores) <= numpy.sum( [aa[:-2] for aa in history_errs]): less_sum = True else: less_sum = False else: less_sum = True if cnt >= 2 and less_sum: best_p = unzip(tparams) best_optp = unzip(toptparams) bad_counter = 0 if saveFreq != validFreq and save_best_models: numpy.savez(best_file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cdix, **best_p) numpy.savez(best_opt_file_name, **best_optp) print 'Valid : DE {}\t CS {}\t FI {}\t RU {}'.format( valid_scores[0], valid_scores[1], valid_scores[2], valid_scores[3]) # save the best model so far if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', if not os.path.exists(save_path): os.mkdir(save_path) params = unzip(tparams) optparams = unzip(toptparams) numpy.savez(file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **params) numpy.savez(opt_file_name, **optparams) if save_every_saveFreq and (uidx >= save_burn_in): this_file_name = '%s%s.%d.npz' % (save_path, save_file_name, uidx) this_opt_file_name = '%s%s%s.%d.npz' % ( save_path, save_file_name, '.grads', uidx) numpy.savez(this_file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **params) numpy.savez(this_opt_file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **params) if best_p is not None and saveFreq != validFreq: this_best_file_name = '%s%s.%d.best.npz' % ( save_path, save_file_name, uidx) numpy.savez(this_best_file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **best_p) print 'Done...', print 'Saved to %s' % file_name # finish after this many updates if uidx >= finish_after and finish_after != -1: print 'Finishing after %d iterations!' % uidx estop = True break print 'Seen %d samples' % n_samples lang_nos = (4535523, 12122376, 1926115, 2326893) lang_done = [x * update_idx for x in batch_size] lang_rem = [x - y for x, y in zip(lang_nos, lang_done)] print "Remaining : DE({}), CS({}), FI({}), RU({})".format( lang_rem[0], lang_rem[1], lang_rem[2], lang_rem[3]) eidx += 1 if estop: break use_noise.set_value(0.) valid_scores = [] for ii, vv in enumerate(valid): valid_err = pred_probs(f_log_probs, prepare_data, model_options, vv).mean() valid_scores.append(valid_err) print 'Valid : DE {}\t CS {}\t FI {}\t RU {}'.format( valid_scores[0], valid_scores[1], valid_scores[2], valid_scores[3]) params = unzip(tparams) optparams = unzip(toptparams) file_name = '%s%s.%d.npz' % (save_path, save_file_name, uidx) opt_file_name = '%s%s%s.%d.npz' % (save_path, save_file_name, '.grads', uidx) numpy.savez(file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **params) numpy.savez(opt_file_name, **optparams) if best_p is not None and saveFreq != validFreq: best_file_name = '%s%s.%d.best.npz' % (save_path, save_file_name, uidx) best_opt_file_name = '%s%s%s.%d.best.npz' % (save_path, save_file_name, '.grads', uidx) numpy.savez(best_file_name, history_errs=history_errs, uidx=uidx, eidx=eidx, cidx=cidx, **best_p) numpy.savez(best_opt_file_name, **best_optp) return valid_err
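# gradient_clipping above is an external helper; a minimal sketch of the
# global-norm clipping it is assumed to perform follows. It returns the
# (possibly rescaled) gradients plus symbolic flags for a non-finite norm
# and for whether clipping fired, matching how the training loop consumes
# them; the real helper may differ in detail.
from theano import tensor

def gradient_clipping(grads, tparams, clip_c=1.0):
    # tparams is unused here but kept for signature compatibility
    g2 = 0.
    for g in grads:
        g2 += (g ** 2).sum()
    g_norm = tensor.sqrt(g2)
    not_finite = tensor.or_(tensor.isnan(g_norm), tensor.isinf(g_norm))
    clipped = g_norm > clip_c
    scale = tensor.switch(clipped, clip_c / g_norm, 1.)
    grads = [g * scale for g in grads]
    return grads, not_finite, clipped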
def make_training_functions(cfg, model): l_out = model['l_out'] batch_index = T.iscalar('batch_index') # bct01 X = T.TensorType('float32', [False] * 5)('X') y = T.TensorType('int32', [False] * 1)('y') out_shape = lasagne.layers.get_output_shape(l_out) #log.info('output_shape = {}'.format(out_shape)) batch_slice = slice(batch_index * cfg['batch_size'], (batch_index + 1) * cfg['batch_size']) out = lasagne.layers.get_output(l_out, X) dout = lasagne.layers.get_output(l_out, X, deterministic=True) params = lasagne.layers.get_all_params(l_out) l2_norm = lasagne.regularization.regularize_network_params( l_out, lasagne.regularization.l2) if isinstance(cfg['learning_rate'], dict): learning_rate = theano.shared(np.float32(cfg['learning_rate'][0])) else: learning_rate = theano.shared(np.float32(cfg['learning_rate'])) softmax_out = T.nnet.softmax(out) loss = T.cast(T.mean(T.nnet.categorical_crossentropy(softmax_out, y)), 'float32') pred = T.argmax(dout, axis=1) error_rate = T.cast(T.mean(T.neq(pred, y)), 'float32') reg_loss = loss + cfg['reg'] * l2_norm updates = lasagne.updates.momentum(reg_loss, params, learning_rate, cfg['momentum']) X_shared = lasagne.utils.shared_empty(5, dtype='float32') y_shared = lasagne.utils.shared_empty(1, dtype='float32') dout_fn = theano.function([X], dout) pred_fn = theano.function([X], pred) update_iter = theano.function([batch_index], reg_loss, updates=updates, givens={ X: X_shared[batch_slice], y: T.cast(y_shared[batch_slice], 'int32'), }) error_rate_fn = theano.function([batch_index], error_rate, givens={ X: X_shared[batch_slice], y: T.cast(y_shared[batch_slice], 'int32'), }) tfuncs = { 'update_iter': update_iter, 'error_rate': error_rate_fn, 'dout': dout_fn, 'pred': pred_fn, } tvars = { 'X': X, 'y': y, 'X_shared': X_shared, 'y_shared': y_shared, 'batch_slice': batch_slice, 'batch_index': batch_index, 'learning_rate': learning_rate, } return tfuncs, tvars
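# Usage sketch for the (tfuncs, tvars) pair returned above; run_one_chunk is a
# hypothetical helper, not part of the original code. It loads one chunk of
# data into the GPU-resident shared variables and takes one momentum-SGD step
# per minibatch via tfuncs['update_iter'].
def run_one_chunk(tfuncs, tvars, x_chunk, y_chunk, batch_size):
    tvars['X_shared'].set_value(x_chunk)                    # bct01 float32 array
    tvars['y_shared'].set_value(y_chunk.astype('float32'))  # cast back to int32 in givens
    losses = []
    for bi in range(x_chunk.shape[0] // batch_size):
        losses.append(tfuncs['update_iter'](bi))
    return losses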
def prior_z2(samples):
    # Log-density of a standard normal over axis 2, up to the constant `c`,
    # which is assumed to come from the enclosing scope (e.g. c = -log(2 * pi)).
    return c * T.cast(samples.shape[2], 'float32') / 2 - T.sum(T.sqr(samples), axis=2) / 2
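# Numeric sanity check of the expression above (numpy; the value of c is an
# assumption -- with c = -log(2 * pi) this is the standard-normal log-density):
import numpy as np

def prior_z2_np(samples, c=-np.log(2 * np.pi)):
    return c * samples.shape[2] / 2.0 - np.sum(np.square(samples), axis=2) / 2.0

s = np.zeros((1, 1, 3))  # one 3-d sample at the origin
assert np.allclose(prior_z2_np(s), -1.5 * np.log(2 * np.pi))  # log N(0; 0, I_3)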
def log_likelihood_sym(self, x_var, dist_info_vars):
    probs = dist_info_vars["prob"]
    # Assume layout is N * A
    return TT.log(TT.sum(probs * TT.cast(x_var, 'float32'), axis=-1) + TINY)
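# Worked example in numpy (not the original code): with one-hot actions x_var,
# the symbolic expression above reduces to log p(a_i) per row. TINY is
# redefined here for self-containment.
import numpy as np

TINY = 1e-8
probs = np.array([[0.2, 0.8], [0.5, 0.5]])
x_onehot = np.array([[0., 1.], [1., 0.]])
logli = np.log(np.sum(probs * x_onehot, axis=-1) + TINY)
assert np.allclose(logli, np.log(np.array([0.8, 0.5]) + TINY))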
def evaluate( dim_word=620, # word vector dimensionality dim=1000, # the number of LSTM units encoder='gru', decoder='gru_cond', hiero=None, decay_c=0., alpha_c=0., diag_c=0., lrate=0.01, n_words_src=20000, n_words=20000, maxlen=100, # maximum length of the description optimizer='adadelta', batch_size=128, valid_batch_size=128, # Validation and test batch size saveto='./ckt/', dataset='data_iterator', dictionary='', # word dictionary dictionary_src='', # word dictionary use_dropout=False, model=False, correlation_coeff=0.1, clip_c=1., dataset_='opensubs', use_context=False, dataset_size=-1, perplexity=True, BLEU=True): model_options = locals().copy() # Reload previous saved options if model: with open('{}.npz.pkl'.format(model), 'rb') as f: model_options = pkl.load(f) for k, v in model_options.items(): if (k == 'dim_word' or k == 'dim' or k == 'encoder' or k == 'decoder' or k == 'n_words_src' or k == 'n_words' or k == 'optimizer' or k == 'dataset' or k == 'dictionary' or k == 'dictionary_src' or k == 'dataset_' or k == 'use_context' or k == 'dim_context' or k == 'dataset_size'): locals()[k] = v if k not in locals().keys(): locals()[k] = v else: raise ValueError('No model specified') # =================== # LOAD DICTIONARIES # =================== if dictionary: with open(dictionary, 'rb') as f: word_dict = pkl.load(f) else: # Assume dictionary is in the same folder as data if dataset_ == 'opensubs': dictionary = './data/OpenSubsDS/source_train_dict.pkl' elif dataset_ == 'ubuntu': dictionary = './data/UbuntuDS/source_train_dict.pkl' else: raise ValueError('No dictionary specified.') with open(dictionary, 'rb') as f: word_dict = pkl.load(f) word_idict = dict() for kk, vv in word_dict.iteritems(): word_idict[vv] = kk if dictionary_src: with open(dictionary_src, 'rb') as f: word_dict_src = pkl.load(f) else: # Assume dictionary is in the same folder as data if dataset_ == 'opensubs': dictionary_src = './data/OpenSubsDS/source_train_dict.pkl' elif dataset_ == 'ubuntu': dictionary_src = './data/UbuntuDS/source_train_dict.pkl' else: raise ValueError('No dictionary specified.') with open(dictionary_src, 'rb') as f: word_dict_src = pkl.load(f) word_idict_src = dict() for kk, vv in word_dict_src.iteritems(): word_idict_src[vv] = kk # ======================= # LOAD MODEL PARAMETERS # ======================= print 'Loading data...' load_data, prepare_data = get_dataset(dataset) if dataset_ == 'opensubs': train, valid, test = load_data(train_batch_size=batch_size, val_batch_size=valid_batch_size, test_batch_size=valid_batch_size, use_context=use_context, dataset_size=dataset_size) elif dataset_ == 'ubuntu': train, valid, test = load_data( train_source_path='./data/UbuntuDS/source_train_idx', train_target_path='./data/UbuntuDS/target_train_idx', validation_source_path='./data/UbuntuDS/source_val_idx', validation_target_path='./data/UbuntuDS/target_val_idx', test_source_path='./data/UbuntuDS/source_test_idx', test_target_path='./data/UbuntuDS/target_test_idx', train_batch_size=batch_size, val_batch_size=valid_batch_size, test_batch_size=valid_batch_size, use_context=use_context, context_path={ 'train': './data/UbuntuDS/context_train_idx', 'validation': './data/UbuntuDS/context_val_idx', 'test': './data/UbuntuDS/context_test_idx' }, dataset_size=dataset_size) print 'Building model...' 
params = init_params(model_options) # reload parameters if model: params = load_params(model, params) else: raise ValueError('No model specified') tparams = init_tparams(params) trng, use_noise, x, x_mask, y, y_mask, conv_context, conv_context_mask, opt_ret, cost = build_model( tparams, model_options) if use_context: inps = [x, x_mask, y, y_mask, conv_context, conv_context_mask] else: inps = [x, x_mask, y, y_mask] # theano.printing.debugprint(cost.mean(), file=open('cost.txt', 'w')) print 'Building sampler...' f_init, f_next = build_sampler(tparams, model_options, trng) # Before any regularizer print 'Building f_log_probs...', f_log_probs = theano.function(inps, cost, profile=profile) print 'Done' cost = cost.mean() if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv**2).sum() weight_decay *= decay_c cost += weight_decay if alpha_c > 0. and not model_options['decoder'].endswith('simple'): alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') alpha_reg = alpha_c * ( (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] - opt_ret['dec_alphas'].sum(0))**2).sum(1).mean() cost += alpha_reg history_errs = [] # reload history if model and os.path.exists(model): history_errs = list(numpy.load(model)['history_errs']) best_p = None bad_count = 0 # after any regularizer print 'Building f_cost...', f_cost = theano.function(inps, cost, profile=profile) print 'Done' uidx = 0 estop = False save_turn = 0 ######################## # Main evaluation loop ######################## if perplexity: print('Evaluating on train') # train_err, train_perplexity = prediction_scores(f_log_probs, # prepare_data, # model_options, # train) # print('Train Cost: {} Train Perplexity: {}'.format(train_err, train_perplexity)) print('Evaluating on validation') valid_err, valid_perplexity = prediction_scores( f_log_probs, prepare_data, model_options, valid) print('Valid Cost: {} Valid Perplexity: {}'.format( valid_err, valid_perplexity)) print('Evaluating on test') test_err, test_perplexity = prediction_scores(f_log_probs, prepare_data, model_options, test) print('Test Cost: {} Test Perplexity: {}'.format( test_err, test_perplexity)) stochastic = False if BLEU: references = [] hypotheses = [] for x, y in valid: references.append([[str(i) for i in y]]) sample, score = gen_sample(tparams, f_init, f_next, x[:, None], model_options, trng=trng, k=1, maxlen=30, stochastic=stochastic, argmax=True) hypotheses.append([str(i) for i in sample]) valid_BLEU = corpus_bleu(references, hypotheses) print('Validation BLEU: {}'.format(valid_BLEU)) references = [] hypotheses = [] for x, y, conv_context in test: references.append([[str(i) for i in y]]) sample, score = gen_sample(tparams, f_init, f_next, x[:, None], model_options, trng=trng, k=1, maxlen=30, stochastic=stochastic, argmax=True) hypotheses.append([str(i) for i in sample]) test_BLEU = corpus_bleu(references, hypotheses) print('Test BLEU: {}'.format(test_BLEU)) # source_utterances is assumed to be provided by the surrounding script for i, x in enumerate(source_utterances): stochastic = False sample, score = gen_sample(tparams, f_init, f_next, x[:, None], model_options, trng=trng, k=1, maxlen=30, stochastic=stochastic, argmax=True) print('Source {}: '.format(i) + print_utterance(x, word_idict)) if stochastic: ss = sample else: score = score / numpy.array([len(s) for s in sample]) ss = sample[score.argmin()] print('Sample {}: '.format(i) + print_utterance(ss, word_idict))
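# Caveat on the option-reload loop near the top of evaluate(): in CPython,
# assigning through locals()[k] = v inside a function does not rebind local
# variables, so reloaded options are silently dropped. A minimal sketch of a
# safer pattern; merge_options is a hypothetical helper, not in the original:
def merge_options(defaults, reloaded):
    merged = dict(defaults)   # call-time defaults, e.g. locals().copy()
    merged.update(reloaded)   # values unpickled from '{}.npz.pkl'
    return merged
# ...then read settings as model_options['dim'] etc. instead of bare names.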
def setup(self, params, gparams, shapes=None, max_norm=5.0, lr=0.01, eps=1e-6, rho=0.95, method="ADADELTA", beta=0.0, count=None, weight_l2=0): # Setup only once assert not self.updates if not shapes: shapes = params if not count: count = T.constant(1, dtype=FLOATX) else: count = T.cast(count, FLOATX) gcache = [ theano.shared(np.zeros_like(param.get_value(borrow=True), dtype=FLOATX), name="gcache_%s" % param.name) for param in shapes ] gcache_mean = [g / self.batch_counter for g in gcache] optimize_updates = optimize_parameters(params, gcache_mean, shapes, max_norm, lr, eps, rho, method, beta, gsum_regularization=0.0001, weight_l2=weight_l2, clip=self.clip) self.updates.extend(optimize_updates) self.caches.extend(gcache) if self.realtime: # Realtime update needs_update = self.batch_counter >= T.constant(self.batch_size) update_dict = OrderedDict() for param, update_val in optimize_updates: update_dict[param] = ifelse(needs_update, update_val, param) for cache, g in zip(gcache, gparams): update_dict[cache] = ifelse(needs_update, g, cache + g) update_dict[self.batch_counter] = ifelse( needs_update, count, self.batch_counter + count) return update_dict.items() else: # Manual update, perhaps in the end of one iteration gcache_updates = [(c, c + g) for c, g in zip(gcache, gparams)] + [ (self.batch_counter, self.batch_counter + count) ] return gcache_updates
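# Minimal numpy illustration (not the original code) of the cache-then-update
# pattern implemented above: gradients are summed into gcache over several
# micro-batches, and the optimizer consumes the running mean
# gcache / batch_counter once a full batch has been accumulated.
import numpy as np

gcache, batch_counter = np.zeros(3), 0
for g in [np.array([1., 2., 3.]), np.array([3., 2., 1.])]:
    gcache += g           # corresponds to the gcache accumulation (cache + g)
    batch_counter += 1
mean_grad = gcache / batch_counter   # what optimize_parameters actually sees
assert np.allclose(mean_grad, [2., 2., 2.])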
for p, u in updates] if __name__ == "__main__": P = Parameters() extract, _ = model.build(P, "vrnn") X = T.tensor3('X') l = T.ivector('l') [Z_prior_mean, Z_prior_std, Z_mean, Z_std, X_mean, X_std] = extract(X, l) parameters = P.values() batch_cost = model.cost(X, Z_prior_mean, Z_prior_std, Z_mean, Z_std, X_mean, X_std, l) print "Calculating gradient..." print parameters batch_size = T.cast(X.shape[1], 'float32') gradients = T.grad(batch_cost, wrt=parameters) gradients = [g / batch_size for g in gradients] gradients = clip(5, parameters, gradients) P_learn = Parameters() updates = updates.adam(parameters, gradients, learning_rate=0.00025, P=P_learn) updates = normalise_weights(updates) print "Compiling..." train = theano.function( inputs=[X, l],
def run_cnn(exp_name, dataset, embedding, log_fn, perf_fn, emb_dm=100, batch_size=100, filter_hs=[1, 2, 3], hidden_units=[200, 100, 11], type_hidden_units=[200, 100, 6], dropout_rate=0.5, shuffle_batch=True, n_epochs=300, lr_decay=0.95, activation=ReLU, sqr_norm_lim=9, non_static=True, print_freq=5, sen_reg=False, L2=False): """ Train and Evaluate CNN event encoder model :dataset: list containing three elements [(train_x, train_y), (valid_x, valid_y), (test_x, test_y)] :embedding: word embedding with shape (|V| * emb_dm) :filter_hs: filter height for each parallel cnn layer :dropout_rate: dropout rate for fully connected layers :n_epochs: the max number of iterations """ start_time = timeit.default_timer() rng = np.random.RandomState(1234) input_height = len(dataset[0][0][0][0]) num_sens = len(dataset[0][0][0]) print "--input height ", input_height input_width = emb_dm num_maps = hidden_units[0] ################### # start snippet 1 # ################### print "start to construct the model ...." x = T.tensor3("x") type_y = T.ivector("y_type") pop_y = T.ivector("y_pop") words = shared(value=np.asarray(embedding, dtype=theano.config.floatX), name="embedding", borrow=True) # define function to keep padding vector as zero zero_vector_tensor = T.vector() zero_vec = np.zeros(input_width, dtype=theano.config.floatX) set_zero = function([zero_vector_tensor], updates=[(words, T.set_subtensor(words[0, :], zero_vector_tensor))]) layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape( (x.shape[0] * x.shape[1], 1, x.shape[2], emb_dm)) ######################### # Construct Sen Vec ##### ######################### conv_layers = [] filter_shape = (num_maps, 1, filter_hs[0], emb_dm) pool_size = (input_height - filter_hs[0] + 1, 1) conv_layer = nn.ConvPoolLayer(rng, input=layer0_input, input_shape=None, filter_shape=filter_shape, pool_size=pool_size, activation=activation) # make the sentence vector matrix sen_vecs = conv_layer.output.reshape((x.shape[0] * x.shape[1], num_maps)) conv_layers.append(conv_layer) ######################### ## Task 1: population ### ######################### pop_layer_sizes = zip(hidden_units, hidden_units[1:]) pop_layer_input = sen_vecs pop_drop_input = sen_vecs pop_hidden_outs = [] pop_drop_outs = [] pop_hidden_layers = [] pop_drop_layers = [] droprate = 0.5 for layer_size in pop_layer_sizes[:-1]: U_value = np.random.random(layer_size).astype(theano.config.floatX) b_value = np.zeros((layer_size[-1], ), dtype=theano.config.floatX) U = theano.shared(U_value, borrow=True, name="U") b = theano.shared(b_value, borrow=True, name="b") pop_hidden_layer = nn.HiddenLayer(rng, pop_layer_input, layer_size[0], layer_size[1], ReLU, U * (1 - droprate), b) pop_drop_hidden_layer = nn.DropoutHiddenLayer(rng, pop_drop_input, layer_size[0], layer_size[1], ReLU, droprate, U, b) pop_hidden_layers.append(pop_hidden_layer) pop_drop_layers.append(pop_drop_hidden_layer) pop_hidden_out = pop_hidden_layer.output pop_drop_out = pop_drop_hidden_layer.output pop_layer_input = pop_hidden_out pop_drop_input = pop_drop_out pop_hidden_outs.append(pop_hidden_out) pop_drop_outs.append(pop_drop_out) # construct pop classifier n_in, n_out = pop_layer_sizes[-1] W_value = np.random.random((n_in, n_out)).astype(theano.config.floatX) b_value = np.zeros((n_out, ), dtype=theano.config.floatX) pop_W = theano.shared(W_value, borrow=True, name="pop_W") pop_b = theano.shared(b_value, borrow=True, name="pop_b") pop_act = T.dot(pop_hidden_outs[-1], pop_W * (1 - droprate)) + pop_b pop_drop_act = T.dot(pop_drop_outs[-1],
pop_W) + pop_b sen_pop_probs = T.nnet.softmax(pop_act) sen_drop_pop_probs = T.nnet.softmax(pop_drop_act) pop_probs = T.mean(sen_pop_probs.reshape((x.shape[0], x.shape[1], n_out)), axis=1) pop_drop_probs = T.mean(sen_drop_pop_probs.reshape( (x.shape[0], x.shape[1], n_out)), axis=1) pop_y_pred = T.argmax(pop_probs, axis=1) pop_drop_y_pred = T.argmax(pop_drop_probs, axis=1) pop_neg_loglikelihood = -T.mean( T.log(pop_probs)[T.arange(pop_y.shape[0]), pop_y]) pop_drop_neg_loglikelihood = -T.mean( T.log(pop_drop_probs)[T.arange(pop_y.shape[0]), pop_y]) pop_errors = T.mean(T.neq(pop_y_pred, pop_y)) pop_errors_detail = T.neq(pop_y_pred, pop_y) pop_cost = pop_neg_loglikelihood pop_drop_cost = pop_drop_neg_loglikelihood ######################## ## Task 2: event type ## ######################## type_layer_sizes = zip(type_hidden_units, type_hidden_units[1:]) type_layer_input = sen_vecs type_drop_input = sen_vecs type_hidden_outs = [] type_drop_outs = [] type_hidden_layers = [] type_drop_layers = [] droprate = 0.5 for layer_size in type_layer_sizes[:-1]: U_value = np.random.random(layer_size).astype(theano.config.floatX) b_value = np.zeros((layer_size[-1], ), dtype=theano.config.floatX) U = theano.shared(U_value, borrow=True, name="U") b = theano.shared(b_value, borrow=True, name="b") type_hidden_layer = nn.HiddenLayer(rng, type_layer_input, layer_size[0], layer_size[1], ReLU, U * (1 - droprate), b) type_drop_hidden_layer = nn.DropoutHiddenLayer(rng, type_drop_input, layer_size[0], layer_size[1], ReLU, droprate, U, b) type_hidden_layers.append(type_hidden_layer) type_drop_layers.append(type_drop_hidden_layer) type_hidden_out = type_hidden_layer.output type_drop_out = type_drop_hidden_layer.output type_layer_input = type_hidden_out type_drop_input = type_drop_out type_hidden_outs.append(type_hidden_out) type_drop_outs.append(type_drop_out) # construct type classifier n_in, n_out = type_layer_sizes[-1] W_value = np.random.random((n_in, n_out)).astype(theano.config.floatX) b_value = np.zeros((n_out, ), dtype=theano.config.floatX) type_W = theano.shared(W_value, borrow=True, name="type_W") type_b = theano.shared(b_value, borrow=True, name="type_b") type_act = T.dot(type_hidden_outs[-1], type_W * (1 - droprate)) + type_b type_drop_act = T.dot(type_drop_outs[-1], type_W) + type_b #type_probs = T.nnet.softmax(type_max_act) #type_drop_probs = T.nnet.softmax(type_drop_max_act) sen_type_probs = T.nnet.softmax(type_act) sen_drop_type_probs = T.nnet.softmax(type_drop_act) type_probs = T.mean(sen_type_probs.reshape( (x.shape[0], x.shape[1], n_out)), axis=1) type_drop_probs = T.mean(sen_drop_type_probs.reshape( (x.shape[0], x.shape[1], n_out)), axis=1) type_y_pred = T.argmax(type_probs, axis=1) type_drop_y_pred = T.argmax(type_drop_probs, axis=1) type_neg_loglikelihood = -T.mean( T.log(type_probs)[T.arange(type_y.shape[0]), type_y]) type_drop_neg_loglikelihood = -T.mean( T.log(type_drop_probs)[T.arange(type_y.shape[0]), type_y]) type_errors = T.mean(T.neq(type_y_pred, type_y)) type_errors_detail = T.neq(type_y_pred, type_y) type_cost = type_neg_loglikelihood type_drop_cost = type_drop_neg_loglikelihood ################################## # Collect all the parameters ##### ################################## params = [] # convolution layer params for conv_layer in conv_layers: params += conv_layer.params # params for population task for layer in pop_drop_layers: params += layer.params params.append(pop_W) params.append(pop_b) # params for event type task for layer in type_drop_layers: params += layer.params
params.append(type_W) params.append(type_b) if non_static: params.append(words) total_cost = pop_cost + type_cost total_drop_cost = pop_drop_cost + type_drop_cost if L2: l2_norm = 0.1 * T.sum(pop_W**2) + 0.1 * T.sum(type_W**2) for drop_layer in type_drop_layers: l2_norm += 0.1 * T.sum(drop_layer.W**2) for drop_layer in pop_drop_layers: l2_norm += 0.1 * T.sum(drop_layer.W**2) total_cost += l2_norm total_drop_cost += l2_norm total_grad_updates = sgd_updates_adadelta(params, total_drop_cost, lr_decay, 1e-6, sqr_norm_lim) total_preds = [pop_y_pred, type_y_pred] total_errors_details = [pop_errors_detail, type_errors_detail] total_out = total_preds + total_errors_details ##################### # Construct Dataset # ##################### print "Copy data to GPU and construct train/valid/test func" np.random.seed(1234) train_x, train_pop_y, train_type_y = shared_dataset(dataset[0]) valid_x, valid_pop_y, valid_type_y = shared_dataset(dataset[1]) test_x, test_pop_y, test_type_y = shared_dataset(dataset[2]) n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size)) n_valid_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size)) n_test_batches = int(np.ceil(1.0 * len(dataset[2][0]) / batch_size)) ##################### # Train model func # ##################### index = T.iscalar() train_func = function( [index], total_drop_cost, updates=total_grad_updates, givens={ x: train_x[index * batch_size:(index + 1) * batch_size], pop_y: train_pop_y[index * batch_size:(index + 1) * batch_size], type_y: train_type_y[index * batch_size:(index + 1) * batch_size] }) # NOTE: valid_train_func also applies total_grad_updates, i.e. parameters keep being updated on the validation split; drop the updates argument if a pure validation cost is intended valid_train_func = function( [index], total_drop_cost, updates=total_grad_updates, givens={ x: valid_x[index * batch_size:(index + 1) * batch_size], pop_y: valid_pop_y[index * batch_size:(index + 1) * batch_size], type_y: valid_type_y[index * batch_size:(index + 1) * batch_size] }) test_pred_detail = function( [index], total_out, givens={ x: test_x[index * batch_size:(index + 1) * batch_size], pop_y: test_pop_y[index * batch_size:(index + 1) * batch_size], type_y: test_type_y[index * batch_size:(index + 1) * batch_size] }) # apply early stop strategy patience = 100 patience_increase = 2 improvement_threshold = 1.005 n_valid = len(dataset[1][0]) n_test = len(dataset[2][0]) epoch = 0 best_params = None best_validation_score = 0. test_perf = 0 done_loop = False log_file = open(log_fn, 'w') print "Start to train the model....."
total_score = 0.0 while (epoch < n_epochs) and not done_loop: start_time = timeit.default_timer() epoch += 1 costs = [] for minibatch_index in np.random.permutation(range(n_train_batches)): cost_epoch = train_func(minibatch_index) costs.append(cost_epoch) set_zero(zero_vec) # do validation valid_cost = [ valid_train_func(i) for i in np.random.permutation(xrange(n_valid_batches)) ] if epoch % print_freq == 0: # do test pop_preds = [] type_preds = [] pop_errors = [] type_errors = [] pop_sens = [] type_sens = [] for i in xrange(n_test_batches): test_pop_pred, test_type_pred, test_pop_error, test_type_error = test_pred_detail( i) pop_preds.append(test_pop_pred) type_preds.append(test_type_pred) pop_errors.append(test_pop_error) type_errors.append(test_type_error) pop_preds = np.concatenate(pop_preds) type_preds = np.concatenate(type_preds) pop_errors = np.concatenate(pop_errors) type_errors = np.concatenate(type_errors) pop_perf = 1 - np.mean(pop_errors) type_perf = 1 - np.mean(type_errors) # dump the predictions and the chosen sentences with open( os.path.join(perf_fn, "%s_%d.pop_pred" % (exp_name, epoch)), 'w') as epf: for p in pop_preds: epf.write("%d\n" % int(p)) with open( os.path.join(perf_fn, "%s_%d.type_pred" % (exp_name, epoch)), 'w') as epf: for p in type_preds: epf.write("%d\n" % int(p)) message = "Epoch %d test pop perf %f, type perf %f, training_cost %f" % ( epoch, pop_perf, type_perf, np.mean(costs)) print message log_file.write(message + "\n") log_file.flush() if (pop_perf + type_perf) > total_score: total_score = pop_perf + type_perf # save the model model_name = os.path.join( perf_fn, "%s_%d.best_model" % (exp_name, epoch)) with open(model_name, 'wb') as mn: for param in params: cPickle.dump(param.get_value(), mn) end_time = timeit.default_timer() print "Finish one iteration using %f min" % ( (end_time - start_time) / 60.) # output the final model params print "Output the final model" model_name = os.path.join(perf_fn, "%s_%d.final_model" % (exp_name, epoch)) with open(model_name, 'wb') as mn: for param in params: cPickle.dump(param.get_value(), mn) log_file.flush() log_file.close()
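# sgd_updates_adadelta is defined elsewhere in this codebase; assuming it
# implements standard AdaDelta (Zeiler 2012) with lr_decay playing the role of
# rho and 1e-6 as epsilon, one step for a single parameter looks like the
# following numpy sketch (an illustration, not the actual implementation):
import numpy as np

def adadelta_step(x, g, Eg2, Edx2, rho=0.95, eps=1e-6):
    Eg2 = rho * Eg2 + (1 - rho) * g ** 2             # running avg of squared grads
    dx = -np.sqrt(Edx2 + eps) / np.sqrt(Eg2 + eps) * g
    Edx2 = rho * Edx2 + (1 - rho) * dx ** 2          # running avg of squared updates
    return x + dx, Eg2, Edx2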
def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars):
    old_prob_var = old_dist_info_vars["prob"]
    new_prob_var = new_dist_info_vars["prob"]
    x_var = TT.cast(x_var, 'float32')
    # Assume layout is N * A
    return (TT.sum(new_prob_var * x_var, axis=-1) + TINY) / \
           (TT.sum(old_prob_var * x_var, axis=-1) + TINY)
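# Worked numpy example (not the original code): for one-hot x_var the
# expression above is the importance ratio pi_new(a|s) / pi_old(a|s) used in
# policy-gradient surrogate objectives.
import numpy as np

TINY = 1e-8
old = np.array([[0.5, 0.5]])
new = np.array([[0.2, 0.8]])
x = np.array([[0., 1.]])      # action 1 was taken
ratio = (np.sum(new * x, axis=-1) + TINY) / (np.sum(old * x, axis=-1) + TINY)
assert np.allclose(ratio, 0.8 / 0.5)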
def step(char_lm1, char_l, trans_probs_l):
    """Probability of going from char_lm1 to char_l using the trans_probs_l tensor"""
    # N (the number of sequences) is taken from the enclosing scope
    char_lm1 = T.cast(char_lm1, 'int32')
    char_l = T.cast(char_l, 'int32')
    return trans_probs_l[T.arange(N), char_lm1, char_l]  # N
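# What trans_probs_l[T.arange(N), char_lm1, char_l] computes, shown in numpy:
# one transition probability per sequence n, namely trans_probs_l[n, prev, cur].
import numpy as np

N, D = 2, 3
trans_probs_l = np.arange(N * D * D, dtype='float32').reshape(N, D, D)
prev = np.array([0, 2])
cur = np.array([1, 0])
out = trans_probs_l[np.arange(N), prev, cur]
assert np.allclose(out, [trans_probs_l[0, 0, 1], trans_probs_l[1, 2, 0]])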
import theano
import theano.tensor as T
import numpy as np

if __name__ == '__main__':
    x = np.asarray([[1, 2], [1, 2]], dtype='float32')
    lens = x.shape[0]
    y = np.zeros(lens, dtype='float32')
    z = np.full(lens, 1, dtype='float32')
    ll = np.asarray([1, 0], dtype='int32')
    lll = np.asarray([[1, 2, 5], [2, 3, 4]], dtype='float32')

    zero = T.vector('zero', dtype='float32')
    margin = T.vector('margin', dtype='float32')
    cos12 = T.matrix(dtype='float32')
    label = T.vector(dtype='int32')
    # T.reshape(label, (label.shape[0], 1))
    diff = T.cast(T.maximum(zero, margin - cos12[:, label]), dtype='float32')
    cost = T.sum(diff, acc_dtype='float32')
    f = theano.function([cos12, zero, margin, label], diff)
    print f(x, y, z, ll)
    print x[:, 0]
def train_qacnn( datasets, U, # pre-trained word embeddings filter_hs=[2], # filter width hidden_units=[100, 2], shuffle_batch=True, n_epochs=25, lam=0, batch_size=20, lr_decay=0.95, # for AdaDelta sqr_norm_lim=9): # for optimization """ return: a list of dicts of lists, each list contains (ansId, groundTruth, prediction) for a question """ rng = np.random.RandomState(3435) img_h = (len(datasets[0][0]) - 3) / 2 img_w = U.shape[1] lsize, rsize = img_h, img_h filter_w = img_w feature_maps = hidden_units[0] filter_shapes = [] pool_sizes = [] for filter_h in filter_hs: filter_shapes.append((feature_maps, 1, filter_h, filter_w)) pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1)) parameters = [("image shape", img_h, img_w), ("filter shape", filter_shapes), ("hidden_units", hidden_units), ("batch_size", batch_size), ("lambda", lam), ("learn_decay", lr_decay), ("sqr_norm_lim", sqr_norm_lim), ("shuffle_batch", shuffle_batch)] print parameters # define model architecture index = T.lscalar() lx = T.matrix('lx') rx = T.matrix('rx') y = T.ivector('y') Words = theano.shared(value=U, name="Words") llayer0_input = Words[T.cast(lx.flatten(), dtype="int32")].reshape( (lx.shape[0], 1, lx.shape[1], Words.shape[1])) # input: word embeddings of the mini batch rlayer0_input = Words[T.cast(rx.flatten(), dtype="int32")].reshape( (rx.shape[0], 1, rx.shape[1], Words.shape[1])) # input: word embeddings of the mini batch conv_layers = [] # layer number = filter number llayer1_inputs = [] # layer number = filter number rlayer1_inputs = [] # layer number = filter number for i in xrange(len(filter_hs)): filter_shape = filter_shapes[i] pool_size = pool_sizes[i] conv_layer = QALeNetConvPoolLayer(rng, linp=llayer0_input, rinp=rlayer0_input, filter_shape=filter_shape, poolsize=pool_size) llayer1_input = conv_layer.loutput.flatten(2) rlayer1_input = conv_layer.routput.flatten(2) conv_layers.append(conv_layer) llayer1_inputs.append(llayer1_input) rlayer1_inputs.append(rlayer1_input) llayer1_input = T.concatenate( llayer1_inputs, 1) # concatenate representations of different filters rlayer1_input = T.concatenate( rlayer1_inputs, 1) # concatenate representations of different filters hidden_units[0] = feature_maps * len(filter_hs) classifier = BilinearLR(llayer1_input, rlayer1_input, hidden_units[0], hidden_units[0]) params = classifier.params for conv_layer in conv_layers: params += conv_layer.params L2_sqr = 0. for param in params: L2_sqr += (param**2).sum() cost = classifier.get_cost(y) + lam * L2_sqr grad_updates = sgd_updates_adadelta(params, cost, lr_decay, 1e-6, sqr_norm_lim) # shuffle dataset and assign to mini batches. 
# if dataset size is not a multiple of mini batches, replicate extra data (at random) np.random.seed(3435) if datasets[0].shape[0] % batch_size > 0: extra_data_num = batch_size - datasets[0].shape[0] % batch_size train_set = np.random.permutation(datasets[0]) extra_data = train_set[:extra_data_num] new_data = np.append(datasets[0], extra_data, axis=0) else: new_data = datasets[0] new_data = np.random.permutation(new_data) n_train_batches = new_data.shape[0] / batch_size train_set, train_set_orig, val_set, test_set = new_data, datasets[ 0], datasets[1], datasets[2] train_set_lx = theano.shared(np.asarray(train_set[:, :lsize], dtype=theano.config.floatX), borrow=True) train_set_rx = theano.shared(np.asarray(train_set[:, lsize:lsize + rsize], dtype=theano.config.floatX), borrow=True) train_set_y = theano.shared(np.asarray(train_set[:, -1], dtype="int32"), borrow=True) train_set_lx_orig, train_set_rx_orig, train_set_qid_orig, train_set_aid_orig, train_set_y_orig = train_set_orig[:, :lsize], train_set_orig[:, lsize:lsize + rsize], np.asarray( train_set_orig[:, -3], dtype="int32"), np.asarray( train_set_orig[:, -2], dtype="int32"), np.asarray(train_set_orig[:, -1], dtype="int32") val_set_lx, val_set_rx, val_set_qid, val_set_aid, val_set_y = val_set[:, :lsize], val_set[:, lsize:lsize + rsize], np.asarray( val_set[:, -3], dtype="int32"), np.asarray(val_set[:, -2], dtype="int32"), np.asarray(val_set[:, -1], dtype="int32") test_set_lx, test_set_rx, test_set_qid, test_set_aid, test_set_y = test_set[:, :lsize], test_set[:, lsize:lsize + rsize], np.asarray( test_set[:, -3], dtype="int32"), np.asarray(test_set[:, -2], dtype="int32"), np.asarray(test_set[:, -1], dtype="int32") train_model = theano.function( [index], cost, updates=grad_updates, givens={ lx: train_set_lx[index * batch_size:(index + 1) * batch_size], rx: train_set_rx[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) test_lpred_layers = [] test_rpred_layers = [] test_llayer0_input = Words[T.cast(lx.flatten(), dtype="int32")].reshape( (lx.shape[0], 1, img_h, Words.shape[1])) test_rlayer0_input = Words[T.cast(rx.flatten(), dtype="int32")].reshape( (rx.shape[0], 1, img_h, Words.shape[1])) for conv_layer in conv_layers: test_llayer0_output, test_rlayer0_output = conv_layer.predict( test_llayer0_input, test_rlayer0_input) test_lpred_layers.append(test_llayer0_output.flatten(2)) test_rpred_layers.append(test_rlayer0_output.flatten(2)) test_llayer1_input = T.concatenate(test_lpred_layers, 1) test_rlayer1_input = T.concatenate(test_rpred_layers, 1) test_y_pred = classifier.predict(test_llayer1_input, test_rlayer1_input) test_model = theano.function([lx, rx], test_y_pred) #start training over mini-batches print '... 
training' epoch = 0 cost_epoch = 0 train_preds_epos, dev_preds_epos, test_preds_epos = [], [], [] while (epoch < n_epochs): epoch = epoch + 1 total_cost = 0 if shuffle_batch: for minibatch_index in np.random.permutation( range(n_train_batches)): cost_epoch = train_model(minibatch_index) total_cost += cost_epoch else: for minibatch_index in xrange(n_train_batches): cost_epoch = train_model(minibatch_index) total_cost += cost_epoch print "epoch = %d, cost = %f" % (epoch, total_cost) train_preds, dev_preds, test_preds = defaultdict(list), defaultdict( list), defaultdict(list) ypred = test_model(train_set_lx_orig, train_set_rx_orig) for i, pr in enumerate(ypred): qid, aid, y = train_set_qid_orig[i], train_set_aid_orig[ i], train_set_y_orig[i] train_preds[qid].append((aid, y, pr)) ypred = test_model(val_set_lx, val_set_rx) for i, pr in enumerate(ypred): qid, aid, y = val_set_qid[i], val_set_aid[i], val_set_y[i] dev_preds[qid].append((aid, y, pr)) ypred = test_model(test_set_lx, test_set_rx) for i, pr in enumerate(ypred): qid, aid, y = test_set_qid[i], test_set_aid[i], test_set_y[i] test_preds[qid].append((aid, y, pr)) train_preds_epos.append(train_preds) dev_preds_epos.append(dev_preds) test_preds_epos.append(test_preds) return train_preds_epos, dev_preds_epos, test_preds_epos
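# Evaluation sketch for the structures returned above; mean_reciprocal_rank is
# a hypothetical helper, not in the original code. Each returned dict maps
# qid -> [(ansId, groundTruth, prediction), ...], so per-epoch MRR can be
# computed like this.
def mean_reciprocal_rank(preds):
    rr = []
    for qid, triples in preds.items():
        ranked = sorted(triples, key=lambda t: -t[2])   # best-scored answer first
        for rank, (aid, truth, score) in enumerate(ranked, start=1):
            if truth == 1:
                rr.append(1.0 / rank)
                break
    return sum(rr) / len(rr) if rr else 0.0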
def viterbi(trans_probs):
    """Using the canvas C generate the most probable sequence using the usual
    viterbi algorithm and an input x as the observed data, generate the most
    probable latent sequence using the viterbi updates given the transition
    tensor trans_probs over all of the N given sentences

    :param trans_probs: N * max(L) * D * D tensor
    :return: characters and the probabilities for each of these
    """
    N = trans_probs.shape[0]
    D = trans_probs.shape[-1]

    # T1_0 has to be sampled differently since it has no preceding character
    # to index the row in the transition tensor
    T1_0 = trans_probs[:, 0, 0]  # N * D matrix
    T2_0 = T.zeros((N, D))  # N * D matrix

    # forward step in viterbi algorithm
    def step_forward(trans_probs_l, T1_lm1):
        T1_l = T.max(T.shape_padright(T1_lm1) * trans_probs_l, axis=1)  # N * D matrix
        T2_l = T.argmax(T.shape_padright(T1_lm1) * trans_probs_l, axis=1)  # N * D matrix
        return T.cast(T1_l, 'float32'), T.cast(T2_l, 'float32')

    ([T1, T2], _) = theano.scan(
        step_forward,
        sequences=trans_probs[:, 1:].dimshuffle((1, 0, 2, 3)),
        outputs_info=[T1_0, None],
    )  # (max(L)-1) * N * D tensors

    # concatenate initial sample with the rest to get full path
    T1 = T.concatenate([T.shape_padleft(T1_0), T1], axis=0)  # max(L) * N * D
    T2 = T.concatenate([T.shape_padleft(T2_0), T2], axis=0)  # max(L) * N * D

    char_L = T.cast(T.argmax(T1[-1], axis=1), 'float32')  # N

    # backward step in viterbi algorithm to find the actual sequence
    def step_backward(T2_lp1, char_lp1):
        char_l = T2_lp1[T.arange(N), T.cast(char_lp1, 'int32')]  # N
        return T.cast(char_l, 'float32')

    chars, _ = theano.scan(
        step_backward,
        sequences=T2[1:][::-1],
        outputs_info=[char_L],
    )  # (max(L)-1) * N
    chars = chars[::-1]  # (max(L)-1) * N
    chars = T.concatenate([chars, T.shape_padleft(char_L)], axis=0).T  # N * max(L)

    probs = get_probs(chars, trans_probs)  # N * max(L)

    return chars, probs  # N * max(L) and N * max(L)
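# An independent numpy reference for a single sequence (not part of the
# original; useful to sanity-check the scan-based version above). It follows
# the same convention: row 0 of the position-0 matrix acts as the start
# distribution, i.e. the counterpart of T1_0 = trans_probs[:, 0, 0].
import numpy as np

def viterbi_single(trans_probs):
    """trans_probs: L x D x D; entry [l, i, j] ~ p(char_l = j | char_{l-1} = i)."""
    L, D, _ = trans_probs.shape
    T1 = trans_probs[0, 0].copy()
    back = np.zeros((L, D), dtype=int)
    for l in range(1, L):
        scores = T1[:, None] * trans_probs[l]   # D x D: prev-state score * transition
        back[l] = scores.argmax(axis=0)         # best predecessor for each state
        T1 = scores.max(axis=0)
    path = [int(T1.argmax())]
    for l in range(L - 1, 0, -1):               # backtrack through the pointers
        path.append(int(back[l, path[-1]]))
    return path[::-1]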
def train(learning_rate=0.1, n_epochs=100, batch_size=320, batch_type = 'fast', mynet = 'one', representation='raw', momentum=0, history=0): rng = numpy.random.RandomState(42) trainP = 0.8 validP = 0.1 testP = 0.1 # print "... Reading cached values ..." # (trainCumLengths,validCumLengths,testCumLengths,filenames) = pickle.load(open("results/5x5.cache",'r')) print "... Getting filenames ..." datasetMY = "../MC player/20kgames9" fn1 = readGame.getFilenames(datasetMY,1,0,1)[0] random.shuffle(fn1) filenames = fn1 n = len(filenames) print "... Learning set contains " + str(n) + " games" print "... Computing cumulative game lengths ..." trainNames = filenames[:int(trainP*n)] validNames = filenames[int(trainP*n):int(trainP*n+validP*n)] testNames = filenames[int(trainP*n+validP*n):int(trainP*n+validP*n+testP*n)] random.shuffle(trainNames) trainCumLengths = readGame.getCumGameLengths(trainNames,ftype="game") validCumLengths = readGame.getCumGameLengths(validNames,ftype="game") testCumLengths = readGame.getCumGameLengths(testNames,ftype="game") fw = open("results/"+str(gs)+"x"+str(gs)+".cache","wb") pickle.dump((trainCumLengths,validCumLengths,testCumLengths,filenames),fw) fw.close() print "... Preprocessing initial batches ..." minn = batch_size / 10 +1 temp = time.time() test_batch_x, test_batch_y = utils.shared_dataset(readGame.processGAMEs(testNames[:minn],representation,gs=gs),batch_size=batch_size,board_size=gs) train_batch_x, train_batch_y = utils.shared_dataset(readGame.processGAMEs(trainNames[:minn],representation,gs=gs),batch_size=batch_size,board_size=gs) valid_batch_x, valid_batch_y = utils.shared_dataset(readGame.processGAMEs(validNames[:minn],representation,gs=gs),batch_size=batch_size,board_size=gs) print " average processing time per game: " + str((time.time()-temp)/18.0) + " seconds, per epoch: " + str(int((time.time()-temp)/18*n/60/60)) + " hours" # compute number of minibatches for training, validation and testing n_train_batches = trainCumLengths[-1] n_valid_batches = validCumLengths[-1] n_test_batches = testCumLengths[-1] n_train_batches /= batch_size n_valid_batches /= batch_size n_test_batches /= batch_size # allocate symbolic variables for the data iteration = T.lscalar() # iteration number of a minibatch x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ishape = (gs, gs) # this is the size of MNIST images fw = open("results/"+mynet+"_"+str(learning_rate)+"_"+".res","w") ###################### # BUILD ACTUAL MODEL # ###################### print '... Building the model ...' 
nc = 2 if representation=='raw' else 6 # if raw nc *= 1+history if mynet == "zero": layer0_input = x.reshape((batch_size, nc, gs, gs)) layer0 = LogisticRegression(input=layer0_input.flatten(2), n_in=nc*gs*gs, n_out=gs*gs) cost = layer0.negative_log_likelihood(y) params = layer0.params if mynet == "one": nHiddens = 500 layer1_input = x.reshape((batch_size, nc, gs, gs)) layer1 = HiddenLayer(rng, input=layer1_input.flatten(2), n_in=nc * gs * gs, n_out=nHiddens, activation=T.tanh) layer0 = LogisticRegression(input=layer1.output, n_in=nHiddens, n_out=gs*gs) cost = layer0.negative_log_likelihood(y) params = layer0.params + layer1.params # create a function to compute the mistakes that are made by the model test_model = theano.function([], layer0.errors(y), givens={ x: test_batch_x, y: T.cast(test_batch_y, 'int32')}) validate_model = theano.function([], layer0.errors(y), givens={ x: valid_batch_x, y: T.cast(valid_batch_y, 'int32')}) predictions = theano.function([], layer0.get_predictions(), givens={ x: valid_batch_x}) conditional_dist = theano.function([], layer0.get_conditional_dist(), givens={ x: valid_batch_x}) # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD. Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i],grads[i]) pairs. updates = [] #adjusted_rate = learning_rate - iteration*(learning_rate/(float(n_epochs) * n_train_batches)) # use T.switch for the learning-rate schedule: T.lt returns a symbolic (always truthy) tensor, so a Python conditional expression on it would always pick its first branch adjusted_rate = T.switch(T.lt(iteration, 3000*200), learning_rate, 0.1*learning_rate) for param_i, grad_i in zip(params, grads):#, prev_grad_i , prevGrads): updates.append((param_i, param_i - adjusted_rate * grad_i))# - momentum * prev_grad_i)) #for i,grad in enumerate(grads): # updates.append((prevGrads[i], grad)) train_model = theano.function([iteration], cost, updates=updates, givens={ x: train_batch_x, y: T.cast(train_batch_y, 'int32')},on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... Training ...' # early-stopping parameters patience = 10000 # look at this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.999 # a relative improvement of this much is # considered significant validation_frequency = 2000 # min(n_train_batches, patience / 2) # go through this many # minibatches before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False stime = time.time() while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 500 == 0: print 'training @ iter = ', iter pickle.dump((updates,cost,layer0,test_model,predictions,conditional_dist),open("results/"+str(batch_size)+representation+str(history)+".model","w")) if iter ==5: print 'estimated train time per epoch = '+ str((time.time() - stime) * n_train_batches/60.0/iter/60.0) + " hours" ax,ay = getBatch(trainNames, minibatch_index, trainCumLengths, batch_size,representation,batchType=batch_type,history=history) train_batch_x.set_value(ax) train_batch_y.set_value(ay) cost_ij = train_model(iter) if (iter + 1) % validation_frequency == 0 or iter==5: # compute zero-one loss on validation set validation_losses = [] for i in xrange(n_valid_batches): vx,vy = getBatch(validNames, i, validCumLengths, batch_size,representation,batchType='fast',history=history) valid_batch_x.set_value(vx) valid_batch_y.set_value(vy) validation_losses.append(validate_model()) this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % \ (epoch, minibatch_index + 1, n_train_batches, \ this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses=[] for i in xrange(n_test_batches): tx,ty = getBatch(testNames, i, testCumLengths, batch_size,representation,batchType='fast',history=history) test_batch_x.set_value(tx) test_batch_y.set_value(ty) test_losses.append(test_model()) test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of best ' 'model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) #fw.write("Epoch "+str(epoch) + ": " +str((1-this_validation_loss)*100.)+ "%\n") pickle.dump((updates,cost,layer0,test_model,predictions,conditional_dist),open("results/"+str(batch_size)+representation+str(history)+".model","w")) #if patience <= iter: # done_looping = True # break fw.close() end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))