Example #1
def ADAM(classifier, cost, lr, updates):
    t = theano.shared(numpy.int64(1))
    alpha = lr
    beta_1 = 0.9
    beta_2 = 0.999
    epsilon = 1e-8
    lam = 1.0 - 1e-8
    g_model_params = []
    models_m = []
    models_v = []
    for param in classifier.params:
        gparam = T.grad(cost, wrt=param)
        g_model_params.append(gparam)
        m = theano.shared(numpy.zeros(param.get_value(borrow=True).shape, dtype=theano.config.floatX))
        v = theano.shared(numpy.zeros(param.get_value(borrow=True).shape, dtype=theano.config.floatX))
        models_m.append(m)
        models_v.append(v)
    for param, gparam, m, v in zip(classifier.params, g_model_params, models_m, models_v):
        beta_1_t = T.cast(beta_1 * lam ** (t - 1), theano.config.floatX)
        updates[m] = beta_1_t * m + (1.0 - beta_1_t) * gparam
        updates[v] = beta_2 * v + (1 - beta_2) * (gparam * gparam)
        m_hat = updates[m] / (1.0 - T.cast(beta_1 ** t, theano.config.floatX))
        v_hat = updates[v] / (1.0 - T.cast(beta_2 ** t, theano.config.floatX))
        updates[param] = param - alpha * m_hat / (T.sqrt(v_hat) + epsilon)
    updates[t] = t + 1
    return updates
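
A minimal usage sketch (the TinyModel class, its parameters, and the quadratic cost below are hypothetical, invented only for illustration): ADAM is handed an updates dictionary, fills it with the moment and parameter updates, and the result is compiled with theano.function.

import collections
import numpy
import theano
import theano.tensor as T

class TinyModel(object):
    def __init__(self):
        # two toy parameters; any object with a `params` list of shared variables works
        self.W = theano.shared(numpy.zeros((3, 2), dtype=theano.config.floatX))
        self.b = theano.shared(numpy.zeros(2, dtype=theano.config.floatX))
        self.params = [self.W, self.b]

x = T.matrix('x')
model = TinyModel()
cost = T.sqr(T.dot(x, model.W) + model.b).mean()

updates = ADAM(model, cost, lr=1e-3, updates=collections.OrderedDict())
train_step = theano.function([x], cost, updates=updates)
train_step(numpy.ones((4, 3), dtype=theano.config.floatX))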
Example #2
def local_gpua_advanced_incsubtensor(node, context_name):

    # This is disabled on non-cuda contexts
    if get_context(context_name).kind != "cuda":
        return None

    x, y, ilist = node.inputs

    # GPU ops need both inputs to have the same dtype
    if x.type.dtype != y.type.dtype:
        dtype = scalar.upcast(x.type.dtype, y.type.dtype)
        if x.type.dtype != dtype:
            x = tensor.cast(x, dtype)
        if y.type.dtype != dtype:
            y = tensor.cast(y, dtype)

    set_instead_of_inc = node.op.set_instead_of_inc
    active_device_no = theano.sandbox.cuda.active_device_number()
    device_properties = theano.sandbox.cuda.device_properties

    compute_capability = device_properties(active_device_no)["major"]

    if compute_capability < 2 or x.ndim != 2 or y.ndim != 2:
        return GpuAdvancedIncSubtensor1(set_instead_of_inc=set_instead_of_inc)
    else:
        return GpuAdvancedIncSubtensor1_dev20(set_instead_of_inc=set_instead_of_inc)
Example #3
def _linspace(start, stop, num):
    # Theano linspace. Behaves similarly to np.linspace.
    start = T.cast(start, theano.config.floatX)
    stop = T.cast(stop, theano.config.floatX)
    num = T.cast(num, theano.config.floatX)
    step = (stop-start)/(num-1)
    return T.arange(num, dtype=theano.config.floatX)*step+start
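
A quick sanity check, assuming _linspace above is in scope: compile it with constant arguments and compare against np.linspace, which it is meant to mirror (endpoint included).

import numpy as np
import theano

f = theano.function([], _linspace(0.0, 1.0, 5))
print(f())                       # ~ [0.  0.25  0.5  0.75  1.]
print(np.linspace(0.0, 1.0, 5))  # same values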
Example #4
def load_data(random_state=1066, n=1000, max_phrase_length=100):
    data = utils.load_data(random_state=random_state,
                           n=n,
                           max_phrase_length=max_phrase_length)

    X_train, y_train = data[0]
    X_valid, y_valid = data[1]
    X_test, y_test = data[2]

    X_train = X_train.reshape((-1, max_phrase_length, 67)).transpose(0, 2, 1)
    X_valid = X_valid.reshape((-1, max_phrase_length, 67)).transpose(0, 2, 1)
    X_test = X_test.reshape((-1, max_phrase_length, 67)).transpose(0, 2, 1)

    # Robert: what about reshaping this data for 1D convs?
    # hstack() instead of hstack() when creating X in utils?

    return dict(
        X_train=theano.shared(lasagne.utils.floatX(X_train)),
        y_train=T.cast(theano.shared(y_train), 'int32'),
        X_valid=theano.shared(lasagne.utils.floatX(X_valid)),
        y_valid=T.cast(theano.shared(y_valid), 'int32'),
        X_test=theano.shared(lasagne.utils.floatX(X_test)),
        y_test=T.cast(theano.shared(y_test), 'int32'),
        num_examples_train=X_train.shape[0],
        num_examples_valid=X_valid.shape[0],
        num_examples_test=X_test.shape[0],
        #input_height=X_train.shape[2], # what's the equivalent in our vectors?
        #input_width=X_train.shape[3],
        output_dim=5,  # five sentiment classes
        )
Example #5
    def load_data(self):
        data = self._load_data()
        X_train, y_train = data[0]
        X_valid, y_valid = data[1]
        X_test, y_test = data[2]

        # reshape for convolutions
        X_train = X_train.reshape((X_train.shape[0], 1, 28, 28))
        X_valid = X_valid.reshape((X_valid.shape[0], 1, 28, 28))
        X_test = X_test.reshape((X_test.shape[0], 1, 28, 28))

        return dict(
            X_train=theano.shared(lasagne.utils.floatX(X_train)),
            y_train=T.cast(theano.shared(y_train), 'int32'),
            X_valid=theano.shared(lasagne.utils.floatX(X_valid)),
            y_valid=T.cast(theano.shared(y_valid), 'int32'),
            valid_set=X_valid,
            y_valid_raw=y_valid,
            X_test=theano.shared(lasagne.utils.floatX(X_test)),
            y_test=T.cast(theano.shared(y_test), 'int32'),
            num_examples_train=X_train.shape[0],
            num_examples_valid=X_valid.shape[0],
            num_examples_test=X_test.shape[0],
            input_height=X_train.shape[2],
            input_width=X_train.shape[3],
            input_dim=[X_train.shape[2],X_train.shape[3]],
            output_dim=10,
            )
Example #6
    def get_multi_loss(self, out1, out2, y1_batch, y2_batch):
        # TODO needs downsample for y
        #loss1 = pydnn.expr2d.logloss_2d( out1, T.cast(y1_batch, 'int32') )
        #loss2 = pydnn.expr2d.logloss_2d( out2, T.cast(y2_batch, 'int32') )
        loss1 = pydnn.expr2d.masked_logloss_2d(out1, T.cast(y1_batch, 'int32'))
        loss2 = pydnn.expr2d.masked_logloss_2d(out2, T.cast(y2_batch, 'int32'))
        return loss1 + loss2
Example #7
def _transform_affine(theta, input, downsample_factor):
    num_batch, num_channels, height, width = input.shape
    theta = T.reshape(theta, (-1, 2, 3))

    # grid of (x_t, y_t, 1), eq (1) in ref [1]
    out_height = T.cast(height / downsample_factor[0], 'int64')
    out_width = T.cast(width / downsample_factor[1], 'int64')
    grid = _meshgrid(out_height, out_width)

    # Transform A x (x_t, y_t, 1)^T -> (x_s, y_s)
    T_g = T.dot(theta, grid)
    x_s = T_g[:, 0]
    y_s = T_g[:, 1]
    x_s_flat = x_s.flatten()
    y_s_flat = y_s.flatten()

    # dimshuffle input to  (bs, height, width, channels)
    input_dim = input.dimshuffle(0, 2, 3, 1)
    input_transformed = _interpolate(
        input_dim, x_s_flat, y_s_flat,
        out_height, out_width)

    output = T.reshape(
        input_transformed, (num_batch, out_height, out_width, num_channels))
    output = output.dimshuffle(0, 3, 1, 2)  # dimshuffle to conv format
    return output
Example #8
    def _step(self, y_tm1, yz_t, yr_t, yh_t, y_m, s_tm1, h, x_m):
        # attention
        pctx__ = T.dot(h, self.W_ha) + T.dot(s_tm1, self.W_sa)[None, :, :]
        pctx__ = self.activation(pctx__)
        e = T.dot(pctx__, self.U_att) + self.b_att
        e = T.exp(e.reshape((e.shape[0], e.shape[1])))
        e = e / e.sum(0, keepdims=True)
        e = e * x_m
        c = (h * e[:, :, None]).sum(0)

        z = hard_sigmoid(yz_t + T.dot(s_tm1, self.U_z) + T.dot(c, self.W_cs))
        r = hard_sigmoid(yr_t + T.dot(s_tm1, self.U_r) + T.dot(c, self.W_cs))
        hh_t = self.activation(yh_t + T.dot(r * s_tm1, self.U_h) + T.dot(c, self.W_cy))
        s_t = z * s_tm1 + (1 - z) * hh_t
        s_t = (1. - y_m)[:, None] * s_tm1 + y_m[:, None] * s_t

        logit = self.activation(T.dot(s_t, self.W_hl) + T.dot(y_tm1, self.W_yl) + T.dot(c, self.W_cl))

        return T.cast(s_t, dtype=theano.config.floatX), T.cast(logit, dtype=theano.config.floatX)
Example #9
    def get_monitoring_channels(self, model, X, Y = None):
        rval = OrderedDict()

        history = model.mf(X, return_history = True)
        q = history[-1]

        if self.supervised:
            assert Y is not None
            Y_hat = q[-1]
            true = T.argmax(Y,axis=1)
            pred = T.argmax(Y_hat, axis=1)

            #true = Print('true')(true)
            #pred = Print('pred')(pred)

            wrong = T.neq(true, pred)
            err = T.cast(wrong.mean(), X.dtype)
            rval['misclass'] = err

            if len(model.hidden_layers) > 1:
                q = model.mf(X, Y = Y)
                pen = model.hidden_layers[-2].upward_state(q[-2])
                Y_recons = model.hidden_layers[-1].mf_update(state_below = pen)
                pred = T.argmax(Y_recons, axis=1)
                wrong = T.neq(true, pred)

                rval['recons_misclass'] = T.cast(wrong.mean(), X.dtype)


        return rval
Example #10
	def __init__(self, dataset_path, batch_size=500, instance_weights_path=None):
		
		L.info("Initializing dataset from: " + os.path.abspath(dataset_path))
		
		# Reading parameters from the mmap file
		fp = np.memmap(dataset_path, dtype='int32', mode='r')
		self.num_samples = fp[0]
		self.ngram = fp[1]
		fp = fp.reshape((self.num_samples + 3, self.ngram))
		self.vocab_size = fp[1,0]
		self.num_classes = fp[2,0]

		# Setting minibatch size and number of mini batches
		self.batch_size = batch_size
		self.num_batches = int(M.ceil(self.num_samples / self.batch_size))
		
		# Reading the matrix of samples
		x = fp[3:,0:self.ngram - 1]			# Reading the context indices
		y = fp[3:,self.ngram - 1]			# Reading the output word index
		self.shared_x = T.cast(theano.shared(x, borrow=True), 'int32')
		self.shared_y = T.cast(theano.shared(y, borrow=True), 'int32')
		
		self.is_weighted = False
		if instance_weights_path:
			instance_weights = np.loadtxt(instance_weights_path)
			U.xassert(instance_weights.shape == (self.num_samples,), "The number of lines in weights file must be the same as the number of samples.")
			self.shared_w = T.cast(theano.shared(instance_weights, borrow=True), theano.config.floatX)
			self.is_weighted = True
		
		L.info('  #samples: %s, ngram size: %s, vocab size: %s, #classes: %s, batch size: %s, #batches: %s' % (
				U.red(self.num_samples), U.red(self.ngram), U.red(self.vocab_size), U.red(self.num_classes), U.red(self.batch_size), U.red(self.num_batches)
			)
		)
Example #11
def binarization(W,H,binary=True,deterministic=False,stochastic=False,srng=None):
    
    # (deterministic == True) <-> test-time <-> inference-time
    if not binary or (deterministic and stochastic):
        # print("not binary")
        Wb = W
    
    else:
        
        # [-1,1] -> [0,1]
        Wb = hard_sigmoid(W/H)
        # Wb = T.clip(W/H,-1,1)
        
        # Stochastic BinaryConnect
        if stochastic:
        
            # print("stoch")
            Wb = T.cast(srng.binomial(n=1, p=Wb, size=T.shape(Wb)), theano.config.floatX)

        # Deterministic BinaryConnect (round to nearest)
        else:
            # print("det")
            Wb = T.round(Wb)
        
        # 0 or 1 -> -1 or 1
        Wb = T.cast(T.switch(Wb,H,-H), theano.config.floatX)
    
    return Wb
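
A hedged usage sketch: hard_sigmoid is not shown above, so the stand-in below (the usual BinaryConnect-style clipping) is an assumption, as are the weight shape and the MRG random stream.

import numpy as np
import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

def hard_sigmoid(x):
    # assumed stand-in for the module's helper: clip (x + 1) / 2 into [0, 1]
    return T.clip((x + 1.) / 2., 0., 1.)

W = theano.shared(np.random.uniform(-1, 1, (3, 3)).astype(theano.config.floatX))
srng = MRG_RandomStreams(1234)

Wb_det = binarization(W, H=1.0, binary=True, deterministic=True)          # round to +/- H
Wb_sto = binarization(W, H=1.0, binary=True, stochastic=True, srng=srng)  # sample +/- H
f = theano.function([], [Wb_det, Wb_sto])
print(f())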
Example #12
    def shared_dataset(data_xy, borrow=True):
        """ Function that loads the dataset into shared variables

        The reason we store our dataset in shared variables is to allow
        Theano to copy it into the GPU memory (when code is run on GPU).
        Since copying data into the GPU is slow, copying a minibatch every
        time it is needed (the default behaviour if the data is not in a
        shared variable) would lead to a large decrease in performance.
        """
        data_x, data_y = data_xy
        shared_x = theano.shared(np.asarray(data_x, dtype=theano.config.floatX),
                                 borrow=borrow)
        shared_y = theano.shared(np.asarray(data_y, dtype=theano.config.floatX),
                                 borrow=borrow)

        # one-hot encoded labels as {-1, 1}
        n_classes = len(np.unique(data_y)) # dangerous?
        y1 = -1 * np.ones((data_y.shape[0], n_classes))
        y1[np.arange(data_y.shape[0]), data_y] = 1
        shared_y1 = theano.shared(np.asarray(y1, dtype=theano.config.floatX),
                                  borrow=borrow)

        # When storing data on the GPU it has to be stored as floats
        # therefore we will store the labels as ``floatX`` as well
        # (``shared_y`` does exactly that). But during our computations
        # we need them as ints (we use labels as index, and if they are
        # floats it doesn't make sense) therefore instead of returning
        # ``shared_y`` we will have to cast it to int. This little hack
        # lets us get around this issue
        return shared_x, T.cast(shared_y, 'int32'), T.cast(shared_y1,'int32')
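
A small end-to-end sketch with random data and a toy cost (all names below are hypothetical, and shared_dataset is assumed to be reachable at module level): the int32 casts return symbolic views of the float shared variables, so label minibatches can still be supplied through givens without copying data back from the GPU.

import numpy as np
import theano
import theano.tensor as T

data_x = np.random.rand(100, 5)
data_y = np.random.randint(0, 3, size=100)
shared_x, shared_y, shared_y1 = shared_dataset((data_x, data_y))

batch_size = 10
index = T.lscalar('index')
y = T.ivector('y')
toy_cost = y.sum()  # stand-in for a real training cost
f = theano.function([index], toy_cost,
                    givens={y: shared_y[index * batch_size:(index + 1) * batch_size]})
print(f(0))  # sum of the first 10 labels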
Example #13
    def compute_hard_windows(self, image_shape, location, scale):
        # find topleft(front) and bottomright(back) corners for each patch
        a = location - 0.5 * (T.cast(self.patch_shape, theano.config.floatX) / scale)
        b = location + 0.5 * (T.cast(self.patch_shape, theano.config.floatX) / scale)

        # grow by three patch pixels
        a -= self.kernel.k_sigma_radius(self.cutoff, scale)
        b += self.kernel.k_sigma_radius(self.cutoff, scale)

        # clip to fit inside image and have nonempty window
        a = T.clip(a, 0, image_shape - 1)
        b = T.clip(b, a + 1, image_shape)

        if self.batched_window:
            # take the bounding box of all windows; now the slices
            # will have the same length for each sample and scan can
            # be avoided.  comes at the cost of typically selecting
            # more of the input.
            a = a.min(axis=0, keepdims=True)
            b = b.max(axis=0, keepdims=True)

        # make integer
        a = T.cast(T.floor(a), 'int16')
        b = T.cast(T.ceil(b), 'int16')

        return a, b
Example #14
    def forward(self,input_org,train=True,update_batch_stat=True,finetune=False):
        print "Layer/BatchNormalization"
        ldim,cdim,rdim = self._internal_shape(input_org)
        input = input_org.reshape((ldim,cdim,rdim))
        if (train):
            mean = T.mean(input, axis=(0, 2), keepdims=True )
            var = T.mean((input-mean)**2, axis=(0, 2), keepdims=True)

            if(update_batch_stat):
                finetune_N = theano.clone(self.finetune_N, share_inputs=False)
                if(finetune):
                    finetune_N.default_update = finetune_N+1
                    ratio = T.cast(1-1.0/(finetune_N+1),theano.config.floatX)
                else:
                    finetune_N.default_update = 0
                    ratio = self.moving_avg_ratio
                m = ldim*rdim
                scale = T.cast(m/(m-1.0),theano.config.floatX)
                est_mean = theano.clone(self.est_mean, share_inputs=False)
                est_var = theano.clone(self.est_var, share_inputs=False)
                est_mean.default_update = T.cast(ratio*self.est_mean + (1-ratio)*mean,theano.config.floatX)
                est_var.default_update = T.cast(ratio*self.est_var + (1-ratio)*scale*var,theano.config.floatX)
                mean += 0 * est_mean
                var += 0 * est_var
            output = self._pbc(self.gamma) * (input - self._pbc(mean)) \
                     / T.sqrt(1e-6+self._pbc(var)) + self._pbc(self.beta)

        else:
            output = self._pbc(self.gamma) * (input - self._pbc(self.est_mean)) \
                     / T.sqrt(1e-6+self._pbc(self.est_var)) + self._pbc(self.beta)

        return output.reshape(input_org.shape)
Example #15
    def get_cost_grads_updates(self, x):

        ha, h = self.network.propup(x, noisestd=self.train_hypers['noise_std'])
        q = 0.9*self.q + 0.1*h.mean(axis=0)

        ### get correlation matrix for examples
        # C = T.dot(x.T, h) / x.shape[0]
        x_std = x.std(axis=0)
        h_std = h.std(axis=0)
        xz = (x - x.mean(0)) / (x.std(0) + 1e-2)
        hz = (h - h.mean(0)) / (h.std(0) + 1e-2)
        # C = T.dot(xz.T, hz) / x.shape[0]
        C = T.dot(xz.T, hz)

        lamb = T.cast(self.train_hypers['lamb'], self.dtype)
        rho = T.cast(self.train_hypers['rho'], self.dtype)
        # cost = (C**2).sum() + lamb*(T.abs_(q - rho)).sum()
        # cost = (C**2).sum() / x.shape[0]**2 + lamb*(T.abs_(q - rho)).sum()
        cost = (C**2).sum() / x.shape[0]**2 + lamb*((q - rho)**2).sum()

        # lamb = T.cast(self.train_hypers['lamb'], self.dtype)
        # rho = T.cast(self.train_hypers['rho'], self.dtype)
        # cost = ((x - y)**2).mean(axis=0).sum() + lamb*(T.abs_(q - rho)).sum()

        updates = {self.q: q}
        return cost, self.grads(cost), updates
Example #16
def local_gpua_advanced_incsubtensor(node, context_name):
    context = get_context(context_name)
    # This is disabled on non-cuda contexts
    if context.kind != 'cuda':
        return None

    x, y, ilist = node.inputs

    # GPU ops need both inputs to have the same dtype
    if (x.type.dtype != y.type.dtype):
        dtype = scalar.upcast(x.type.dtype, y.type.dtype)
        if x.type.dtype != dtype:
            x = tensor.cast(x, dtype)
        if y.type.dtype != dtype:
            y = tensor.cast(y, dtype)

    set_instead_of_inc = node.op.set_instead_of_inc

    compute_capability = int(context.bin_id[-2])

    if (compute_capability < 2 or x.ndim != 2 or y.ndim != 2):
        return GpuAdvancedIncSubtensor1(
            set_instead_of_inc=set_instead_of_inc)
    else:
        return GpuAdvancedIncSubtensor1_dev20(
            set_instead_of_inc=set_instead_of_inc)
Example #17
def test_elemwise_composite_float64():
    # Test that we don't fuse composite elemwise with float64 somewhere
    # inside: nvcc by default downcasts them to float32. We would need to
    # tell it not to do so, but that is possible only on some devices.
    a = tensor.fmatrix()
    b = tensor.fmatrix()
    av = theano._asarray(numpy.random.rand(4, 4), dtype='float32')
    bv = numpy.ones((4, 4), dtype='float32')

    def get_all_basic_scalar(composite_op):
        l = []
        for i in composite_op.env.toposort():
            if isinstance(i, theano.scalar.Composite):
                l += get_all_basic_scalar(i)
            else:
                l.append(i)
        return l
    for mode in [mode_with_gpu, mode_with_gpu.excluding('gpu_after_fusion'),
                 mode_with_gpu.excluding('elemwise_fusion')]:
        f = pfunc([a, b],
                  tensor.cast(tensor.lt(tensor.cast(a, 'float64') ** 2,
                                               b),
                                     'float32'), mode=mode)

        out = f(av, bv)
        assert numpy.all(out == ((av ** 2) < bv))
        for node in f.maker.env.toposort():
            if isinstance(node.op, cuda.GpuElemwise):
                if isinstance(node.op.scalar_op, theano.scalar.Composite):
                    scals = get_all_basic_scalar(node.op.scalar_op)
                    for s in scals:
                        assert not any([i.type.dtype == 'float64'
                                        for i in s.inputs + s.outputs])
Example #18
    def f1_score(self, y, labels=[0, 2]):
      """
      Mean F1 score between two classes (positive and negative as specified by the labels array).
      """
      y_tr = y
      y_pr = self.y_pred

      correct = T.eq(y_tr, y_pr)
      wrong = T.neq(y_tr, y_pr)

      label = labels[0]
      tp_neg = T.sum(correct * T.eq(y_tr, label))
      fp_neg = T.sum(wrong * T.eq(y_pr, label))
      fn_neg = T.sum(T.eq(y_tr, label) * T.neq(y_pr, label))
      tp_neg = T.cast(tp_neg, theano.config.floatX)
      prec_neg = tp_neg / T.maximum(1, tp_neg + fp_neg)
      recall_neg = tp_neg / T.maximum(1, tp_neg + fn_neg)
      f1_neg = 2. * prec_neg * recall_neg / T.maximum(1, prec_neg + recall_neg)

      label = labels[1]
      tp_pos = T.sum(correct * T.eq(y_tr, label))
      fp_pos = T.sum(wrong * T.eq(y_pr, label))
      fn_pos = T.sum(T.eq(y_tr, label) * T.neq(y_pr, label))
      tp_pos = T.cast(tp_pos, theano.config.floatX)
      prec_pos = tp_pos / T.maximum(1, tp_pos + fp_pos)
      recall_pos = tp_pos / T.maximum(1, tp_pos + fn_pos)
      f1_pos = 2. * prec_pos * recall_pos / T.maximum(1, prec_pos + recall_pos)

      return 0.5 * (f1_pos + f1_neg) * 100
Example #19
    def __init__(self, input, image_shape, cropsize, rand, mirror, flag_rand):
        '''
        The random mirroring and cropping in this function is done for the
        whole batch.
        '''

        # trick for random mirroring
        mirror = input[:, :, ::-1, :]
        input = T.concatenate([input, mirror], axis=0)

        # crop images
        center_margin = (image_shape[2] - cropsize) / 2

        if flag_rand:
            mirror_rand = T.cast(rand[2], 'int32')
            crop_xs = T.cast(rand[0] * center_margin * 2, 'int32')
            crop_ys = T.cast(rand[1] * center_margin * 2, 'int32')
        else:
            mirror_rand = 0
            crop_xs = center_margin
            crop_ys = center_margin

        self.output = input[mirror_rand * 3:(mirror_rand + 1) * 3, :, :, :]
        self.output = self.output[
            :, crop_xs:crop_xs + cropsize, crop_ys:crop_ys + cropsize, :]

        print "data layer with shape_in: " + str(image_shape)
Example #20
  def cost(self):
    known_grads = None
    xd = self.z.reshape((self.z.shape[0]*self.z.shape[1],self.z.shape[2]))
    epsilon = numpy.float32(1e-10)
    # cross-entropy
    nll, _ = T.nnet.crossentropy_softmax_1hot(x=xd[self.i], y_idx=self.y_data_flat[self.i])
    ce = T.sum(nll)
    # entropy
    def entropy(p, axis=None):
      if self.use_max and axis is not None:
        q = p.dimshuffle(axis, *(range(axis) + range(axis+1,p.ndim)))
        #return -T.mean(T.log(T.maximum(T.max(q,axis=0),epsilon)))
        return -T.mean(T.max(q,axis=0)+epsilon) + T.log(T.cast(p.shape[axis],'float32'))
      else:
        return -T.mean(p*T.log(p+epsilon)) + T.log(T.cast(p.shape[axis],'float32'))
    ez = T.exp(self.z) * T.cast(self.index.dimshuffle(0,1,'x').repeat(self.z.shape[2],axis=2), 'float32')
    et = entropy(ez / T.maximum(epsilon,T.sum(ez,axis=0,keepdims=True)),axis=0)
    eb = entropy(ez / T.maximum(epsilon,T.sum(ez,axis=1,keepdims=True)),axis=1)
    ed = entropy(ez / T.maximum(epsilon,T.sum(ez,axis=2,keepdims=True)),axis=2)
    # maximize entropy across T and B and minimize entropy across D
    e = self.e_d * ed - (self.e_t * et + self.e_b * eb) / numpy.float32(self.e_t + self.e_b)

    import theano.ifelse
    if self.train_flag:
      return theano.ifelse.ifelse(T.cast(self.xflag,'int8'),e,ce), known_grads
    else:
      return ce, known_grads
Example #21
 def entropy(p, axis=None):
   if self.use_max and axis is not None:
     q = p.dimshuffle(axis, *(range(axis) + range(axis+1,p.ndim)))
     #return -T.mean(T.log(T.maximum(T.max(q,axis=0),epsilon)))
     return -T.mean(T.max(q,axis=0)+epsilon) + T.log(T.cast(p.shape[axis],'float32'))
   else:
     return -T.mean(p*T.log(p+epsilon)) + T.log(T.cast(p.shape[axis],'float32'))
Example #22
    def compute_crop_matrices(self, locations, scales, Is):
        Ws = []
        for axis in xrange(self.n_spatial_dims):
            m = T.cast(self.image_shape[axis], 'float32')
            n = T.cast(self.patch_shape[axis], 'float32')
            I = Is[axis].dimshuffle('x', 0, 'x')    # (1, hardcrop_dim, 1)
            J = T.arange(n).dimshuffle('x', 'x', 0) # (1, 1, patch_dim)

            location = locations[:, axis].dimshuffle(0, 'x', 'x')   # (batch_size, 1, 1)
            scale    = scales   [:, axis].dimshuffle(0, 'x', 'x')   # (batch_size, 1, 1)

            # map patch index into image index space
            J = (J - 0.5*n) / scale + location                      # (batch_size, 1, patch_dim)

            # compute squared distances between image index and patch
            # index in the current dimension:
            #   dx**2 = (i - j)*(i - j)
            #               where i is image index
            #                     j is patch index mapped into image space
            #         = i**2 + j**2 -2ij
            #         = I**2 + J**2 -2IJ'  for all i,j in one swoop

            IJ = I * J                # (batch_size, hardcrop_dim, patch_dim)
            dx2 = I**2 + J**2 - 2*IJ  # (batch_size, hardcrop_dim, patch_dim)

            Ws.append(self.kernel.density(dx2, scale))
        return Ws
Example #23
	def resample_step(self):
		
		idx=self.theano_rng.multinomial(pvals=T.reshape(self.weights_now,(1,self.npcl))).T
		s_samp=T.sum(self.s_now*T.addbroadcast(idx,1),axis=0)
		h_samp=T.sum(self.h_now*T.addbroadcast(idx,1),axis=0)
		
		return T.cast(s_samp,'float32'), T.cast(h_samp,'float32')
Example #24
def _transform(theta, input, downsample_factor):
    num_batch, num_channels, height, width = input.shape
    theta = T.reshape(theta, (-1, 1))

    # grid of (x_t, y_t, 1), eq (1) in ref [1]
    out_height = T.cast(height / downsample_factor[0], 'int64')
    out_width = T.cast(width / downsample_factor[1], 'int64')
    grid = _meshgrid(out_height, out_width)
   
    zeros = T.zeros_like(theta)
    padded_theta = T.concatenate([theta, zeros], axis=1)
    T_g = padded_theta.dimshuffle(0, 1, 'x') + grid.dimshuffle('x', 0, 1)

    x_s = T_g[:, 0]
    y_s = T_g[:, 1]
    x_s_flat = x_s.flatten()
    y_s_flat = y_s.flatten()

    # dimshuffle input to  (bs, height, width, channels)
    input_dim = input.dimshuffle(0, 2, 3, 1)
    input_transformed = _interpolate(
        input_dim, x_s_flat, y_s_flat,
        out_height, out_width)

    output = T.reshape(
        input_transformed, (num_batch, out_height, out_width, num_channels))
    output = output.dimshuffle(0, 3, 1, 2)  # dimshuffle to conv format
    return output
Example #25
    def __build_backprop(self):

        y_init = self.outside_world.y_data_one_hot                    # initialize y=y_data
        h_init = my_op(2 * (T.dot(rho(y_init), self.W2.T) + self.bh)) # initialize h by backward propagation
        x_init = my_op(T.dot(rho(h_init), self.W1.T) + self.bx)       # initialize x by backward propagation

        Delta_y = y_init - self.y
        Delta_h = h_init - self.h
        Delta_x = x_init - self.x

        by_dot = T.mean(Delta_y, axis=0)
        W2_dot = T.dot(self.rho_h.T, Delta_y) / T.cast(self.x.shape[0], dtype=theano.config.floatX)
        bh_dot = T.mean(Delta_h, axis=0)
        W1_dot = T.dot(self.rho_x.T, Delta_h) / T.cast(self.x.shape[0], dtype=theano.config.floatX)
        bx_dot = T.mean(Delta_x, axis=0)

        alpha  = T.fscalar('alpha')
        by_new = self.by + alpha * by_dot
        W2_new = self.W2 + alpha * W2_dot
        bh_new = self.bh + alpha * bh_dot
        W1_new = self.W1 + alpha * W1_dot
        bx_new = self.bx + alpha * bx_dot
        
        updates_states = [(self.x, x_init), (self.h, h_init), (self.y, y_init)]
        updates_params = [(self.by, by_new), (self.W2, W2_new), (self.bh, bh_new), (self.W1, W1_new)]

        backprop = theano.function(
            inputs=[alpha],
            outputs=[],
            updates=updates_states+updates_params
        )

        return backprop
Example #26
def compute_f_mu(x, t, params):
	[centers, spreads, biases, M, b]=params
	diffs=x.dimshuffle(0,1,2,'x')-centers.dimshuffle('x','x',0,1)
	scaled_diffs=(diffs**2)*T.exp(spreads).dimshuffle('x','x',0,1)
	exp_terms=T.sum(scaled_diffs,axis=2)+biases.dimshuffle('x','x',0)*0.0
	h=T.exp(-exp_terms)
	sumact=T.sum(h,axis=2)
	#Normalization
	hnorm=h/sumact.dimshuffle(0,1,'x')
	z=T.dot(hnorm,M)
	z=T.reshape(z,(t.shape[0],t.shape[1],ntgates,nx))+b.dimshuffle('x','x',0,1) #nt by nb by ntgates by nx
	#z=z+T.reshape(x,(t.shape[0],t.shape[1],1,nx))
	
	tpoints=T.cast(T.arange(ntgates),'float32')/T.cast(ntgates-1,'float32')
	tpoints=T.reshape(tpoints, (1,1,ntgates))
	#tgating=T.exp(T.dot(t,muWT)+mubT) #nt by nb by ntgates
	tgating=T.exp(-kT*(tpoints-t)**2)
	tgating=tgating/T.reshape(T.sum(tgating, axis=2),(t.shape[0], t.shape[1], 1))
	tgating=T.reshape(tgating,(t.shape[0],t.shape[1],ntgates,1))
	
	mult=z*tgating
	
	out=T.sum(mult,axis=2)
	
	#out=out+x
	
	return T.cast(out,'float32')
Example #27
    def _step(self, x_tm1, u_tm1, inputs, x_prior, u_prior, *args):
        # x_prior are previous states
        # u_prior are causes from above
        outputs = self.activation(T.dot(x_tm1, self.W))
        rec_error = T.sqr(inputs - outputs).sum()
        causes = (1 + T.exp(-T.dot(u_tm1, self.V))) * .5

        if self.pool_flag:
            batch_size = inputs.shape[0]
            dim = causes.shape[1]
            imgs = T.cast(T.sqrt(dim), 'int64')
            causes_up = causes.reshape(
                (batch_size, 1, imgs, imgs)).repeat(
                    self.pool_size, axis=2).repeat(self.pool_size,
                                                   axis=3).flatten(ndim=2)
        else:
            causes_up = causes

        x = _IstaStep(rec_error, x_tm1, lambdav=self.gamma*causes_up,
                      x_prior=x_prior)

        if self.pool_flag:
            dim = T.cast(T.sqrt(x.shape[1]), 'int64')
            x_pool = x.reshape((batch_size, 1, dim, dim))
            x_pool = max_pool_2d(x_pool, ds=(self.pool_size, )*2).flatten(ndim=2)
        else:
            x_pool = x

        prev_u_cost = .01 * self.gamma * T.sqr(u_tm1-u_prior).sum()
        u_cost = causes * abs(x_pool) * self.gamma + prev_u_cost
        u = _IstaStep(u_cost.sum(), u_tm1, lambdav=self.gamma)
        causes = (1 + T.exp(-T.dot(u, self.V))) * .5
        u_cost = causes * abs(x_pool) * self.gamma

        return (x, u, u_cost, outputs)
Example #28
    def get_monitoring_channels(self, model, data, **kwargs):
        rval = OrderedDict()

        space, sources = self.get_data_specs(model)
        X_data, X_condition = data
        m = X_data.shape[space.get_batch_axis()]

        G, D = model.generator, model.discriminator

        # Compute false negatives w/ empirical samples
        y_hat = D.fprop((X_data, X_condition))
        rval["false_negatives"] = T.cast((y_hat < 0.5).mean(), "float32")

        # Compute false positives w/ generated sample
        G_conditional_data = self.condition_distribution.sample(m)
        samples = G.sample(G_conditional_data)
        y_hat = D.fprop((samples, G_conditional_data))
        rval["false_positives"] = T.cast((y_hat > 0.5).mean(), "float32")

        # y = T.alloc(0., m, 1)
        cost = D.cost_from_X(((samples, G_conditional_data), y_hat))
        sample_grad = T.grad(-cost, samples)
        rval["sample_grad_norm"] = T.sqrt(T.sqr(sample_grad).sum())

        _S, d_obj, g_obj, i_obj = self.get_samples_and_objectives(model, data)
        if model.monitor_inference and i_obj != 0:
            rval["objective_i"] = i_obj
        if model.monitor_discriminator:
            rval["objective_d"] = d_obj
        if model.monitor_generator:
            rval["objective_g"] = g_obj

        rval["now_train_generator"] = self.now_train_generator
        return rval
Example #29
def loss(x_0, n, t, params):
	muparams=params[:5]
	covparams=params[5:10]
	tpoints=T.cast(T.arange(nsteps),'float32')/T.cast(nsteps,'float32')
	betas=compute_betas(params[-1],tpoints)
	
	def step(nt, bt, xt):
		mean=xt*T.sqrt(1.0-bt)
		xnew=T.cast(mean+T.sqrt(bt)*nt,'float32')
		losst=T.cast(0.5*T.mean(T.sum((((mean-xnew)**2)/bt+T.log(np.pi*2.0*bt)),axis=1)),'float32')
		return xnew, losst
	
	[xhist, fwdlosshist],updates=theano.scan(fn=step,
								outputs_info=[x_0, None],
								sequences=[n, betas],
								n_steps=nsteps)
	
	
	forward_loss=-T.mean(fwdlosshist)+0.5*T.mean(T.sum((xhist[-1]**2+T.log(np.pi*2.0)),axis=1))
	
	#f_mu=compute_f_mu(xhist,t,muparams)
	#f_cov=compute_f_cov(xhist,t,covparams)
	#diffs=(f_mu[2:]-xhist[:-1])**2
	#gaussian_terms=T.sum(diffs*(1.0/f_cov[1:].dimshuffle(0,1,'x')),axis=2)
	#det_terms=T.sum(T.log(f_cov[1:].dimshuffle(0,1,'x')),axis=2)
	
	f_mu=compute_f_mu(xhist,t,muparams)+xhist*(T.sqrt(1.0-betas)).dimshuffle(0,'x','x')
	f_cov=compute_f_cov(xhist,t,covparams)*betas.dimshuffle(0,'x')
	xhist=T.concatenate([x_0.dimshuffle('x',0,1), xhist],axis=0)
	diffs=(f_mu-xhist[:-1])**2
	gaussian_terms=T.sum(diffs*(1.0/f_cov.dimshuffle(0,1,'x')),axis=2)
	det_terms=T.sum(T.log(f_cov.dimshuffle(0,1,'x')),axis=2)
	
	reverse_loss=T.mean(T.mean(gaussian_terms+det_terms))
	return reverse_loss+forward_loss
Example #30
 def cost(self):
   """
   :param y: shape (time*batch,) -> label
   :return: error scalar, known_grads dict
   """
   y_f = T.cast(T.reshape(self.y_data_flat, (self.y_data_flat.shape[0] * self.y_data_flat.shape[1]), ndim = 1), 'int32')
   known_grads = None
   if self.loss == 'sprint':
     if not isinstance(self.sprint_opts, dict):
       import json
       self.sprint_opts = json.loads(self.sprint_opts)
     assert isinstance(self.sprint_opts, dict), "you need to specify sprint_opts in the output layer"
     if self.exp_normalize:
       log_probs = T.log(self.p_y_given_x)
     else:
       log_probs = self.z
     sprint_error_op = SprintErrorSigOp(self.attrs.get("target", "classes"), self.sprint_opts)
     err, grad = sprint_error_op(log_probs, T.sum(self.index, axis=0))
     err = err.sum()
     if self.loss_like_ce:
       y_ref = T.clip(self.p_y_given_x - grad, numpy.float32(0), numpy.float32(1))
       err = -T.sum(T.log(T.pow(self.p_y_given_x, y_ref)) * T.cast(self.index, "float32").dimshuffle(0, 1, 'x'))
     if self.ce_smoothing:
       err *= numpy.float32(1.0 - self.ce_smoothing)
       grad *= numpy.float32(1.0 - self.ce_smoothing)
       if not self.prior_scale:  # we kept the softmax bias as it was
         nll, pcx = T.nnet.crossentropy_softmax_1hot(x=self.y_m[self.i], y_idx=self.y_data_flat[self.i])
       else:  # assume that we have subtracted the bias by the log priors beforehand
         assert self.log_prior is not None
         # In this case, for the CE calculation, we need to add the log priors again.
         y_m_prior = T.reshape(self.z + numpy.float32(self.prior_scale) * self.log_prior,
                               (self.z.shape[0] * self.z.shape[1], self.z.shape[2]), ndim=2)
         nll, pcx = T.nnet.crossentropy_softmax_1hot(x=y_m_prior[self.i], y_idx=self.y_data_flat[self.i])
       ce = numpy.float32(self.ce_smoothing) * T.sum(nll)
       err += ce
       grad += T.grad(ce, self.z)
     known_grads = {self.z: grad}
     return err, known_grads
   elif self.loss == 'ctc':
     from theano.tensor.extra_ops import cpu_contiguous
     err, grad, priors = CTCOp()(self.p_y_given_x, cpu_contiguous(self.y.dimshuffle(1, 0)), self.index_for_ctc())
     known_grads = {self.z: grad}
     return err.sum(), known_grads, priors.sum(axis=0)
   elif self.loss == 'ce_ctc':
     y_m = T.reshape(self.z, (self.z.shape[0] * self.z.shape[1], self.z.shape[2]), ndim=2)
     p_y_given_x = T.nnet.softmax(y_m)
     #pcx = p_y_given_x[(self.i > 0).nonzero(), y_f[(self.i > 0).nonzero()]]
     pcx = p_y_given_x[self.i, self.y_data_flat[self.i]]
     ce = -T.sum(T.log(pcx))
     return ce, known_grads
   elif self.loss == 'ctc2':
     from NetworkCtcLayer import ctc_cost, uniq_with_lengths, log_sum
     max_time = self.z.shape[0]
     num_batches = self.z.shape[1]
     time_mask = self.index.reshape((max_time, num_batches))
     y_batches = self.y_data_flat.reshape((max_time, num_batches))
     targets, seq_lens = uniq_with_lengths(y_batches, time_mask)
     log_pcx = self.z - log_sum(self.z, axis=0, keepdims=True)
     err = ctc_cost(log_pcx, time_mask, targets, seq_lens)
     return err, known_grads
Example #31
    def __init__(self,
                 d_v,
                 d_e,
                 d_t,
                 optimizer,
                 optimizer_args,
                 np_rng,
                 th_rng,
                 n_classes=0,
                 encoder_layers=1,
                 generator_layers=0,
                 generator_transform=None,
                 use_interactions=False,
                 clip_gradients=False,
                 init_bias=None,
                 train_bias=False,
                 scale=6.0,
                 encode_labels=False,
                 l1_inter_factor=1.0,
                 time_penalty=False,
                 encoder_shortcut=False,
                 generator_shortcut=False):

        self.d_v = d_v  # vocabulary size
        self.d_e = d_e  # dimensionality of encoder
        self.d_t = d_t  # number of topics
        self.n_classes = n_classes  # number of classes
        assert encoder_layers == 1 or encoder_layers == 2
        self.n_encoder_layers = encoder_layers
        assert generator_layers == 0 or generator_layers == 1 or generator_layers == 2 or generator_layers == 4
        self.n_generator_layers = generator_layers

        # set various options
        self.generator_transform = generator_transform  # transform to apply after the generator
        self.use_interactions = use_interactions  # use interactions between topics and labels
        self.encode_labels = encode_labels  # feed labels into the encoder
        self.l1_inter_factor = l1_inter_factor  # factor by which to multiply L1 penalty on interactions
        self.encoder_shortcut = encoder_shortcut
        self.generator_shortcut = generator_shortcut

        # create parameter matrices and biases
        self.W_encoder_1 = common_theano.init_param('W_encoder_1', (d_e, d_v),
                                                    np_rng,
                                                    scale=scale)
        self.b_encoder_1 = common_theano.init_param('b_encoder_1', (d_e, ),
                                                    np_rng,
                                                    scale=0.0)

        if n_classes > 1:
            self.W_encoder_label = common_theano.init_param('W_encoder_label',
                                                            (d_e, n_classes),
                                                            np_rng,
                                                            scale=scale)
        else:
            self.W_encoder_label = common_theano.init_param(
                'W_encoder_label', (d_e, n_classes),
                np_rng,
                values=np.zeros((d_e, n_classes), dtype=np.float32))

        self.W_encoder_2 = common_theano.init_param('W_encoder_2', (d_e, d_e),
                                                    np_rng,
                                                    scale=scale)
        self.b_encoder_2 = common_theano.init_param('b_encoder_2', (d_e, ),
                                                    np_rng,
                                                    scale=0.0)

        self.W_encoder_shortcut = common_theano.init_param(
            'W_encoder_shortcut', (d_e, d_v), np_rng, scale=scale)

        self.W_mu = common_theano.init_param('W_mu', (d_t, d_e),
                                             np_rng,
                                             scale=scale)
        self.b_mu = common_theano.init_param('b_mu', (d_t, ),
                                             np_rng,
                                             scale=0.0)

        self.W_sigma = common_theano.init_param('W_sigma', (d_t, d_e),
                                                np_rng,
                                                scale=scale,
                                                values=np.zeros((d_t, d_e)))
        self.b_sigma = common_theano.init_param('b_sigma', (d_t, ),
                                                np_rng,
                                                scale=0.0,
                                                values=np.array([-4] * d_t))

        self.W_generator_1 = common_theano.init_param('W_generator_1',
                                                      (d_t, d_t),
                                                      np_rng,
                                                      scale=scale)
        self.b_generator_1 = common_theano.init_param('b_generator_1', (d_t, ),
                                                      np_rng,
                                                      scale=0.0)

        self.W_generator_2 = common_theano.init_param('W_generator_2',
                                                      (d_t, d_t),
                                                      np_rng,
                                                      scale=scale)
        self.b_generator_2 = common_theano.init_param('b_generator_2', (d_t, ),
                                                      np_rng,
                                                      scale=0.0)

        self.W_generator_3 = common_theano.init_param('W_generator_3',
                                                      (d_t, d_t),
                                                      np_rng,
                                                      scale=scale)
        self.b_generator_3 = common_theano.init_param('b_generator_3', (d_t, ),
                                                      np_rng,
                                                      scale=0.0)

        self.W_generator_4 = common_theano.init_param('W_generator_4',
                                                      (d_t, d_t),
                                                      np_rng,
                                                      scale=scale)
        self.b_generator_4 = common_theano.init_param('b_generator_4', (d_t, ),
                                                      np_rng,
                                                      scale=0.0)

        self.W_decoder = common_theano.init_param('W_decoder', (d_v, d_t),
                                                  np_rng,
                                                  scale=scale)
        self.b_decoder = common_theano.init_param('b_decoder', (d_v, ),
                                                  np_rng,
                                                  scale=0.0)

        self.W_decoder_label = common_theano.init_param('W_decoder_label',
                                                        (d_v, n_classes),
                                                        np_rng,
                                                        scale=scale)
        self.W_decoder_inter = common_theano.init_param('W_decoder_inter',
                                                        (d_v, d_t * n_classes),
                                                        np_rng,
                                                        scale=scale)

        # set the decoder bias to the background frequency
        if init_bias is not None:
            self.b_decoder = common_theano.init_param('b_decoder', (d_v, ),
                                                      np_rng,
                                                      values=init_bias)

        # create basic sets of parameters which we will use to tell the model what to update
        self.params = [
            self.W_encoder_1, self.b_encoder_1, self.W_mu, self.b_mu,
            self.W_sigma, self.b_sigma, self.W_decoder
        ]
        self.param_shapes = [(d_e, d_v), (d_e, ), (d_t, d_e), (d_t, ),
                             (d_t, d_e), (d_t, ), (d_v, d_t)]

        self.encoder_params = [
            self.W_encoder_1, self.b_encoder_1, self.W_mu, self.b_mu,
            self.W_sigma, self.b_sigma
        ]
        self.encoder_param_shapes = [(d_e, d_v), (d_e, ), (d_t, d_e), (d_t, ),
                                     (d_t, d_e), (d_t, )]

        self.generator_params = []
        self.generator_param_shapes = []

        # add additional parameters to sets, depending on configuration
        if train_bias:
            self.params.append(self.b_decoder)
            self.param_shapes.append((d_v, ))
            self.decoder_params = [self.W_decoder, self.b_decoder]
            self.decoder_param_shapes = [(d_v, d_t), (d_v, )]
        else:
            self.decoder_params = [self.W_decoder]
            self.decoder_param_shapes = [(d_v, d_t)]

        # add parameters for labels (covariates)
        if self.n_classes > 1:
            self.params.append(self.W_decoder_label)
            self.param_shapes.append((d_v, n_classes))
            self.decoder_params.extend([self.W_decoder_label])
            self.decoder_param_shapes.extend([(d_v, n_classes)])
            if use_interactions:
                self.params.append(self.W_decoder_inter)
                self.param_shapes.append((d_v, d_t * n_classes))
                self.decoder_params.extend([self.W_decoder_inter])
                self.decoder_param_shapes.extend([(d_v, d_t * n_classes)])
            if encode_labels:
                self.params.append(self.W_encoder_label)
                self.param_shapes.append((d_e, n_classes))
                self.encoder_params.extend([self.W_encoder_label])
                self.encoder_param_shapes.extend([(d_e, n_classes)])
        self.label_only_params = [self.W_decoder_label]
        self.label_only_param_shapes = [(d_v, n_classes)]

        # add encoder parameters depending on number of layers
        if self.n_encoder_layers > 1:
            self.params.extend([self.W_encoder_2, self.b_encoder_2])
            self.param_shapes.extend([(d_e, d_e), (d_e, )])
            self.encoder_params.extend([self.W_encoder_2, self.b_encoder_2])
            self.encoder_param_shapes.extend([(d_e, d_e), (d_e, )])
        if self.encoder_shortcut:
            self.params.extend([self.W_encoder_shortcut])
            self.param_shapes.extend([(d_e, d_v)])
            self.encoder_params.extend([self.W_encoder_shortcut])
            self.encoder_param_shapes.extend([(d_e, d_v)])

        # add generator parameters depending on number of layers
        if self.n_generator_layers > 0:
            self.params.extend([self.W_generator_1, self.b_generator_1])
            self.param_shapes.extend([(d_t, d_t), (d_t, )])
            self.generator_params.extend(
                [self.W_generator_1, self.b_generator_1])
            self.generator_param_shapes.extend([(d_t, d_t), (d_t, )])

        if self.n_generator_layers > 1:
            self.params.extend([self.W_generator_2, self.b_generator_2])
            self.param_shapes.extend([(d_t, d_t), (d_t, )])
            self.generator_params.extend(
                [self.W_generator_2, self.b_generator_2])
            self.generator_param_shapes.extend([(d_t, d_t), (d_t, )])

        if self.n_generator_layers > 2:
            self.params.extend([
                self.W_generator_3, self.b_generator_3, self.W_generator_4,
                self.b_generator_4
            ])
            self.param_shapes.extend([(d_t, d_t), (d_t, ), (d_t, d_t),
                                      (d_t, )])
            self.generator_params.extend([
                self.W_generator_3, self.b_generator_3, self.W_generator_4,
                self.b_generator_4
            ])
            self.generator_param_shapes.extend([(d_t, d_t), (d_t, ),
                                                (d_t, d_t), (d_t, )])

        # declare variables that will be given as inputs to functions to be declared below
        x = T.vector('x', dtype=theano.config.floatX
                     )  # normalized vector of counts for one item
        y = T.vector(
            'y', dtype=theano.config.floatX)  # vector of labels for one item
        indices = T.ivector(
            'x')  # vector of vocab indices (easier to evaluate log prob)
        lr = T.fscalar('lr')  # learning rate
        l1_strength = T.fscalar('l1_strength')  # L1 penalty strength
        kl_strength = T.fscalar('kl_strength')  # KL term strength

        n_words = T.shape(indices)
        # the two variables below are just for debugging
        n_words_print = theano.printing.Print('n_words')(
            T.shape(indices)[0])  # for debugging
        x_sum = theano.printing.Print('x_sum')(T.sum(x))  # for debugging

        # encode one item to mean and variance vectors
        mu, log_sigma_sq = self.encoder(x, y)

        # take a random sample from the corresponding multivariate normal
        h = self.sampler(mu, log_sigma_sq, th_rng)

        # compute the KL divergence from the prior
        KLD = -0.5 * T.sum(1 + log_sigma_sq - T.square(mu) -
                           T.exp(log_sigma_sq))

        # generate a document representation of dimensionality == n_topics
        r = self.generator(h)

        # decode back into a distribution over the vocabulary
        p_x_given_h = self.decoder(r, y)

        # evaluate the likelihood
        nll_term = -T.sum(
            T.log(p_x_given_h[T.zeros(n_words, dtype='int32'), indices]) +
            1e-32)

        # compute the loss
        loss = nll_term + KLD * kl_strength

        # add an L1 penalty to the decoder terms
        if time_penalty and n_classes > 1:
            penalty = common_theano.col_diff_L1(l1_strength,
                                                self.W_decoder_label,
                                                n_classes)
        else:
            penalty = common_theano.L1(l1_strength, self.W_decoder)
            if n_classes > 1:
                penalty += common_theano.L1(l1_strength, self.W_decoder_label)
                if use_interactions:
                    penalty += common_theano.L1(
                        l1_strength * self.l1_inter_factor,
                        self.W_decoder_inter)

        # declare some alternate function for decoding from the mean
        r_mu = self.generator(mu)
        p_x_given_x = self.decoder(r_mu, y)
        nll_term_mu = -T.sum(
            T.log(p_x_given_x[T.zeros(n_words, dtype='int32'), indices]) +
            1e-32)

        # declare some alternate functions for pretraining from a fixed document representation (r)
        pretrain_r = T.vector('pretrain_r', dtype=theano.config.floatX)
        p_x_given_pretrain_h = self.decoder(pretrain_r, y)
        pretrain_loss = -T.sum(
            T.log(p_x_given_pretrain_h[T.zeros(n_words, dtype='int32'),
                                       indices]) + 1e-32)

        # declare some alternate functions for only using labels
        p_x_given_y_only = self.decoder_label_only(y)
        nll_term_y_only = -T.sum(
            T.log(p_x_given_y_only[T.zeros(n_words, dtype='int32'), indices]) +
            1e-32)

        # compute gradients
        gradients = [
            T.cast(T.grad(loss + penalty, param, disconnected_inputs='warn'),
                   dtype=theano.config.floatX) for param in self.params
        ]
        encoder_gradients = [
            T.cast(T.grad(loss, param, disconnected_inputs='warn'),
                   dtype=theano.config.floatX) for param in self.encoder_params
        ]
        generator_gradients = [
            T.cast(T.grad(loss, param, disconnected_inputs='warn'),
                   dtype=theano.config.floatX)
            for param in self.generator_params
        ]
        decoder_gradients = [
            T.cast(T.grad(loss + penalty, param, disconnected_inputs='warn'),
                   dtype=theano.config.floatX) for param in self.decoder_params
        ]
        pretrain_gradients = [
            T.cast(T.grad(pretrain_loss + penalty,
                          param,
                          disconnected_inputs='warn'),
                   dtype=theano.config.floatX) for param in self.decoder_params
        ]
        label_only_gradients = [
            T.cast(T.grad(nll_term_y_only + penalty,
                          param,
                          disconnected_inputs='warn'),
                   dtype=theano.config.floatX)
            for param in self.label_only_params
        ]

        # optionally clip gradients
        if clip_gradients:
            gradients = common_theano.clip_gradients(gradients, 5)
            encoder_gradients = common_theano.clip_gradients(
                encoder_gradients, 5)
            generator_gradients = common_theano.clip_gradients(
                generator_gradients, 5)
            decoder_gradients = common_theano.clip_gradients(
                decoder_gradients, 5)
            pretrain_gradients = common_theano.clip_gradients(
                pretrain_gradients, 5)
            label_only_gradients = common_theano.clip_gradients(
                label_only_gradients, 5)

        # create the updates for various sets of parameters
        updates = optimizer(self.params, self.param_shapes, gradients, lr,
                            optimizer_args)
        encoder_updates = optimizer(self.encoder_params,
                                    self.encoder_param_shapes,
                                    encoder_gradients, lr, optimizer_args)
        generator_updates = optimizer(self.generator_params,
                                      self.generator_param_shapes,
                                      generator_gradients, lr, optimizer_args)
        decoder_updates = optimizer(self.decoder_params,
                                    self.decoder_param_shapes,
                                    decoder_gradients, lr, optimizer_args)
        other_updates = optimizer(
            self.encoder_params + self.generator_params,
            self.encoder_param_shapes + self.generator_param_shapes,
            encoder_gradients + generator_gradients, lr, optimizer_args)
        pretrain_updates = optimizer(self.decoder_params,
                                     self.decoder_param_shapes,
                                     pretrain_gradients, lr, optimizer_args)
        label_only_updates = optimizer(self.label_only_params,
                                       self.label_only_param_shapes,
                                       label_only_gradients, lr,
                                       optimizer_args)

        # declare the available methods for this class
        self.test_input = theano.function(inputs=[x, indices],
                                          outputs=[n_words_print, x_sum])
        self.train = theano.function(
            inputs=[x, indices, y, lr, l1_strength, kl_strength],
            outputs=[nll_term, KLD, penalty],
            updates=updates,
            on_unused_input='ignore')
        self.train_encoder = theano.function(
            inputs=[x, indices, y, lr, l1_strength, kl_strength],
            outputs=[nll_term, KLD, penalty],
            updates=encoder_updates,
            on_unused_input='ignore')
        self.train_generator = theano.function(
            inputs=[x, indices, y, lr, l1_strength, kl_strength],
            outputs=[nll_term, KLD, penalty],
            updates=generator_updates,
            on_unused_input='ignore')
        self.train_decoder = theano.function(
            inputs=[x, indices, y, lr, l1_strength, kl_strength],
            outputs=[nll_term, KLD, penalty],
            updates=decoder_updates,
            on_unused_input='ignore')
        self.train_not_decoder = theano.function(
            inputs=[x, indices, y, lr, l1_strength, kl_strength],
            outputs=[nll_term, KLD, penalty],
            updates=other_updates,
            on_unused_input='ignore')
        self.pretrain_decoder = theano.function(
            inputs=[indices, y, pretrain_r, lr, l1_strength, kl_strength],
            outputs=[pretrain_loss],
            updates=pretrain_updates,
            on_unused_input='ignore')
        self.encode = theano.function(inputs=[x, y],
                                      outputs=[mu, log_sigma_sq],
                                      on_unused_input='ignore')
        self.decode = theano.function(inputs=[pretrain_r, y],
                                      outputs=[p_x_given_pretrain_h],
                                      on_unused_input='ignore')
        self.sample = theano.function(inputs=[x, y],
                                      outputs=h,
                                      on_unused_input='ignore')
        self.get_mean_doc_rep = theano.function(inputs=[x, y],
                                                outputs=r_mu,
                                                on_unused_input='ignore')
        self.encode_and_decode = theano.function(inputs=[x, y],
                                                 outputs=p_x_given_x,
                                                 on_unused_input='ignore')
        self.neg_log_likelihood = theano.function(inputs=[x, indices, y],
                                                  outputs=[nll_term, KLD],
                                                  on_unused_input='ignore')
        self.neg_log_likelihood_mu = theano.function(
            inputs=[x, indices, y],
            outputs=[nll_term_mu, KLD],
            on_unused_input='ignore')
        self.train_label_only = theano.function(
            inputs=[indices, y, lr, l1_strength],
            outputs=[nll_term_y_only, penalty],
            updates=label_only_updates)
        self.neg_log_likelihood_label_only = theano.function(
            inputs=[indices, y], outputs=nll_term_y_only)
Example #32
def dropout_layer(layer, p_dropout):
    srng = shared_randomstreams.RandomStreams(
        np.random.RandomState(0).randint(999999))
    mask = srng.binomial(n=1, p=1 - p_dropout, size=layer.shape)
    return layer * T.cast(mask, theano.config.floatX)
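
A short usage sketch (hypothetical shapes), assuming dropout_layer above is in scope: roughly half of the activations are zeroed; note that this variant does not rescale the surviving units, so train/test scaling has to be handled by the caller.

import numpy as np
import theano
import theano.tensor as T
from theano.tensor import shared_randomstreams  # used inside dropout_layer

acts = T.matrix('acts')
dropped = dropout_layer(acts, p_dropout=0.5)
f = theano.function([acts], dropped)
print(f(np.ones((2, 4), dtype=theano.config.floatX)))  # roughly half the entries are zero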
Example #33
0
def max_pool_3d(input, ds, ignore_border=False):
    """
    Takes as input an N-D tensor, where N >= 3. It downscales the input video by
    the specified factor, by keeping only the maximum value of non-overlapping
    patches of size (ds[0],ds[1],ds[2]) (time, height, width)

    :type input: N-D theano tensor of input images.
    :param input: input images. Max pooling will be done over the last 3 dimensions.
    :type ds: tuple of length 3
    :param ds: factor by which to downscale. (2,2,2) will halve the video in each dimension.
    :param ignore_border: boolean value. When True, (5,5,5) input with ds=(2,2,2) will generate a
      (2,2,2) output. (3,3,3) otherwise.
    """

    if input.ndim < 3:
        raise NotImplementedError('max_pool_3d requires a dimension >= 3')

    # extract nr dimensions
    vid_dim = input.ndim
    # max pool in two different steps, so we can use the 2d implementation of
    # downsamplefactormax. First maxpool frames as usual.
    # Then maxpool the time dimension. Shift the time dimension to the third
    # position, so rows and cols are in the back

    # extract dimensions
    frame_shape = input.shape[-2:]

    # collapse all leading dimensions into a single batch dimension (kept as a length-1 vector)
    batch_size = T.prod(input.shape[:-2])
    batch_size = T.shape_padright(batch_size, 1)

    # store as 4D tensor with shape: (batch_size,1,height,width)
    new_shape = T.cast(T.join(0, batch_size, T.as_tensor([
        1,
    ]), frame_shape), 'int32')
    input_4D = T.reshape(input, new_shape, ndim=4)

    # downsample mini-batch of videos in rows and cols
    output = T.signal.pool.pool_2d(input_4D, (ds[1], ds[2]), ignore_border)
    # restore to original shape
    outshape = T.join(0, input.shape[:-2], output.shape[-2:])
    out = T.reshape(output, outshape, ndim=input.ndim)

    # now maxpool time

    # output (time, rows, cols), reshape so that time is in the back
    shufl = (list(range(vid_dim - 3)) + [vid_dim - 2] + [vid_dim - 1] +
             [vid_dim - 3])
    input_time = out.dimshuffle(shufl)
    # reset dimensions
    vid_shape = input_time.shape[-2:]

    # collapse all leading dimensions into a single batch dimension (kept as a length-1 vector)
    batch_size = T.prod(input_time.shape[:-2])
    batch_size = T.shape_padright(batch_size, 1)

    # store as 4D tensor with shape: (batch_size,1,width,time)
    new_shape = T.cast(T.join(0, batch_size, T.as_tensor([
        1,
    ]), vid_shape), 'int32')
    input_4D_time = T.reshape(input_time, new_shape, ndim=4)
    # downsample mini-batch of videos in time
    outtime = T.signal.pool.pool_2d(input_4D_time, (1, ds[0]), ignore_border)
    # output
    # restore to original shape (xxx, rows, cols, time)
    outshape = T.join(0, input_time.shape[:-2], outtime.shape[-2:])
    shufl = (list(range(vid_dim - 3)) + [vid_dim - 1] + [vid_dim - 3] +
             [vid_dim - 2])
    return T.reshape(outtime, outshape, ndim=input.ndim).dimshuffle(shufl)
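To see the docstring's shape contract in action, here is a minimal sketch (assuming the same imports the function relies on, i.e. theano, theano.tensor as T and theano.tensor.signal.pool): pooling a (batch, time, height, width) stack by (2, 2, 2) halves every spatiotemporal axis.

import numpy as np
import theano
import theano.tensor as T
from theano.tensor.signal import pool             # noqa: makes T.signal.pool resolvable

videos = T.tensor4('videos')                      # (batch, time, height, width)
pooled = max_pool_3d(videos, ds=(2, 2, 2), ignore_border=True)
pool_fn = theano.function([videos], pooled)

clip = np.random.rand(4, 16, 32, 32).astype(theano.config.floatX)
print(pool_fn(clip).shape)                        # -> (4, 8, 16, 16)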
Example #34
0
update = []

# shared variables
learning_rate = shared(float32(lr.init))
if use.mom:
    momentum = shared(float32(mom.momentum))
    drop.p_vid = shared(float32(drop.p_vid_val))
    drop.p_hidden = shared(float32(drop.p_hidden_val))

idx_mini = T.lscalar(name="idx_mini")  # minibatch index
idx_micro = T.lscalar(name="idx_micro")  # microbatch index
x = ndtensor(len(tr.in_shape))(name='x')  # video input
y = T.ivector(name='y')  # labels
x_ = _shared(empty(tr.in_shape))
y_ = _shared(empty(tr.batch_size))
y_int32 = T.cast(y_, 'int32')

# in shape: #frames * gray/depth * body/hand * 4 maps
import cPickle
f = open(os.path.join(load_path, 'SK_normalization.pkl'), 'rb')
SK_normalization = cPickle.load(f)
Mean1 = SK_normalization['Mean1']
Std1 = SK_normalization['Std1']

f = open('CNN_normalization.pkl', 'rb')
CNN_normalization = cPickle.load(f)
Mean_CNN = CNN_normalization['Mean_CNN']
Std_CNN = CNN_normalization['Std_CNN']

# customized data loader for both video module and skeleton module
loader = DataLoader_with_skeleton_normalisation(
Example #35
0
    def __init__(self, \
            rng=None, \
            Xd=None, \
            prior_sigma=None, \
            params=None, \
            shared_param_dicts=None):
        # Set up a shared random generator for this network
        self.rng = RandStream(rng.randint(1000000))
        # Grab the symbolic input matrix
        self.Xd = Xd
        self.prior_sigma = prior_sigma
        #####################################################
        # Process user-supplied parameters for this network #
        #####################################################
        self.params = params
        self.lam_l2a = params['lam_l2a']
        if 'build_theano_funcs' in params:
            self.build_theano_funcs = params['build_theano_funcs']
        else:
            self.build_theano_funcs = True
        if 'vis_drop' in params:
            self.vis_drop = params['vis_drop']
        else:
            self.vis_drop = 0.0
        if 'hid_drop' in params:
            self.hid_drop = params['hid_drop']
        else:
            self.hid_drop = 0.0
        if 'input_noise' in params:
            self.input_noise = params['input_noise']
        else:
            self.input_noise = 0.0
        if 'bias_noise' in params:
            self.bias_noise = params['bias_noise']
        else:
            self.bias_noise = 0.0
        if 'init_scale' in params:
            self.init_scale = params['init_scale']
        else:
            self.init_scale = 1.0
        if 'encoder' in params:
            self.encoder = params['encoder']
            self.decoder = params['decoder']
            self.use_encoder = True
            self.Xd_encoded = self.encoder(self.Xd)
        else:
            self.encoder = lambda x: x
            self.decoder = lambda x: x
            self.use_encoder = False
            self.Xd_encoded = self.encoder(self.Xd)
        if 'kld2_scale' in params:
            self.kld2_scale = params['kld2_scale']
        else:
            self.kld2_scale = 0.0
        if 'sigma_init_scale' in params:
            self.sigma_init_scale = params['sigma_init_scale']
        else:
            self.sigma_init_scale = 1.0
        # Check if the params for this net were given a priori. This option
        # will be used for creating "clones" of an inference network, with all
        # of the network parameters shared between clones.
        if shared_param_dicts is None:
            # This is not a clone, and we will need to make a dict for
            # referring to the parameters of each network layer
            self.shared_param_dicts = {'shared': [], 'mu': [], 'sigma': []}
            self.is_clone = False
        else:
            # This is a clone, and its layer parameters can be found by
            # referring to the given param dict (i.e. shared_param_dicts).
            self.shared_param_dicts = shared_param_dicts
            self.is_clone = True
        # Get the configuration/prototype for this network. The config is a
        # list of layer descriptions, including a description for the input
        # layer, which is typically just the dimension of the inputs. So, the
        # depth of the mlp is one less than the number of layer configs.
        self.shared_config = params['shared_config']
        self.mu_config = params['mu_config']
        self.sigma_config = params['sigma_config']
        if 'activation' in params:
            self.activation = params['activation']
        else:
            self.activation = relu_actfun
        #########################################
        # Initialize the shared part of network #
        #########################################
        self.shared_layers = []
        layer_def_pairs = zip(self.shared_config[:-1],self.shared_config[1:])
        layer_num = 0
        # Construct input to the inference network
        if self.use_encoder:
            next_input = self.encoder(self.Xd)
        else:
            next_input = self.Xd
        for in_def, out_def in layer_def_pairs:
            first_layer = (layer_num == 0)
            last_layer = (layer_num == (len(layer_def_pairs) - 1))
            l_name = "share_layer_{0:d}".format(layer_num)
            if (type(in_def) is list) or (type(in_def) is tuple):
                # Receiving input from a poolish layer...
                in_dim = in_def[0]
            else:
                # Receiving input from a normal layer...
                in_dim = in_def
            if (type(out_def) is list) or (type(out_def) is tuple):
                # Applying some sort of pooling in this layer...
                out_dim = out_def[0]
                pool_size = out_def[1]
            else:
                # Not applying any pooling in this layer...
                out_dim = out_def
                pool_size = 0
            # Select the appropriate noise to add to this layer
            if first_layer:
                d_rate = self.vis_drop
            else:
                d_rate = self.hid_drop
            if first_layer:
                i_noise = self.input_noise
                b_noise = 0.0
            else:
                i_noise = 0.0
                b_noise = self.bias_noise
            # set in-bound weights to have norm self.init_scale
            i_scale = self.init_scale
            if not self.is_clone:
                ##########################################
                # Initialize a layer with new parameters #
                ##########################################
                new_layer = HiddenLayer(rng=rng, input=next_input, \
                        activation=self.activation, pool_size=pool_size, \
                        drop_rate=d_rate, input_noise=i_noise, bias_noise=b_noise, \
                        in_dim=in_dim, out_dim=out_dim, \
                        name=l_name, W_scale=i_scale)
                self.shared_layers.append(new_layer)
                self.shared_param_dicts['shared'].append( \
                        {'W': new_layer.W, 'b': new_layer.b, \
                         'b_in': new_layer.b_in, 's_in': new_layer.s_in})
            else:
                ##################################################
                # Initialize a layer with some shared parameters #
                ##################################################
                init_params = self.shared_param_dicts['shared'][layer_num]
                if not (('b_in' in init_params) and ('s_in' in init_params)):
                    init_params['b_in'] = None
                    init_params['s_in'] = None
                new_layer = HiddenLayer(rng=rng, input=next_input, \
                        activation=self.activation, pool_size=pool_size, \
                        drop_rate=d_rate, input_noise=i_noise, bias_noise=b_noise, \
                        in_dim=in_dim, out_dim=out_dim, \
                        W=init_params['W'], b=init_params['b'], \
                        b_in=init_params['b_in'], s_in=init_params['s_in'], \
                        name=l_name, W_scale=i_scale)
                self.shared_layers.append(new_layer)
                if ((init_params['b_in'] is None) or (init_params['s_in'] is None)):
                    init_params['b_in'] = new_layer.b_in
                    init_params['s_in'] = new_layer.s_in
            next_input = self.shared_layers[-1].output
            # Acknowledge layer completion
            layer_num = layer_num + 1
        #####################################
        # Initialize the mu part of network #
        #####################################
        self.mu_layers = []
        layer_def_pairs = zip(self.mu_config[:-1],self.mu_config[1:])
        layer_num = 0
        # Take input from the output of the shared network
        next_input = self.shared_layers[-1].output
        for in_def, out_def in layer_def_pairs:
            first_layer = (layer_num == 0)
            last_layer = (layer_num == (len(layer_def_pairs) - 1))
            l_name = "mu_layer_{0:d}".format(layer_num)
            if (type(in_def) is list) or (type(in_def) is tuple):
                # Receiving input from a poolish layer...
                in_dim = in_def[0]
            else:
                # Receiving input from a normal layer...
                in_dim = in_def
            if (type(out_def) is list) or (type(out_def) is tuple):
                # Applying some sort of pooling in this layer...
                out_dim = out_def[0]
                pool_size = out_def[1]
            else:
                # Not applying any pooling in this layer...
                out_dim = out_def
                pool_size = 0
            # Select the appropriate noise to add to this layer
            d_rate = self.hid_drop
            i_noise = 0.0
            b_noise = self.bias_noise
            # set in-bound weights to have norm self.init_scale
            i_scale = self.init_scale
            if not self.is_clone:
                ##########################################
                # Initialize a layer with new parameters #
                ##########################################
                new_layer = HiddenLayer(rng=rng, input=next_input, \
                        activation=self.activation, pool_size=pool_size, \
                        drop_rate=d_rate, input_noise=i_noise, bias_noise=b_noise, \
                        in_dim=in_dim, out_dim=out_dim, \
                        name=l_name, W_scale=i_scale)
                self.mu_layers.append(new_layer)
                self.shared_param_dicts['mu'].append( \
                        {'W': new_layer.W, 'b': new_layer.b, \
                         'b_in': new_layer.b_in, 's_in': new_layer.s_in})
            else:
                ##################################################
                # Initialize a layer with some shared parameters #
                ##################################################
                init_params = self.shared_param_dicts['mu'][layer_num]
                if not (('b_in' in init_params) and ('s_in' in init_params)):
                    init_params['b_in'] = None
                    init_params['s_in'] = None
                new_layer = HiddenLayer(rng=rng, input=next_input, \
                        activation=self.activation, pool_size=pool_size, \
                        drop_rate=d_rate, input_noise=i_noise, bias_noise=b_noise, \
                        in_dim=in_dim, out_dim=out_dim, \
                        W=init_params['W'], b=init_params['b'], \
                        b_in=init_params['b_in'], s_in=init_params['s_in'], \
                        name=l_name, W_scale=i_scale)
                self.mu_layers.append(new_layer)
                if ((init_params['b_in'] is None) or (init_params['s_in'] is None)):
                    init_params['b_in'] = new_layer.b_in
                    init_params['s_in'] = new_layer.s_in
            next_input = self.mu_layers[-1].output
            # Acknowledge layer completion
            layer_num = layer_num + 1
        ########################################
        # Initialize the sigma part of network #
        ########################################
        self.sigma_layers = []
        layer_def_pairs = zip(self.sigma_config[:-1],self.sigma_config[1:])
        layer_num = 0
        # Take input from the output of the shared network
        next_input = self.shared_layers[-1].output
        for in_def, out_def in layer_def_pairs:
            first_layer = (layer_num == 0)
            last_layer = (layer_num == (len(layer_def_pairs) - 1))
            l_name = "sigma_layer_{0:d}".format(layer_num)
            if (type(in_def) is list) or (type(in_def) is tuple):
                # Receiving input from a poolish layer...
                in_dim = in_def[0]
            else:
                # Receiving input from a normal layer...
                in_dim = in_def
            if (type(out_def) is list) or (type(out_def) is tuple):
                # Applying some sort of pooling in this layer...
                out_dim = out_def[0]
                pool_size = out_def[1]
            else:
                # Not applying any pooling in this layer...
                out_dim = out_def
                pool_size = 0
            # Select the appropriate noise to add to this layer
            d_rate = self.hid_drop
            i_noise = 0.0
            b_noise = self.bias_noise
            # set in-bound weights to have norm self.init_scale
            i_scale = self.init_scale
            if last_layer:
                # set in-bound weights for logvar predictions to 0
                i_scale = 0.0 * i_scale
            if not self.is_clone:
                ##########################################
                # Initialize a layer with new parameters #
                ##########################################
                new_layer = HiddenLayer(rng=rng, input=next_input, \
                        activation=self.activation, pool_size=pool_size, \
                        drop_rate=d_rate, input_noise=i_noise, bias_noise=b_noise, \
                        in_dim=in_dim, out_dim=out_dim, \
                        name=l_name, W_scale=i_scale)
                self.sigma_layers.append(new_layer)
                self.shared_param_dicts['sigma'].append( \
                        {'W': new_layer.W, 'b': new_layer.b, \
                         'b_in': new_layer.b_in, 's_in': new_layer.s_in})
            else:
                ##################################################
                # Initialize a layer with some shared parameters #
                ##################################################
                init_params = self.shared_param_dicts['sigma'][layer_num]
                if not (('b_in' in init_params) and ('s_in' in init_params)):
                    init_params['b_in'] = None
                    init_params['s_in'] = None
                new_layer = HiddenLayer(rng=rng, input=next_input, \
                        activation=self.activation, pool_size=pool_size, \
                        drop_rate=d_rate, input_noise=i_noise, bias_noise=b_noise, \
                        in_dim=in_dim, out_dim=out_dim, \
                        W=init_params['W'], b=init_params['b'], \
                        b_in=init_params['b_in'], s_in=init_params['s_in'], \
                        name=l_name, W_scale=i_scale)
                self.sigma_layers.append(new_layer)
                if ((init_params['b_in'] is None) or (init_params['s_in'] is None)):
                    init_params['b_in'] = new_layer.b_in
                    init_params['s_in'] = new_layer.s_in
            next_input = self.sigma_layers[-1].output
            # Acknowledge layer completion
            layer_num = layer_num + 1

        # Create a shared parameter for rescaling posterior "sigmas" to allow
        # control over the velocity of the Markov chain generated by repeated
        # cycling through the INF -> GEN loop.
        if not ('sigma_scale' in self.shared_param_dicts['sigma'][-1]):
            # we use a hack-ish check to remain compatible with loading models
            # that were saved before the addition of the sigma_scale param.
            zero_ary = np.zeros((1,)).astype(theano.config.floatX)
            self.sigma_scale = theano.shared(value=zero_ary)
            new_dict = {'sigma_scale': self.sigma_scale}
            self.shared_param_dicts['sigma'].append(new_dict)
            self.set_sigma_scale(1.0)
        else:
            # this is a clone of some other InfNet, and that InfNet was made
            # after adding the sigma_scale param, so use its sigma_scale
            self.sigma_scale = \
                    self.shared_param_dicts['sigma'][-1]['sigma_scale']

        # Create a shared parameter for maintaining an exponentially decaying
        # estimate of the population mean of posterior KL divergence.
        if not ('kld_mean' in self.shared_param_dicts['sigma'][-1]):
            # add a kld_mean if none was already present
            zero_ary = np.zeros((1,)).astype(theano.config.floatX) + 100.0
            self.kld_mean = theano.shared(value=zero_ary)
            self.shared_param_dicts['sigma'][-1]['kld_mean'] = self.kld_mean
        else:
            # use a kld_mean that's already present
            self.kld_mean = self.shared_param_dicts['sigma'][-1]['kld_mean']

        # Mash all the parameters together, into a list.
        self.mlp_params = []
        for layer in self.shared_layers:
            self.mlp_params.extend(layer.params)
        for layer in self.mu_layers:
            self.mlp_params.extend(layer.params)
        for layer in self.sigma_layers:
            self.mlp_params.extend(layer.params)

        # The output of this inference network is given by the noisy output
        # of the final layers of its mu and sigma networks.
        self.output_mean = self.mu_layers[-1].linear_output
        self.output_logvar = self.sigma_layers[-1].linear_output
        self.output_sigma = self.sigma_init_scale * self.sigma_scale[0] * \
                T.exp(0.5 * self.output_logvar)

        # We'll also construct an output containing a single sample from each
        # of the distributions represented by the rows of self.output_mean and
        # self.output_sigma.
        self.output = self._construct_post_samples()
        self.out_dim = self.sigma_layers[-1].out_dim
        # Get simple regularization penalty to moderate activation dynamics
        self.act_reg_cost = self.lam_l2a * self._act_reg_cost()
        # Construct a function for penalizing KL divergence between the
        # approximate posteriors produced by this model and some isotropic
        # Gaussian distribution.
        self.kld_cost = self._construct_kld_cost()
        self.kld_mean_update = T.cast((0.98 * self.kld_mean) + \
                (0.02 * T.mean(self.kld_cost)), 'floatX')
        # Construct a theano function for sampling from the approximate
        # posteriors inferred by this model for some collection of points
        # in the "data space".
        if self.build_theano_funcs:
            self.sample_posterior = self._construct_sample_posterior()
            self.mean_posterior = theano.function([self.Xd], \
                    outputs=self.output_mean)
        else:
            self.sample_posterior = None
            self.mean_posterior = None
        return
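The helper self._construct_post_samples() used above is not part of this snippet; given the mean and sigma outputs built here, it presumably draws reparameterized Gaussian samples. A minimal sketch under that assumption (the function name and signature below are hypothetical):

import theano

def construct_post_samples(output_mean, output_sigma, rng):
    # Hypothetical stand-in: rng is a Theano RandomStreams object (e.g. self.rng),
    # and one standard-normal draw is taken per entry, then shifted and scaled.
    eps = rng.normal(size=output_mean.shape, avg=0.0, std=1.0,
                     dtype=theano.config.floatX)
    return output_mean + (output_sigma * eps)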
Example #36
0
    def __init__(self,
                 n_dim,
                 n_out,
                 n_chan=1,
                 n_superbatch=12800,
                 opt_alg='adam',
                 opt_params={
                     'lr': 1e-3,
                     'b1': 0.9,
                     'b2': 0.99
                 }):
        self.numpy_rng = np.random.RandomState(1234)
        self.theano_rng = RandomStreams(self.numpy_rng.randint(2**30))

        self.n_dim = n_dim
        self.n_out = n_out
        self.n_superbatch = n_superbatch
        self.alg = opt_alg
        self.n_class = 10

        lr = opt_params.get('lr')
        n_batch = opt_params.get('nb')

        train_set_x = theano.shared(
            np.empty((n_superbatch, n_chan, n_dim, n_dim),
                     dtype=theano.config.floatX),
            borrow=False,
        )
        val_set_x = theano.shared(
            np.empty((n_superbatch, n_chan, n_dim, n_dim),
                     dtype=theano.config.floatX),
            borrow=False,
        )
        train_set_y = theano.shared(
            np.empty((n_superbatch, ), dtype=theano.config.floatX),
            borrow=False,
        )
        val_set_y = theano.shared(
            np.empty((n_superbatch, ), dtype=theano.config.floatX),
            borrow=False,
        )
        train_set_y_int = T.cast(train_set_y, 'int32')
        val_set_y_int = T.cast(val_set_y, 'int32')

        train_rbm_px_mu = theano.shared(
            np.empty((n_superbatch, self.n_aux), dtype=theano.config.floatX),
            borrow=False,
        )

        X = T.tensor4(dtype=theano.config.floatX)
        S = T.tensor3(dtype=theano.config.floatX)
        Y = T.ivector()
        px_mu = T.matrix(dtype=theano.config.floatX)  # RBM-produced means (bound to the shared matrix train_rbm_px_mu via givens)
        idx1, idx2 = T.lscalar(), T.lscalar()
        alpha = T.scalar(dtype=theano.config.floatX)  # learning rate
        self.inputs = (X, Y, idx1, idx2, S, px_mu)

        # ----------------------------
        # Begin RBM-only
        self.rbm_network = self.create_rbm_model(n_dim, n_out, n_chan)
        persistent_chain = theano.shared(
            np.zeros((n_batch, self.n_hidden), dtype=theano.config.floatX),
            borrow=True,
        )
        rbm_cost, rbm_acc, rbm_updates = self.get_rbm_objective_and_updates(
            alpha,
            lr=lr,
            persistent=persistent_chain,
        )
        self.rbm_objectives = (rbm_cost, rbm_acc)
        self.rbm_train = theano.function(
            [idx1, idx2, alpha],
            [rbm_cost, rbm_acc],
            updates=rbm_updates,
            givens={
                X: train_set_x[idx1:idx2],
                Y: train_set_y_int[idx1:idx2]
            },
            on_unused_input='warn',
        )
        # End RBM-only
        # ----------------------------
        # Begin DADGM-only
        tau = theano.shared(
            np.float32(5.0),
            name='temperature',
            allow_downcast=True,
            borrow=False,
        )
        self.tau = tau
        self.dadgm_network = self.create_dadgm_model(
            X,
            Y,
            n_dim,
            n_out,
            n_chan,
        )
        dadgm_loss, dadgm_acc = self.create_dadgm_objectives(False)
        self.dadgm_objectives = (dadgm_loss, dadgm_acc)
        dadgm_params = self.get_dadgm_params()
        dadgm_grads = self.create_dadgm_gradients(dadgm_loss, False)
        dadgm_updates = self.create_dadgm_updates(
            dadgm_grads,
            dadgm_params,
            alpha,
            opt_alg,
            opt_params,
        )
        self.dadgm_train = theano.function(
            [idx1, idx2, alpha],
            [dadgm_loss, dadgm_acc],
            updates=dadgm_updates,
            givens={
                X: train_set_x[idx1:idx2],
                Y: train_set_y_int[idx1:idx2],
                px_mu: train_rbm_px_mu,
            },
            on_unused_input='warn',
        )
        self.dadgm_loss = theano.function(
            [X, Y],
            [dadgm_loss, dadgm_acc],
            on_unused_input='warn',
        )
        # End DADGM-only
        # ----------------------------
        self.n_batch = n_batch
        # parameters for sampling
        self.n_chain = 100

        # save data variables
        self.train_set_x = train_set_x
        self.train_set_y = train_set_y
        self.val_set_x = val_set_x
        self.val_set_y = val_set_y
        self.train_rbm_px_mu = train_rbm_px_mu
        self.data_loaded = False
Example #37
0
def evaluate_lenet5(learning_rate=0.0001,
                    n_epochs=2000,
                    nkerns=[256, 256],
                    batch_size=1,
                    window_width=[4, 4],
                    maxSentLength=64,
                    emb_size=300,
                    hidden_size=200,
                    margin=0.5,
                    L2_weight=0.0006,
                    Div_reg=0.06,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_truncate=40):
    maxSentLength = max_truncate + 2 * (window_width[0] - 1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/'
    rng = numpy.random.RandomState(23455)
    datasets, vocab_size = load_wikiQA_corpus(
        rootPath + 'vocab.txt', rootPath + 'WikiQA-train.txt',
        rootPath + 'test_filtered.txt', max_truncate,
        maxSentLength)  #vocab_size contain train, dev and test
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
    mtPath = '/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
    mt_train, mt_test = load_mts_wikiQA(
        mtPath + 'result_train/concate_2mt_train.txt',
        mtPath + 'result_test/concate_2mt_test.txt')
    wm_train, wm_test = load_wmf_wikiQA(
        rootPath + 'train_word_matching_scores.txt',
        rootPath + 'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad = datasets[
        0]
    indices_train_l = indices_train[::2, :]
    indices_train_r = indices_train[1::2, :]
    trainLengths_l = trainLengths[::2]
    trainLengths_r = trainLengths[1::2]
    normalized_train_length_l = normalized_train_length[::2]
    normalized_train_length_r = normalized_train_length[1::2]

    trainLeftPad_l = trainLeftPad[::2]
    trainLeftPad_r = trainLeftPad[1::2]
    trainRightPad_l = trainRightPad[::2]
    trainRightPad_r = trainRightPad[1::2]
    indices_test, testY, testLengths, normalized_test_length, testLeftPad, testRightPad = datasets[
        1]
    indices_test_l = indices_test[::2, :]
    indices_test_r = indices_test[1::2, :]
    testLengths_l = testLengths[::2]
    testLengths_r = testLengths[1::2]
    normalized_test_length_l = normalized_test_length[::2]
    normalized_test_length_r = normalized_test_length[1::2]

    testLeftPad_l = testLeftPad[::2]
    testLeftPad_r = testLeftPad[1::2]
    testRightPad_l = testRightPad[::2]
    testRightPad_r = testRightPad[1::2]

    n_train_batches = indices_train_l.shape[0] / batch_size
    n_test_batches = indices_test_l.shape[0] / batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    indices_train_l = theano.shared(numpy.asarray(indices_train_l,
                                                  dtype=theano.config.floatX),
                                    borrow=True)
    indices_train_r = theano.shared(numpy.asarray(indices_train_r,
                                                  dtype=theano.config.floatX),
                                    borrow=True)
    indices_test_l = theano.shared(numpy.asarray(indices_test_l,
                                                 dtype=theano.config.floatX),
                                   borrow=True)
    indices_test_r = theano.shared(numpy.asarray(indices_test_r,
                                                 dtype=theano.config.floatX),
                                   borrow=True)
    indices_train_l = T.cast(indices_train_l, 'int64')
    indices_train_r = T.cast(indices_train_r, 'int64')
    indices_test_l = T.cast(indices_test_l, 'int64')
    indices_test_r = T.cast(indices_test_r, 'int64')

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values = load_word2vec_to_init(rand_values,
                                        rootPath + 'vocab_embs_300d.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    #cost_tmp=0
    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    x_index_l = T.lmatrix(
        'x_index_l')  # now, x is the index matrix, must be integer
    x_index_r = T.lmatrix('x_index_r')
    y = T.lvector('y')
    left_l = T.lscalar()
    right_l = T.lscalar()
    left_r = T.lscalar()
    right_r = T.lscalar()
    length_l = T.lscalar()
    length_r = T.lscalar()
    norm_length_l = T.dscalar()
    norm_length_r = T.dscalar()
    mts = T.dmatrix()
    wmf = T.dmatrix()
    cost_tmp = T.dscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # shape of one sentence feature map: each sentence as an (emb_size, maxSentLength) "image"
    filter_size = (emb_size, window_width[0])
    filter_size_2 = (nkerns[0], window_width[1])
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    length_after_wideConv = ishape[1] + filter_size[1] - 1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Look up the word embeddings and arrange each sentence as an
    # (emb_size, maxSentLength) matrix, one column per token
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = embeddings[x_index_l.flatten()].reshape(
        (maxSentLength, emb_size)).transpose()
    layer0_r_input = embeddings[x_index_r.flatten()].reshape(
        (maxSentLength, emb_size)).transpose()

    l_input_tensor = debug_print(
        Matrix_Bit_Shift(layer0_l_input[:, left_l:-right_l]), 'l_input_tensor')
    r_input_tensor = debug_print(
        Matrix_Bit_Shift(layer0_r_input[:, left_r:-right_r]), 'r_input_tensor')

    addition_l = T.sum(layer0_l_input[:, left_l:-right_l], axis=1)
    addition_r = T.sum(layer0_r_input[:, left_r:-right_r], axis=1)
    cosine_addition = cosine(addition_l, addition_r)
    eucli_addition = 1.0 / (1.0 + EUCLID(addition_l, addition_r))  #25.2%

    U, W, b = create_GRU_para(rng, emb_size, nkerns[0])
    layer0_para = [U, W, b]

    layer0_A1 = GRU_Batch_Tensor_Input(X=l_input_tensor,
                                       hidden_dim=nkerns[0],
                                       U=U,
                                       W=W,
                                       b=b,
                                       bptt_truncate=-1)
    layer0_A2 = GRU_Batch_Tensor_Input(X=r_input_tensor,
                                       hidden_dim=nkerns[0],
                                       U=U,
                                       W=W,
                                       b=b,
                                       bptt_truncate=-1)

    cosine_sent = cosine(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep)
    eucli_sent = 1.0 / (1.0 + EUCLID(layer0_A1.output_sent_rep,
                                     layer0_A2.output_sent_rep))  #25.2%

    #ibm attentive pooling at extended sentence level
    attention_matrix = compute_simi_feature_matrix_with_matrix(
        layer0_A1.output_matrix, layer0_A2.output_matrix, layer0_A1.dim,
        layer0_A2.dim,
        maxSentLength * (maxSentLength + 1) / 2)
    #     attention_vec_l_extended=T.nnet.softmax(T.max(attention_matrix, axis=1)).transpose()
    #     ibm_l_extended=layer0_A1.output_matrix.dot(attention_vec_l_extended).transpose()
    #     attention_vec_r_extended=T.nnet.softmax(T.max(attention_matrix, axis=0)).transpose()
    #     ibm_r_extended=layer0_A2.output_matrix.dot(attention_vec_r_extended).transpose()
    #     cosine_ibm_extended=cosine(ibm_l_extended, ibm_r_extended)
    #     eucli_ibm_extended=1.0/(1.0+EUCLID(ibm_l_extended, ibm_r_extended))#25.2%

    #ibm attentive pooling at original sentence level
    simi_matrix_sent = compute_simi_feature_matrix_with_matrix(
        layer0_A1.output_sent_hiddenstates, layer0_A2.output_sent_hiddenstates,
        length_l, length_r, maxSentLength)
    attention_vec_l = T.nnet.softmax(T.max(simi_matrix_sent,
                                           axis=1)).transpose()
    ibm_l = layer0_A1.output_sent_hiddenstates.dot(attention_vec_l).transpose()
    attention_vec_r = T.nnet.softmax(T.max(simi_matrix_sent,
                                           axis=0)).transpose()
    ibm_r = layer0_A2.output_sent_hiddenstates.dot(attention_vec_r).transpose()
    cosine_ibm = cosine(ibm_l, ibm_r)
    eucli_ibm = 1.0 / (1.0 + EUCLID(ibm_l, ibm_r))  #25.2%

    l_max_attention = T.max(attention_matrix, axis=1)
    neighborsArgSorted = T.argsort(l_max_attention)
    kNeighborsArg = neighborsArgSorted[-3:]  # keep only the 3 positions with the largest attention
    ll = T.sort(kNeighborsArg).flatten()  # sort the selected indices in ascending order

    r_max_attention = T.max(attention_matrix, axis=0)
    neighborsArgSorted_r = T.argsort(r_max_attention)
    kNeighborsArg_r = neighborsArgSorted_r[-3:]  # keep only the 3 positions with the largest attention
    rr = T.sort(kNeighborsArg_r).flatten()  # sort the selected indices in ascending order

    l_max_min_attention = debug_print(layer0_A1.output_matrix[:, ll],
                                      'l_max_min_attention')
    r_max_min_attention = debug_print(layer0_A2.output_matrix[:, rr],
                                      'r_max_min_attention')

    U1, W1, b1 = create_GRU_para(rng, nkerns[0], nkerns[1])
    layer1_para = [U1, W1, b1]

    layer1_A1 = GRU_Matrix_Input(X=l_max_min_attention,
                                 word_dim=nkerns[0],
                                 hidden_dim=nkerns[1],
                                 U=U1,
                                 W=W1,
                                 b=b1,
                                 bptt_truncate=-1)
    layer1_A2 = GRU_Matrix_Input(X=r_max_min_attention,
                                 word_dim=nkerns[0],
                                 hidden_dim=nkerns[1],
                                 U=U1,
                                 W=W1,
                                 b=b1,
                                 bptt_truncate=-1)

    vec_l = debug_print(layer1_A1.output_vector_last.reshape((1, nkerns[1])),
                        'vec_l')
    vec_r = debug_print(layer1_A2.output_vector_last.reshape((1, nkerns[1])),
                        'vec_r')

    #     sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
    #     aver_uni_l=sum_uni_l/layer0_l_input.shape[3]
    #     norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    #     sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
    #     aver_uni_r=sum_uni_r/layer0_r_input.shape[3]
    #     norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    #
    uni_cosine = cosine(vec_l, vec_r)
    #     aver_uni_cosine=cosine(aver_uni_l, aver_uni_r)
    #     uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi')
    #     '''
    #     linear=Linear(sum_uni_l, sum_uni_r)
    #     poly=Poly(sum_uni_l, sum_uni_r)
    #     sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    #     rbf=RBF(sum_uni_l, sum_uni_r)
    #     gesd=GESD(sum_uni_l, sum_uni_r)
    #     '''
    eucli_1 = 1.0 / (1.0 + EUCLID(vec_l, vec_r))  #25.2%
    #     #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r))
    #
    len_l = norm_length_l.reshape((1, 1))
    len_r = norm_length_r.reshape((1, 1))
    #
    #     '''
    #     len_l=length_l.reshape((1,1))
    #     len_r=length_r.reshape((1,1))
    #     '''
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
    layer3_input = T.concatenate(
        [
            vec_l,
            vec_r,
            uni_cosine,
            eucli_1,
            cosine_addition,
            eucli_addition,
            #                                 cosine_sent, eucli_sent,
            ibm_l.reshape((1, nkerns[0])),
            ibm_r.reshape((1, nkerns[0])),  #2*nkerns[0]+
            cosine_ibm,
            eucli_ibm,
            len_l,
            len_r,
            wmf
        ],
        axis=1)  #, layer2.output, layer1.output_cosine], axis=1)
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    layer3 = LogisticRegression(rng,
                                input=layer3_input,
                                n_in=(2 * nkerns[1] + 2) + 2 +
                                (2 * nkerns[0] + 2) + 2 + 2,
                                n_out=2)

    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg = debug_print(
        (layer3.W**2).sum() + (U**2).sum() + (W**2).sum() + (U1**2).sum() +
        (W1**2).sum(), 'L2_reg'
    )  #+(conv_W** 2).sum()+(layer1.W** 2).sum()++(embeddings**2).sum()
    diversify_reg = Diversify_Reg(layer3.W.T) + Diversify_Reg(
        U[0]) + Diversify_Reg(W[0]) + Diversify_Reg(U1[0]) + Diversify_Reg(
            W1[0]) + Diversify_Reg(U[1]) + Diversify_Reg(W[1]) + Diversify_Reg(
                U1[1]) + Diversify_Reg(W1[1]) + Diversify_Reg(
                    U[2]) + Diversify_Reg(W[2]) + Diversify_Reg(
                        U1[2]) + Diversify_Reg(W1[2])
    cost_this = debug_print(layer3.negative_log_likelihood(y),
                            'cost_this')  #+L2_weight*L2_reg
    cost = debug_print((cost_this + cost_tmp) / update_freq +
                       L2_weight * L2_reg + Div_reg * diversify_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')

    test_model = theano.function(
        [index], [layer3.prop_for_posi, layer3_input, y],
        givens={
            x_index_l: indices_test_l[index:index + batch_size],
            x_index_r: indices_test_r[index:index + batch_size],
            y: testY[index:index + batch_size],
            left_l: testLeftPad_l[index],
            right_l: testRightPad_l[index],
            left_r: testLeftPad_r[index],
            right_r: testRightPad_r[index],
            length_l: testLengths_l[index],
            length_r: testLengths_r[index],
            norm_length_l: normalized_test_length_l[index],
            norm_length_r: normalized_test_length_r[index],
            mts: mt_test[index:index + batch_size],
            wmf: wm_test[index:index + batch_size]
        },
        on_unused_input='ignore')

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer3.params + layer1_para + layer0_para  #+[embeddings]# + layer1.params

    #     params_conv = [conv_W, conv_b]

    #     accumulator=[]
    #     for para_i in params:
    #         eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
    #         accumulator.append(theano.shared(eps_p, borrow=True))
    #
    #     # create a list of gradients for all model parameters
    #     grads = T.grad(cost, params)
    #
    #     updates = []
    #     for param_i, grad_i, acc_i in zip(params, grads, accumulator):
    #         grad_i=debug_print(grad_i,'grad_i')
    #         acc = acc_i + T.sqr(grad_i)
    #         updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
    #         updates.append((acc_i, acc))

    def Adam(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
        updates = []
        grads = T.grad(cost, params)
        i = theano.shared(numpy.float64(0.))
        i_t = i + 1.
        fix1 = 1. - (1. - b1)**i_t
        fix2 = 1. - (1. - b2)**i_t
        lr_t = lr * (T.sqrt(fix2) / fix1)
        for p, g in zip(params, grads):
            m = theano.shared(p.get_value() * 0.)
            v = theano.shared(p.get_value() * 0.)
            m_t = (b1 * g) + ((1. - b1) * m)
            v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
            g_t = m_t / (T.sqrt(v_t) + e)
            p_t = p - (lr_t * g_t)
            updates.append((m, m_t))
            updates.append((v, v_t))
            updates.append((p, p_t))
        updates.append((i, i_t))
        return updates
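
    # Note on hyper-parameters: this Adam variant expresses the decay rates as
    # (1 - b1) and (1 - b2), so b1=0.1 and b2=0.001 correspond to the usual
    # beta1=0.9 and beta2=0.999; fix1/fix2 fold the bias correction into lr_t.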

    updates = Adam(cost=cost, params=params, lr=learning_rate)

    train_model = theano.function(
        [index, cost_tmp],
        cost,
        updates=updates,
        givens={
            x_index_l: indices_train_l[index:index + batch_size],
            x_index_r: indices_train_r[index:index + batch_size],
            y: trainY[index:index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index:index + batch_size],
            wmf: wm_train[index:index + batch_size]
        },
        on_unused_input='ignore')

    train_model_predict = theano.function(
        [index], [cost_this, layer3.errors(y), layer3_input, y],
        givens={
            x_index_l: indices_train_l[index:index + batch_size],
            x_index_r: indices_train_r[index:index + batch_size],
            y: trainY[index:index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index:index + batch_size],
            wmf: wm_train[index:index + batch_size]
        },
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()

    mid_time = start_time

    epoch = 0
    done_looping = False

    svm_max = 0.0
    best_epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0
        #shuffle(train_batch_start)#shuffle training data
        cost_tmp = 0.0
        for batch_start in train_batch_start:
            # iter counts how many minibatches have been processed so far, across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1

            minibatch_index = minibatch_index + 1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
            #             print batch_start
            if iter % update_freq != 0:
                cost_ij, error_ij, layer3_input, y = train_model_predict(
                    batch_start)
                #print 'layer3_input', layer3_input
                cost_tmp += cost_ij
                error_sum += error_ij
                #print 'cost_acc ',cost_acc
                #print 'cost_ij ', cost_ij
                #print 'cost_tmp before update',cost_tmp
            else:
                cost_average = train_model(batch_start, cost_tmp)
                #print 'layer3_input', layer3_input
                error_sum = 0
                cost_tmp = 0.0  #reset for the next batch
                #print 'cost_average ', cost_average
                #print 'cost_this ',cost_this
                #exit(0)
            #exit(0)
            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(
                    iter) + ' average cost: ' + str(
                        cost_average) + ' error: ' + str(
                            error_sum) + '/' + str(
                                update_freq) + ' error rate: ' + str(
                                    error_sum * 1.0 / update_freq)
            #if iter ==1:
            #    exit(0)

            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_probs = []
                test_y = []
                test_features = []
                for i in test_batch_start:
                    prob_i, layer3_input, y = test_model(i)
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_probs.append(prob_i[0][0])
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])

                MAP, MRR = compute_map_mrr(rootPath + 'test_filtered.txt',
                                           test_probs)
                #now, check MAP and MRR
                print(
                    ('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test MAP of best '
                     'model %f, MRR  %f') %
                    (epoch, minibatch_index, n_train_batches, MAP, MRR))
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')
                train_y = []
                train_features = []
                count = 0
                for batch_start in train_batch_start:
                    cost_ij, error_ij, layer3_input, y = train_model_predict(
                        batch_start)
                    train_y.append(y[0])
                    train_features.append(layer3_input[0])
                    #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n')
                    #count+=1

                #write_feature.close()

                clf = svm.SVC(C=1.0, kernel='linear')
                clf.fit(train_features, train_y)
                results_svm = clf.decision_function(test_features)
                MAP_svm, MRR_svm = compute_map_mrr(
                    rootPath + 'test_filtered.txt', results_svm)

                lr = LinearRegression().fit(train_features, train_y)
                results_lr = lr.predict(test_features)
                MAP_lr, MRR_lr = compute_map_mrr(
                    rootPath + 'test_filtered.txt', results_lr)
                print '\t\t\t\t\t\t\tSVM MAP: ', MAP_svm, ' MRR: ', MRR_svm, ' LR MAP: ', MAP_lr, ' MRR: ', MRR_lr

            if patience <= iter:
                done_looping = True
                break
        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
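The attentive-pooling block above boils down to a softmax over the row/column maxima of a pairwise similarity matrix, followed by a weighted sum of hidden states. A toy numpy sketch of that step (the real similarity comes from compute_simi_feature_matrix_with_matrix, which is not shown here, so a plain dot product stands in for it):

import numpy as np

def softmax(v):
    e = np.exp(v - v.max())
    return e / e.sum()

d, m, n = 4, 5, 6                 # hidden size, left length, right length
H_l = np.random.rand(d, m)        # left hidden states, one column per position
H_r = np.random.rand(d, n)        # right hidden states
S = H_l.T.dot(H_r)                # (m, n) similarity matrix (stand-in)
a_l = softmax(S.max(axis=1))      # attention over left positions
a_r = softmax(S.max(axis=0))      # attention over right positions
ibm_l = H_l.dot(a_l)              # attended left representation, shape (d,)
ibm_r = H_r.dot(a_r)              # attended right representation, shape (d,)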
Example #38
0
    def create_objectives(self, deterministic=False):
        # load network input
        X = self.inputs[0]
        x = X.flatten(2)

        # duplicate entries to take into account multiple mc samples
        n_sam = self.n_sample
        n_out = x.shape[1]
        x = x.dimshuffle(0, 'x', 1).repeat(n_sam, axis=1).reshape((-1, n_out))

        # load network
        l_px_mu, l_px_logsigma, l_pa_mu, l_pa_logsigma, \
        l_qz_mu, l_qz_logsigma, l_qa_mu, l_qa_logsigma, \
        l_qa, l_qz = self.network
        l_qa_in, l_px_in = self.input_layers

        # load network output
        qz_mu, qz_logsigma, qa_mu, qa_logsigma, a, z \
            = lasagne.layers.get_output(
                [l_qz_mu, l_qz_logsigma, l_qa_mu, l_qa_logsigma, l_qa, l_qz],
                deterministic=deterministic,
            )
        pa_mu, pa_logsigma = lasagne.layers.get_output(
            [l_pa_mu, l_pa_logsigma],
            {l_px_in: z},
            deterministic=deterministic,
        )

        if self.model == 'bernoulli':
            px_mu = lasagne.layers.get_output(l_px_mu, {l_px_in: z},
                                              deterministic=deterministic)
        elif self.model == 'gaussian':
            px_mu, px_logsigma = lasagne.layers.get_output(
                [l_px_mu, l_px_logsigma],
                {l_px_in: z},
                deterministic=deterministic,
            )

        # entropy term
        log_qa_given_x = log_normal2(a, qa_mu, qa_logsigma).sum(axis=1)
        log_qz_given_ax = log_normal2(z, qz_mu, qz_logsigma).sum(axis=1)
        log_qza_given_x = log_qz_given_ax + log_qa_given_x

        # log-probability term
        z_prior_sigma = T.cast(T.ones_like(qz_logsigma),
                               dtype=theano.config.floatX)
        z_prior_mu = T.cast(T.zeros_like(qz_mu), dtype=theano.config.floatX)
        log_pz = log_normal(z, z_prior_mu, z_prior_sigma).sum(axis=1)
        log_pa_given_z = log_normal2(a, pa_mu, pa_logsigma).sum(axis=1)

        if self.model == 'bernoulli':
            log_px_given_z = log_bernoulli(x, px_mu).sum(axis=1)
        elif self.model == 'gaussian':
            log_px_given_z = log_normal2(x, px_mu, px_logsigma).sum(axis=1)

        log_paxz = log_pa_given_z + log_px_given_z + log_pz

        # compute the evidence lower bound
        elbo = T.mean(log_paxz - log_qza_given_x)

        # we don't use a separate accuracy metric right now
        return -elbo, T.mean(qz_logsigma)
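In symbols, the quantity whose negation is returned is the Monte-Carlo ELBO estimate, averaged over the n_sample copies created at the top of the method (reading log_normal, log_normal2 and log_bernoulli as the corresponding log-densities):

\[
\mathcal{L}(x) = \mathbb{E}_{q(a,z\mid x)}\left[\log p(x\mid z) + \log p(a\mid z) + \log p(z) - \log q(z\mid a,x) - \log q(a\mid x)\right]
\]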
Example #39
0
discriminator_params = {}
generator_params = {}

random_seed = 1234
rng = np.random.RandomState(random_seed)
srng = theano.tensor.shared_randomstreams.RandomStreams(rng.randint(999999))

#using 400/1200/10
num_hidden_discriminator = 400
num_hidden_generator = 1200
var_dimensionality = 200

#using 0.01
scale_disc = 0.05
scale_gen = 0.05

castx = lambda x: T.cast(x, theano.config.floatX)

discriminator_params["W1_d"] = theano.shared(np.asarray(1.0 * np.random.uniform(-1.0 * scale_disc, 1.0 * scale_disc, (1, num_hidden_discriminator * 5)), dtype = theano.config.floatX))
discriminator_params["b1_d"] = theano.shared(np.asarray(0.0 + 0.0 * np.random.normal(0, 1, (5 * num_hidden_discriminator,)), dtype = theano.config.floatX))

discriminator_params["W2_d"] = theano.shared(np.asarray(1.0 * np.random.uniform(-1.0 * scale_disc, 1.0 * scale_disc, (num_hidden_discriminator, num_hidden_discriminator * 5)), dtype = theano.config.floatX))
discriminator_params["b2_d"] = theano.shared(np.asarray(0.0 + 0.0 * np.random.normal(0, 0.1, (5 * num_hidden_discriminator,)), dtype = theano.config.floatX))

discriminator_params["W3_d"] = theano.shared(np.asarray(1.0 * np.random.uniform(-1.0 * scale_disc, 1.0 * scale_disc, (num_hidden_discriminator, 1)), dtype = theano.config.floatX))
discriminator_params["b3_d"] = theano.shared(np.asarray(0.0 * np.random.normal(0, 0.1, (1,)), dtype = theano.config.floatX))

generator_params["W1_g"] = theano.shared(np.asarray(1.0 * np.random.uniform(-1.0 * scale_gen, 1.0 * scale_gen, (var_dimensionality, num_hidden_generator)), dtype = theano.config.floatX), name = "W1_g")
generator_params["b1_g"] = theano.shared(np.asarray(0.0 + 0.0 * np.random.normal(0, 1, (num_hidden_generator,)), dtype = theano.config.floatX))

generator_params["W2_g"] = theano.shared(np.asarray(1.0 * np.random.uniform(-1.0 * scale_gen, 1.0 * scale_gen, (num_hidden_generator, num_hidden_generator)), dtype = theano.config.floatX))
generator_params["b2_g"] = theano.shared(np.asarray(0.0 + 0.0 * np.random.normal(0, 1, (num_hidden_generator,)), dtype = theano.config.floatX))
Example #40
0
    def recon_err_(self, v_in):
        return T.sum((self.recon_(v_in) - v_in)**2) / T.cast(v_in.shape[0], fx)
Example #41
0
    def get_cost_updates(self, lr=0.1, persistent=None, k=1):
        """This functions implements one step of CD-k or PCD-k

        :param lr: learning rate used to train the RBM

        :param persistent: None for CD. For PCD, shared variable
            containing old state of Gibbs chain. This must be a shared
            variable of size (batch size, number of hidden units).

        :param k: number of Gibbs steps to do in CD-k/PCD-k

        Returns a proxy for the cost and the updates dictionary. The
        dictionary contains the update rules for weights and biases but
        also an update of the shared variable used to store the persistent
        chain, if one is used.

        """

        # compute positive phase
        pre_sigmoid_ph, ph_mean, ph_sample = self.sample_h_given_v(self.input)

        # decide how to initialize persistent chain:
        # for CD, we use the newly generated hidden sample
        # for PCD, we initialize from the old state of the chain
        if persistent is None:
            chain_start = ph_sample
        else:
            chain_start = persistent
        # end-snippet-2
        # perform actual negative phase
        # in order to implement CD-k/PCD-k we need to scan over the
        # function that implements one gibbs step k times.
        # Read Theano tutorial on scan for more information :
        # http://deeplearning.net/software/theano/library/scan.html
        # the scan will return the entire Gibbs chain
        ([
            pre_sigmoid_nvs, nv_means, nv_samples, pre_sigmoid_nhs, nh_means,
            nh_samples
        ], updates) = theano.scan(
            self.gibbs_hvh,
            # the None are place holders, saying that
            # chain_start is the initial state corresponding to the
            # 6th output
            outputs_info=[None, None, None, None, None, chain_start],
            n_steps=k,
            name="gibbs_hvh")
        # start-snippet-3
        # determine gradients on RBM parameters
        # note that we only need the sample at the end of the chain
        chain_end = nv_samples[-1]

        cost = T.mean(self.free_energy(self.input)) - T.mean(
            self.free_energy(chain_end))
        # We must not compute the gradient through the gibbs sampling
        gparams = T.grad(cost, self.params, consider_constant=[chain_end])
        # end-snippet-3 start-snippet-4
        # constructs the update dictionary
        for gparam, param in zip(gparams, self.params):
            # make sure that the learning rate is of the right dtype
            updates[param] = param - gparam * T.cast(
                lr, dtype=theano.config.floatX)
        if persistent:
            # Note that this works only if persistent is a shared variable
            updates[persistent] = nh_samples[-1]
            # pseudo-likelihood is a better proxy for PCD
            monitoring_cost = self.get_pseudo_likelihood_cost(updates)
        else:
            # reconstruction cross-entropy is a better proxy for CD
            monitoring_cost = self.get_reconstruction_cost(
                updates, pre_sigmoid_nvs[-1])

        return monitoring_cost, updates
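A usage sketch for the method above, following the Theano RBM tutorial this code mirrors; rbm, train_set_x, batch_size, and rbm.n_hidden are assumed names that are not defined in the snippet.

# Sketch: PCD-15 training function, assuming `rbm.input` is the symbolic
# matrix the RBM was constructed with and `train_set_x` is a shared dataset.
persistent_chain = theano.shared(
    numpy.zeros((batch_size, rbm.n_hidden), dtype=theano.config.floatX),
    borrow=True)
cost, updates = rbm.get_cost_updates(lr=0.1, persistent=persistent_chain, k=15)

index = T.lscalar('index')
train_rbm = theano.function(
    [index], cost, updates=updates,
    givens={rbm.input: train_set_x[index * batch_size:(index + 1) * batch_size]})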
Example #42
0
def train_conv_net(datasets,
                   U,
                   lr_decay=0.95,
                   img_w=300,
                   filter_hs=[3, 4, 5],
                   conv_non_linear="relu",
                   hidden_units=[100, 3],
                   shuffle_batch=True,
                   n_epochs=25,
                   sqr_norm_lim=9,
                   non_static=True,
                   batch_size=50,
                   activations=[Iden],
                   dropout_rate=[0.5]):
    """
    Train a simple conv net
    img_h = sentence length (padded where necessary)
    img_w = word vector length (300 for word2vec)
    filter_hs = filter window sizes    
    hidden_units = [x,y] x is the number of feature maps (per filter window), and y is the penultimate layer
    sqr_norm_lim = s^2 in the paper
    lr_decay = adadelta decay parameter
    """
    rng = np.random.RandomState(3435)
    img_h = len(datasets[0][0]) - 1
    filter_w = img_w
    feature_maps = hidden_units[0]
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))
    parameters = [("image shape", img_h, img_w),
                  ("filter shape", filter_shapes),
                  ("hidden_units", hidden_units), ("dropout", dropout_rate),
                  ("batch_size", batch_size), ("non_static", non_static),
                  ("learn_decay", lr_decay),
                  ("conv_non_linear", conv_non_linear),
                  ("non_static", non_static), ("sqr_norm_lim", sqr_norm_lim),
                  ("shuffle_batch", shuffle_batch)]
    print parameters

    #define model architecture
    index = T.lscalar()
    x = T.matrix('x')
    y = T.ivector('y')
    Words = theano.shared(value=U, name="Words")
    zero_vec_tensor = T.vector()
    zero_vec = np.zeros(img_w)
    set_zero = theano.function([zero_vec_tensor],
                               updates=[
                                   (Words,
                                    T.set_subtensor(Words[0, :],
                                                    zero_vec_tensor))
                               ],
                               allow_input_downcast=True)
    layer0_input = Words[T.cast(x.flatten(), dtype="int32")].reshape(
        (x.shape[0], 1, x.shape[1], Words.shape[1]))
    conv_layers = []
    layer1_inputs = []
    print 'starting loop'
    for i in xrange(len(filter_hs)):
        filter_shape = filter_shapes[i]
        pool_size = pool_sizes[i]
        conv_layer = LeNetConvPoolLayer(rng,
                                        input=layer0_input,
                                        image_shape=(batch_size, 1, img_h,
                                                     img_w),
                                        filter_shape=filter_shape,
                                        poolsize=pool_size,
                                        non_linear=conv_non_linear)
        layer1_input = conv_layer.output.flatten(2)
        conv_layers.append(conv_layer)
        layer1_inputs.append(layer1_input)
    layer1_input = T.concatenate(layer1_inputs, 1)
    hidden_units[0] = feature_maps * len(filter_hs)
    classifier = MLPDropout(rng,
                            input=layer1_input,
                            layer_sizes=hidden_units,
                            activations=activations,
                            dropout_rates=dropout_rate)

    print 'defining params'
    #define parameters of the model and update functions using adadelta
    params = classifier.params
    for conv_layer in conv_layers:
        params += conv_layer.params
    if non_static:
        #if word vectors are allowed to change, add them as model parameters
        params += [Words]
    cost = classifier.negative_log_likelihood(y)
    dropout_cost = classifier.dropout_negative_log_likelihood(y)
    grad_updates = sgd_updates_adadelta(params, dropout_cost, lr_decay, 1e-6,
                                        sqr_norm_lim)

    #shuffle dataset and assign to mini-batches. if dataset size is not a multiple of the batch size,
    #replicate extra data (at random)
    np.random.seed(3435)
    if datasets[0].shape[0] % batch_size > 0:
        extra_data_num = batch_size - datasets[0].shape[0] % batch_size
        train_set = np.random.permutation(datasets[0])
        extra_data = train_set[:extra_data_num]
        new_data = np.append(datasets[0], extra_data, axis=0)
    else:
        new_data = datasets[0]
    new_data = np.random.permutation(new_data)
    n_batches = new_data.shape[0] / batch_size
    n_train_batches = int(np.round(n_batches * 0.9))
    #divide train set into train/val sets
    test_set_x = datasets[1][:, :img_h]
    test_set_y = np.asarray(datasets[1][:, -1], "int32")
    train_set = new_data[:n_train_batches * batch_size, :]
    val_set = new_data[n_train_batches * batch_size:, :]
    train_set_x, train_set_y = shared_dataset(
        (train_set[:, :img_h], train_set[:, -1]))
    val_set_x, val_set_y = shared_dataset((val_set[:, :img_h], val_set[:, -1]))
    n_val_batches = n_batches - n_train_batches
    val_model = theano.function(
        [index],
        classifier.errors(y),
        givens={
            x: val_set_x[index * batch_size:(index + 1) * batch_size],
            y: val_set_y[index * batch_size:(index + 1) * batch_size]
        },
        allow_input_downcast=True)

    #compile theano functions to get train/val/test errors
    test_model = theano.function(
        [index],
        classifier.errors(y),
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        },
        allow_input_downcast=True)
    train_model = theano.function(
        [index],
        cost,
        updates=grad_updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        },
        allow_input_downcast=True)
    test_pred_layers = []
    test_size = test_set_x.shape[0]
    test_layer0_input = Words[T.cast(x.flatten(), dtype="int32")].reshape(
        (test_size, 1, img_h, Words.shape[1]))
    for conv_layer in conv_layers:
        test_layer0_output = conv_layer.predict(test_layer0_input, test_size)
        test_pred_layers.append(test_layer0_output.flatten(2))
    test_layer1_input = T.concatenate(test_pred_layers, 1)
    test_y_pred = classifier.predict(test_layer1_input)

    test_error = T.mean(T.neq(test_y_pred, y))
    test_model_all = theano.function([x, y],
                                     test_error,
                                     allow_input_downcast=True)

    #start training over mini-batches

    print 'sizes: '
    print 'test: '
    print test_size
    print '... training'
    print 'n_train_batches: ' + str(n_train_batches)
    epoch = 0
    best_val_perf = 0
    val_perf = 0
    test_perf = 0
    cost_epoch = 0
    while (epoch < n_epochs):
        print 'epoch: ' + str(epoch)
        start_time = time.time()
        epoch = epoch + 1
        if shuffle_batch:
            for minibatch_index in np.random.permutation(
                    range(n_train_batches)):
                if minibatch_index >= n_train_batches: minibatch_index -= 1
                print 'if: minibatch_index: ' + str(minibatch_index)
                cost_epoch = train_model(minibatch_index)
                set_zero(zero_vec)
        else:
            for minibatch_index in xrange(n_train_batches):
                if minibatch_index >= n_train_batches: minibatch_index -= 1
                print 'else: minibatch_index: ' + str(minibatch_index)

                cost_epoch = train_model(minibatch_index)
                set_zero(zero_vec)
        train_losses = [test_model(i) for i in xrange(n_train_batches)]
        train_perf = 1 - np.mean(train_losses)
        val_losses = [val_model(i) for i in xrange(n_val_batches)]
        val_perf = 1 - np.mean(val_losses)
        print(
            'epoch: %i, training time: %.2f secs, train perf: %.2f %%, val perf: %.2f %%'
            % (epoch, time.time() - start_time, train_perf * 100.,
               val_perf * 100.))
        if val_perf >= best_val_perf:
            best_val_perf = val_perf
            test_loss = test_model_all(test_set_x, test_set_y)
            test_perf = 1 - test_loss
    return test_perf
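A hypothetical invocation of train_conv_net; `datasets` (word-index matrices with the label in the last column) and the word2vec embedding matrix `U` must be prepared by the project's data-loading code, which is not shown here, and the binary output size is an assumption.

perf = train_conv_net(datasets, U,
                      lr_decay=0.95,
                      filter_hs=[3, 4, 5],
                      conv_non_linear="relu",
                      hidden_units=[100, 2],  # 2-class task assumed
                      n_epochs=25,
                      batch_size=50,
                      dropout_rate=[0.5])
print "test perf: " + str(perf)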
Example #43
0
def run_cnn(exp_name,
            dataset,
            embedding,
            log_fn,
            perf_fn,
            emb_dm=100,
            batch_size=100,
            filter_hs=[1, 2, 3],
            hidden_units=[200, 100, 11],
            dropout_rate=0.5,
            shuffle_batch=True,
            n_epochs=300,
            lr_decay=0.95,
            activation=ReLU,
            sqr_norm_lim=9,
            non_static=True):
    """
    Train and Evaluate CNN event encoder model
    :dataset: list containing three elements[(train_x, train_y), 
            (valid_x, valid_y), (test_x, test_y)]
    :embedding: word embedding with shape (|V| * emb_dm)
    :filter_hs: filter height for each parallel cnn layer
    :dropout_rate: dropout rate for full connected layers
    :n_epochs: the max number of iterations
    
    """
    start_time = timeit.default_timer()
    rng = np.random.RandomState(1234)

    input_height = len(dataset[0][0][0][0])
    num_sens = len(dataset[0][0][0])
    print "--input height ", input_height
    input_width = emb_dm
    num_maps = hidden_units[0]

    ###################
    # start snippet 1 #
    ###################
    print "start to construct the model ...."
    x = T.tensor3("x")
    y = T.matrix("y")

    words = shared(value=np.asarray(embedding, dtype=theano.config.floatX),
                   name="embedding",
                   borrow=True)

    # define function to keep padding vector as zero
    zero_vector_tensor = T.vector()
    zero_vec = np.zeros(input_width, dtype=theano.config.floatX)
    set_zero = function([zero_vector_tensor],
                        updates=[(words,
                                  T.set_subtensor(words[0, :],
                                                  zero_vector_tensor))])

    layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape(
        (x.shape[0] * x.shape[1], 1, x.shape[2], emb_dm))

    conv_layers = []
    layer1_inputs = []

    for i in xrange(len(filter_hs)):
        filter_shape = (num_maps, 1, filter_hs[i], emb_dm)
        pool_size = (input_height - filter_hs[i] + 1, 1)
        conv_layer = nn.ConvPoolLayer(rng,
                                      input=layer0_input,
                                      input_shape=None,
                                      filter_shape=filter_shape,
                                      pool_size=pool_size,
                                      activation=activation)
        sen_vecs = conv_layer.output.reshape(
            (x.shape[0], x.shape[1], num_maps))
        sen_vecs = sen_vecs.dimshuffle(0, 2, 1)
        doc_vec = T.sum(sen_vecs, axis=2).flatten(2)
        layer1_inputs.append(doc_vec)
        conv_layers.append(conv_layer)

    layer1_input = T.concatenate(layer1_inputs, 1)

    ##############
    # Task pop#
    ##############
    print "Construct classifier ...."
    hidden_units[0] = num_maps * len(filter_hs)
    pop_factor = nn.MLDropout(
        rng,
        input=layer1_input,
        layer_sizes=hidden_units,
        dropout_rates=[dropout_rate for i in range(len(hidden_units) - 1)],
        activations=[activation for i in range(len(hidden_units) - 1)])
    pop_factor_output = pop_factor.output.dimshuffle(0, 1, 'x')
    pop_factor_dropout_output = pop_factor.dropout_output.dimshuffle(0, 1, 'x')

    #######################
    # Task Type #####
    #######################
    type_hidden_units = [num for num in hidden_units]
    type_hidden_units[-1] = 5
    type_factor = nn.MLDropout(
        rng,
        input=layer1_input,
        layer_sizes=type_hidden_units,
        dropout_rates=[
            dropout_rate for i in range(len(type_hidden_units) - 1)
        ],
        activations=[activation for i in range(len(type_hidden_units) - 1)])
    type_factor_output = type_factor.output.dimshuffle(0, 'x', 1)
    type_factor_dropout_output = type_factor.dropout_output.dimshuffle(
        0, 'x', 1)

    ######################
    ## Joint Y matrix ###
    #####################
    # construct V matrix to model pop type dependency
    V_value = np.random.random((hidden_units[-1], type_hidden_units[-1]))
    V = theano.shared(value=np.asarray(V_value, dtype=theano.config.floatX),
                      name="V",
                      borrow=True)

    # compute the joint probability
    joint_act = T.batched_dot(pop_factor_output, type_factor_output) + V
    joint_act_dropout = T.batched_dot(pop_factor_dropout_output,
                                      type_factor_dropout_output) + V

    joint_probs = T.nnet.softmax(joint_act.flatten(2))
    joint_probs_dropout = T.nnet.softmax(joint_act_dropout.flatten(2))

    neg_likelihood = -T.mean(T.log(T.sum(joint_probs * y, axis=1)))
    neg_likelihood_dropout = -T.mean(
        T.log(T.sum(joint_probs_dropout * y, axis=1)))

    joint_preds = T.argmax(joint_probs, axis=1)
    pop_preds = joint_preds // type_hidden_units[-1]
    type_preds = joint_preds % type_hidden_units[-1]

    y_index = T.argmax(y, axis=1)
    pop_y = y_index // type_hidden_units[-1]
    type_y = y_index % type_hidden_units[-1]

    pop_error = T.mean(T.neq(pop_preds, pop_y))
    type_error = T.mean(T.neq(type_preds, type_y))

    params = pop_factor.params
    params += type_factor.params
    params.append(V)

    for conv_layer in conv_layers:
        params += conv_layer.params

    if non_static:
        params.append(words)

    grad_updates = sgd_updates_adadelta(params, neg_likelihood_dropout,
                                        lr_decay, 1e-6, sqr_norm_lim)

    #####################
    # Construct Dataset #
    #####################
    print "Copy data to GPU and constrct train/valid/test func"
    np.random.seed(1234)

    train_x, train_y = shared_dataset(dataset[0])
    test_x, test_y = shared_dataset(dataset[1])

    n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size))
    n_test_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size))

    #####################
    # Train model func #
    #####################
    index = T.iscalar()
    train_func = function(
        [index],
        neg_likelihood_dropout,
        updates=grad_updates,
        givens={
            x: train_x[index * batch_size:(index + 1) * batch_size],
            y: train_y[index * batch_size:(index + 1) * batch_size]
        })

    test_pred = function(
        [index], [pop_error, type_error],
        givens={
            x: test_x[index * batch_size:(index + 1) * batch_size],
            y: test_y[index * batch_size:(index + 1) * batch_size]
        })

    # apply early stop strategy
    patience = 100
    patience_increase = 2
    improvement_threshold = 1.005

    n_test = len(dataset[1][0])

    epoch = 0
    best_params = None
    best_validation_score = 0.
    test_perf = 0

    done_loop = False

    log_file = open(log_fn, 'a')

    while (epoch < n_epochs) and not done_loop:
        start_time = timeit.default_timer()
        epoch += 1
        costs = []
        for minibatch_index in np.random.permutation(range(n_train_batches)):
            cost_epoch = train_func(minibatch_index)
            costs.append(cost_epoch)
            set_zero(zero_vec)

        if epoch % 5 == 0:
            # do test
            test_pop_errors = []
            test_type_errors = []
            for i in xrange(n_test_batches):
                test_pop_error, test_type_error = test_pred(i)
                test_pop_errors.append(test_pop_error)
                test_type_errors.append(test_type_error)

            test_pop_score = 1 - np.mean(test_pop_errors)
            test_type_score = 1 - np.mean(test_type_errors)

            message = "Epoch %d test pop perf %f, type perf %f" % (
                epoch, test_pop_score, test_type_score)
            print message
            log_file.write(message + "\n")
            log_file.flush()

        end_time = timeit.default_timer()
        print "Finish one iteration using %f m" % (
            (end_time - start_time) / 60.)

    log_file.flush()
    log_file.close()
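For reference, the joint softmax above scores every (pop, type) pair, and the flat argmax is decoded with integer division and modulo. A small NumPy sketch of that decoding, with sizes assumed to match hidden_units[-1] = 11 pop classes and 5 type classes; the indices are illustrative only.

import numpy as np

n_pop, n_type = 11, 5                  # assumed sizes from the model above
joint_idx = np.array([0, 7, 23, 54])   # flat indices into the 11 x 5 grid
pop_label = joint_idx // n_type        # -> [0, 1, 4, 10]
type_label = joint_idx % n_type        # -> [0, 2, 3, 4]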
Example #44
0
def shared_dataset(data_xy):
    data_x, data_y = data_xy
    shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX))
    shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX))

    return shared_x, T.cast(shared_y, 'int32')
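Because GPU shared variables must be floats, the labels are stored as floatX and only cast to int32 symbolically; the cast result can still be sliced inside givens. A sketch of the usual call site, where x, y, cost, updates, batch_size, and the input arrays are assumed to be defined as in the surrounding examples.

train_set_x, train_set_y = shared_dataset((X_train, y_train))  # hypothetical arrays
index = T.lscalar()
train_model = theano.function(
    [index], cost, updates=updates,
    givens={x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]})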
Example #45
0
File: utils.py Project: ehfo0/DVAE
def log_likelihood_samplesImean_sigma2(samples, mean, logvar):
    return c*T.cast(samples.shape[2], 'float32') /2  - \
               T.sum(T.sqr((samples-mean)/T.exp(logvar)) + 2*logvar, axis=2) / 2
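For reference, assuming the module-level constant c equals -log(2*pi) and that logvar here actually stores log sigma (consistent with the division by T.exp(logvar) and the 2*logvar term), the expression above is the diagonal-Gaussian log-density summed over the last axis:

\log \mathcal{N}(x;\mu,\sigma) = \frac{c\,D}{2} - \sum_{d=1}^{D}\left(\frac{(x_d-\mu_d)^2}{2\sigma_d^2} + \log\sigma_d\right), \qquad c = -\log 2\pi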
Example #46
0
def train(
        dim_word=100,
        dim_word_src=200,
        enc_dim=1000,
        dec_dim=1000,  # the number of LSTM units
        patience=-1,  # early stopping patience
        max_epochs=5000,
        finish_after=-1,  # finish after this many updates
        decay_c=0.,  # L2 regularization penalty
        alpha_c=0.,  # alignment regularization
        clip_c=-1.,  # gradient clipping threshold
        lrate=0.01,  # learning rate
        n_words_src=100000,  # source vocabulary size
        n_words=100000,  # target vocabulary size
        maxlen=1000,  # maximum length of the description
        maxlen_trg=1000,  # maximum length of the description
        maxlen_sample=1000,
        optimizer='rmsprop',
        batch_size=[1, 2, 3, 4],
        valid_batch_size=16,
        sort_size=20,
        save_path=None,
        save_file_name='model',
        save_best_models=0,
        dispFreq=100,
        validFreq=100,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        sampleFreq=-1,
        pbatchFreq=-1,
        verboseFreq=10000,
        datasets=[
            'data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'
        ],
        valid_datasets=[
            '../data/dev/newstest2011.en.tok',
            '../data/dev/newstest2011.fr.tok'
        ],
        dictionaries=[
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'
        ],
        source_word_level=0,
        target_word_level=0,
        use_dropout=False,
        re_load=False,
        re_load_old_setting=False,
        uidx=None,
        eidx=None,
        cidx=None,
        layers=None,
        save_every_saveFreq=0,
        save_burn_in=20000,
        use_bpe=0,
        init_params=None,
        build_model=None,
        build_sampler=None,
        gen_sample=None,
        **kwargs):

    # Model options
    model_options = locals().copy()
    del model_options['init_params']
    del model_options['build_model']
    del model_options['build_sampler']
    del model_options['gen_sample']

    # load dictionaries and invert them
    # dictionaries[0] : src
    # dictionaries[1] : trg
    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    # ii, dd : 0 = source, 1 = target
    for ii, dd in enumerate(dictionaries):
        with open(dd, 'rb') as f:
            worddicts[ii] = cPickle.load(f)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    print 'Building model'
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    file_name = '%s%s.npz' % (save_path, save_file_name)
    best_file_name = '%s%s.best.npz' % (save_path, save_file_name)
    opt_file_name = '%s%s%s.npz' % (save_path, save_file_name, '.grads')
    best_opt_file_name = '%s%s%s.best.npz' % (save_path, save_file_name,
                                              '.grads')
    model_name = '%s%s.pkl' % (save_path, save_file_name)
    params = init_params(model_options)
    cPickle.dump(model_options, open(model_name, 'wb'))
    history_errs = [[], [], [], []]

    # reload options
    # reload : False
    if re_load and os.path.exists(file_name):
        print 'You are reloading your experiment.. do not panic dude..'
        if re_load_old_setting:
            with open(model_name, 'rb') as f:
                model_options = cPickle.load(f)
        params = load_params(file_name, params)
        # reload history
        model = numpy.load(file_name)
        history_errs = list(lst.tolist() for lst in model['history_errs'])
        if uidx is None:
            uidx = model['uidx']
        if eidx is None:
            eidx = model['eidx']
        if cidx is None:
            try:
                cidx = model['cidx']
            except:
                cidx = 0
    else:
        if uidx is None:
            uidx = 0
        if eidx is None:
            eidx = 0
        if cidx is None:
            cidx = 0

    print 'Loading data'

    train = MultiTextIterator(source=datasets[0],
                              target=datasets[1],
                              source_dict=dictionaries[0],
                              target_dict=dictionaries[1],
                              n_words_source=n_words_src,
                              n_words_target=n_words,
                              source_word_level=source_word_level,
                              target_word_level=target_word_level,
                              batch_size=batch_size,
                              sort_size=sort_size)

    valid = [
        TextIterator(source=valid_dataset[0],
                     target=valid_dataset[1],
                     source_dict=dictionaries[0],
                     target_dict=dictionaries[1],
                     n_words_source=n_words_src,
                     n_words_target=n_words,
                     source_word_level=source_word_level,
                     target_word_level=target_word_level,
                     batch_size=valid_batch_size,
                     sort_size=sort_size) for valid_dataset in valid_datasets
    ]

    # create shared variables for parameters
    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    # NOTE : this is where we build the model
    inps = [x, x_mask, y, y_mask]

    print 'Building sampler...\n',
    f_init, f_next = build_sampler(tparams, model_options, trng, use_noise)
    #print 'Done'

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    # NOTE : f_log_probs : [x, x_mask, y, y_mask], cost
    print 'Done'

    if re_load:  # NOTE : this whole thing is False
        use_noise.set_value(0.)
        valid_scores = []
        for ii, vv in enumerate(valid):

            valid_errs = pred_probs(f_log_probs,
                                    prepare_data,
                                    model_options,
                                    vv,
                                    verboseFreq=verboseFreq)
            valid_err = valid_errs.mean()

            if numpy.isnan(valid_err):
                import ipdb
                ipdb.set_trace()

            print 'Reload sanity check: Valid ', valid_err

    cost = cost.mean()

    # apply L2 regularization on weights
    # decay_c : 0
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    # alpha_c : 0
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
        cost += alpha_reg

    # after all regularizers - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    # NOTE : why is this not referenced somewhere later?
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    if clip_c > 0:
        grads, not_finite, clipped = gradient_clipping(grads, tparams, clip_c)
    else:
        not_finite = 0
        clipped = 0

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    if re_load and os.path.exists(file_name):
        if clip_c > 0:
            f_grad_shared, f_update, toptparams = eval(optimizer)(
                lr,
                tparams,
                grads,
                inps,
                cost=cost,
                not_finite=not_finite,
                clipped=clipped,
                file_name=opt_file_name)
        else:
            f_grad_shared, f_update, toptparams = eval(optimizer)(
                lr, tparams, grads, inps, cost=cost, file_name=opt_file_name)
    else:
        # re_load = False, clip_c = 1
        if clip_c > 0:
            f_grad_shared, f_update, toptparams = eval(optimizer)(
                lr,
                tparams,
                grads,
                inps,
                cost=cost,
                not_finite=not_finite,
                clipped=clipped)
        else:
            f_grad_shared, f_update, toptparams = eval(optimizer)(lr,
                                                                  tparams,
                                                                  grads,
                                                                  inps,
                                                                  cost=cost)

            # f_grad_shared = theano.function(inp, [cost, not_finite, clipped], updates=gsup, profile=profile)

            # f_update = theano.function([lr], [], updates=updates,
            #                   on_unused_input='ignore', profile=profile)
            # toptparams

    print 'Done'

    print 'Optimization'
    best_p = None
    bad_counter = 0

    # will never be true
    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    # Training loop
    ud_start = time.time()
    estop = False

    if re_load:
        # IndexError: index 14 is out of bounds for axis 1 with size 13
        print "Checkpointed minibatch number: %d" % cidx
        for cc in xrange(cidx):
            if numpy.mod(cc, 1000) == 0:
                print "Jumping [%d / %d] examples" % (cc, cidx)
            train.next()

    for epoch in xrange(max_epochs):
        time0 = time.time()
        n_samples = 0
        NaN_grad_cnt = 0
        NaN_cost_cnt = 0
        clipped_cnt = 0
        update_idx = 0
        if re_load:
            re_load = 0
        else:
            cidx = 0

        for x, y in train:
            # NOTE : x, y are [sen1, sen2, sen3 ...] where sen_i are of different length
            update_idx += 1
            cidx += 1
            uidx += 1
            use_noise.set_value(1.)

            # NOTE : n_x <= batch_size
            x, x_mask, y, y_mask, n_x = prepare_data(x,
                                                     y,
                                                     maxlen=maxlen,
                                                     maxlen_trg=maxlen_trg,
                                                     n_words_src=n_words_src,
                                                     n_words=n_words)
            n_samples += n_x

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                uidx = max(uidx, 0)
                continue

            # compute cost, grads and copy grads to shared variables

            if clip_c > 0:
                cost, not_finite, clipped = f_grad_shared(x, x_mask, y, y_mask)
            else:
                cost = f_grad_shared(x, x_mask, y, y_mask)

            if clipped:
                clipped_cnt += 1

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if numpy.isnan(cost) or numpy.isinf(cost):
                import ipdb
                ipdb.set_trace()
                NaN_cost_cnt += 1

            if not_finite:
                import ipdb
                ipdb.set_trace()
                NaN_grad_cnt += 1
                continue

            # do the update on parameters
            f_update(lrate)

            if numpy.isnan(cost) or numpy.isinf(cost):
                continue

            if float(NaN_grad_cnt) > max_epochs * 0.5 or float(
                    NaN_cost_cnt) > max_epochs * 0.5:
                print 'Too many NaNs, abort training'
                return 1., 1., 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                ud = time.time() - ud_start
                wps = n_samples / float(time.time() - time0)
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'NaN_in_grad', NaN_grad_cnt,\
                      'NaN_in_cost', NaN_cost_cnt, 'Gradient_clipped', clipped_cnt, 'UD ', ud, "%.2f sentence/s" % wps
                ud_start = time.time()

            if numpy.mod(uidx, pbatchFreq) == 0 and pbatchFreq != -1:
                pbatch(x, worddicts_r[0])

            # generate some samples with the model and display them
            if numpy.mod(uidx, sampleFreq) == 0 and sampleFreq != -1:

                gen_list = [
                    0, batch_size[0], batch_size[0] + batch_size[1],
                    batch_size[0] + batch_size[1] + batch_size[2]
                ]
                gen_list = [ii for ii in gen_list if ii < n_x]

                for jj in gen_list:
                    # jj = min(5, n_samples)
                    stochastic = True
                    use_noise.set_value(0.)

                    # x : maxlen X n_samples
                    sample, score = gen_sample(tparams,
                                               f_init,
                                               f_next,
                                               x[:, jj][:, None],
                                               model_options,
                                               trng=trng,
                                               k=1,
                                               maxlen=maxlen_sample,
                                               stochastic=stochastic,
                                               argmax=False)
                    print
                    print 'Source ', jj, ': ',
                    if source_word_level:
                        for vv in x[:, jj]:
                            if vv == 0:
                                break
                            if vv in worddicts_r[0]:
                                if use_bpe:
                                    print(worddicts_r[0][vv]).replace(
                                        '@@', ''),
                                else:
                                    print worddicts_r[0][vv],
                            else:
                                print 'UNK',
                        print
                    else:
                        source_ = []
                        for vv in x[:, jj]:
                            if vv == 0:
                                break
                            if vv in worddicts_r[0]:
                                source_.append(worddicts_r[0][vv])
                            else:
                                source_.append('UNK')
                        print "".join(source_)
                    print 'Truth ', jj, ' : ',
                    if target_word_level:
                        for vv in y[:, jj]:
                            if vv == 0:
                                break
                            if vv in worddicts_r[1]:
                                if use_bpe:
                                    print(worddicts_r[1][vv]).replace(
                                        '@@', ''),
                                else:
                                    print worddicts_r[1][vv],
                            else:
                                print 'UNK',
                        print
                    else:
                        truth_ = []
                        for vv in y[:, jj]:
                            if vv == 0:
                                break
                            if vv in worddicts_r[1]:
                                truth_.append(worddicts_r[1][vv])
                            else:
                                truth_.append('UNK')
                        print "".join(truth_)
                    print 'Sample ', jj, ': ',
                    if stochastic:
                        ss = sample
                    else:
                        score = score / numpy.array([len(s) for s in sample])
                        ss = sample[score.argmin()]
                    if target_word_level:
                        for vv in ss:
                            if vv == 0:
                                break
                            if vv in worddicts_r[1]:
                                if use_bpe:
                                    print(worddicts_r[1][vv]).replace(
                                        '@@', ''),
                                else:
                                    print worddicts_r[1][vv],
                            else:
                                print 'UNK',
                        print
                    else:
                        sample_ = []
                        for vv in ss:
                            if vv == 0:
                                break
                            if vv in worddicts_r[1]:
                                sample_.append(worddicts_r[1][vv])
                            else:
                                sample_.append('UNK')
                        print "".join(sample_)
                    print

            # validate model on validation set and early stop if necessary
            if numpy.mod(uidx, validFreq) == 0:
                valid_scores = []
                for ii, vv in enumerate(valid):
                    use_noise.set_value(0.)
                    # NOTE : when validation, don't pass maxlen, maxlen_trg
                    # meaning, don't limit sentence lengths...
                    # sort of makes sense i suppose?
                    valid_errs = pred_probs(
                        f_log_probs,
                        prepare_data,
                        model_options,
                        vv,
                        verboseFreq=verboseFreq,
                    )
                    valid_err = valid_errs.mean()
                    valid_scores.append(valid_err)
                    history_errs[ii].append(valid_err)

                    # patience == -1, never happens
                    if len(history_errs[ii]) > patience and valid_err >= \
                            numpy.array(history_errs[ii])[:-patience].min() and patience != -1:
                        bad_counter += 1
                        if bad_counter > patience:
                            print 'Early Stop!'
                            estop = True
                            break

                    if numpy.isnan(valid_err):
                        import ipdb
                        ipdb.set_trace()

                cnt = 0
                for ii in xrange(4):
                    if uidx == 0 or valid_scores[ii] <= numpy.array(
                            history_errs[ii]).min():
                        cnt += 1

                if len(history_errs[0]) > 1:
                    if numpy.sum(valid_scores) <= numpy.sum(
                        [aa[:-2] for aa in history_errs]):
                        less_sum = True
                    else:
                        less_sum = False
                else:
                    less_sum = True

                if cnt >= 2 and less_sum:
                    best_p = unzip(tparams)
                    best_optp = unzip(toptparams)
                    bad_counter = 0

                if saveFreq != validFreq and save_best_models:
                    numpy.savez(best_file_name,
                                history_errs=history_errs,
                                uidx=uidx,
                                eidx=eidx,
                                cidx=cidx,
                                **best_p)
                    numpy.savez(best_opt_file_name, **best_optp)

                print 'Valid : DE {}\t CS {}\t FI {}\t RU {}'.format(
                    valid_scores[0], valid_scores[1], valid_scores[2],
                    valid_scores[3])

            # save the best model so far
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',

                if not os.path.exists(save_path):
                    os.mkdir(save_path)

                params = unzip(tparams)
                optparams = unzip(toptparams)
                numpy.savez(file_name,
                            history_errs=history_errs,
                            uidx=uidx,
                            eidx=eidx,
                            cidx=cidx,
                            **params)
                numpy.savez(opt_file_name, **optparams)

                if save_every_saveFreq and (uidx >= save_burn_in):
                    this_file_name = '%s%s.%d.npz' % (save_path,
                                                      save_file_name, uidx)
                    this_opt_file_name = '%s%s%s.%d.npz' % (
                        save_path, save_file_name, '.grads', uidx)
                    numpy.savez(this_file_name,
                                history_errs=history_errs,
                                uidx=uidx,
                                eidx=eidx,
                                cidx=cidx,
                                **params)
                    numpy.savez(this_opt_file_name,
                                history_errs=history_errs,
                                uidx=uidx,
                                eidx=eidx,
                                cidx=cidx,
                                **params)
                    if best_p is not None and saveFreq != validFreq:
                        this_best_file_name = '%s%s.%d.best.npz' % (
                            save_path, save_file_name, uidx)
                        numpy.savez(this_best_file_name,
                                    history_errs=history_errs,
                                    uidx=uidx,
                                    eidx=eidx,
                                    cidx=cidx,
                                    **best_p)
                print 'Done...',
                print 'Saved to %s' % file_name

            # finish after this many updates
            if uidx >= finish_after and finish_after != -1:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples
        lang_nos = (4535523, 12122376, 1926115, 2326893)
        lang_done = [x * update_idx for x in batch_size]
        lang_rem = [x - y for x, y in zip(lang_nos, lang_done)]
        print "Remaining : DE({}), CS({}), FI({}), RU({})".format(
            lang_rem[0], lang_rem[1], lang_rem[2], lang_rem[3])
        eidx += 1

        if estop:
            break

    use_noise.set_value(0.)

    valid_scores = []
    for ii, vv in enumerate(valid):
        valid_err = pred_probs(f_log_probs, prepare_data, model_options,
                               vv).mean()
        valid_scores.append(valid_err)

    print 'Valid : DE {}\t CS {}\t FI {}\t RU {}'.format(
        valid_scores[0], valid_scores[1], valid_scores[2], valid_scores[3])

    params = unzip(tparams)
    optparams = unzip(toptparams)
    file_name = '%s%s.%d.npz' % (save_path, save_file_name, uidx)
    opt_file_name = '%s%s%s.%d.npz' % (save_path, save_file_name, '.grads',
                                       uidx)
    numpy.savez(file_name,
                history_errs=history_errs,
                uidx=uidx,
                eidx=eidx,
                cidx=cidx,
                **params)
    numpy.savez(opt_file_name, **optparams)
    if best_p is not None and saveFreq != validFreq:
        best_file_name = '%s%s.%d.best.npz' % (save_path, save_file_name, uidx)
        best_opt_file_name = '%s%s%s.%d.best.npz' % (save_path, save_file_name,
                                                     '.grads', uidx)
        numpy.savez(best_file_name,
                    history_errs=history_errs,
                    uidx=uidx,
                    eidx=eidx,
                    cidx=cidx,
                    **best_p)
        numpy.savez(best_opt_file_name, **best_optp)

    return valid_err
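The gradient_clipping helper used above is not shown in this snippet; below is a minimal sketch of a common global-norm implementation, assuming it should return rescaled gradients plus the not_finite and clipped flags consumed by the training loop, and that theano.tensor is imported as `tensor` as elsewhere in the example (tparams is accepted only to match the call site).

def gradient_clipping(grads, tparams, clip_c=1.0):
    # Rescale all gradients jointly when their global L2 norm exceeds clip_c,
    # and flag non-finite gradients so the caller can skip the update.
    g2 = 0.
    for g in grads:
        g2 += (g ** 2).sum()
    not_finite = tensor.or_(tensor.isnan(g2), tensor.isinf(g2))
    scale = tensor.switch(g2 > (clip_c ** 2), clip_c / tensor.sqrt(g2), 1.)
    clipped = g2 > (clip_c ** 2)
    return [g * scale for g in grads], not_finite, clipped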
Example #47
0
def make_training_functions(cfg, model):
    l_out = model['l_out']
    batch_index = T.iscalar('batch_index')
    # bct01
    X = T.TensorType('float32', [False] * 5)('X')
    y = T.TensorType('int32', [False] * 1)('y')
    out_shape = lasagne.layers.get_output_shape(l_out)
    #log.info('output_shape = {}'.format(out_shape))

    batch_slice = slice(batch_index * cfg['batch_size'],
                        (batch_index + 1) * cfg['batch_size'])
    out = lasagne.layers.get_output(l_out, X)
    dout = lasagne.layers.get_output(l_out, X, deterministic=True)

    params = lasagne.layers.get_all_params(l_out)
    l2_norm = lasagne.regularization.regularize_network_params(
        l_out, lasagne.regularization.l2)
    if isinstance(cfg['learning_rate'], dict):
        learning_rate = theano.shared(np.float32(cfg['learning_rate'][0]))
    else:
        learning_rate = theano.shared(np.float32(cfg['learning_rate']))

    softmax_out = T.nnet.softmax(out)
    loss = T.cast(T.mean(T.nnet.categorical_crossentropy(softmax_out, y)),
                  'float32')
    pred = T.argmax(dout, axis=1)
    error_rate = T.cast(T.mean(T.neq(pred, y)), 'float32')

    reg_loss = loss + cfg['reg'] * l2_norm
    updates = lasagne.updates.momentum(reg_loss, params, learning_rate,
                                       cfg['momentum'])

    X_shared = lasagne.utils.shared_empty(5, dtype='float32')
    y_shared = lasagne.utils.shared_empty(1, dtype='float32')

    dout_fn = theano.function([X], dout)
    pred_fn = theano.function([X], pred)

    update_iter = theano.function([batch_index],
                                  reg_loss,
                                  updates=updates,
                                  givens={
                                      X: X_shared[batch_slice],
                                      y: T.cast(y_shared[batch_slice],
                                                'int32'),
                                  })

    error_rate_fn = theano.function([batch_index],
                                    error_rate,
                                    givens={
                                        X: X_shared[batch_slice],
                                        y: T.cast(y_shared[batch_slice],
                                                  'int32'),
                                    })
    tfuncs = {
        'update_iter': update_iter,
        'error_rate': error_rate_fn,
        'dout': dout_fn,
        'pred': pred_fn,
    }
    tvars = {
        'X': X,
        'y': y,
        'X_shared': X_shared,
        'y_shared': y_shared,
        'batch_slice': batch_slice,
        'batch_index': batch_index,
        'learning_rate': learning_rate,
    }
    return tfuncs, tvars
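A hypothetical training step with the returned functions: copy one chunk of data into the shared buffers, then iterate over its mini-batch indices. X_chunk and y_chunk are placeholder arrays, not defined above.

tfuncs, tvars = make_training_functions(cfg, model)
tvars['X_shared'].set_value(X_chunk)                     # float32, bct01 layout
tvars['y_shared'].set_value(y_chunk.astype('float32'))   # cast to int32 inside givens
for bi in range(len(y_chunk) // cfg['batch_size']):
    loss = tfuncs['update_iter'](bi)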
Example #48
0
File: utils.py Project: ehfo0/DVAE
def prior_z2(samples):
    return c * T.cast(samples.shape[2], 'float32') / 2 - T.sum(T.sqr(samples),
                                                               axis=2) / 2
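Under the same assumption about the constant c as in the Gaussian case above, this is the log-density of a standard normal prior, summed over the last axis:

\log \mathcal{N}(z;0,I) = \frac{c\,D}{2} - \frac{1}{2}\sum_{d=1}^{D} z_d^2, \qquad c = -\log 2\pi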
Example #49
0
    def log_likelihood_sym(self, x_var, dist_info_vars):
        probs = dist_info_vars["prob"]
        # Assume layout is N * A
        return TT.log(TT.sum(probs * TT.cast(x_var, 'float32'), axis=-1) + TINY)
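A small NumPy sketch of the same computation: with a one-hot x_var, multiplying by the per-row probabilities and summing picks out the probability of the observed action, and TINY guards against log(0). The values below are illustrative only.

import numpy as np

TINY = 1e-8
probs = np.array([[0.1, 0.7, 0.2]])      # N x A action probabilities
x_onehot = np.array([[0., 1., 0.]])      # observed action, one-hot
loglik = np.log(np.sum(probs * x_onehot, axis=-1) + TINY)   # ~= log(0.7)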
Example #50
0
def evaluate(
        dim_word=620,  # word vector dimensionality
        dim=1000,  # the number of LSTM units
        encoder='gru',
        decoder='gru_cond',
        hiero=None,
        decay_c=0.,
        alpha_c=0.,
        diag_c=0.,
        lrate=0.01,
        n_words_src=20000,
        n_words=20000,
        maxlen=100,  # maximum length of the description
        optimizer='adadelta',
        batch_size=128,
        valid_batch_size=128,  # Validation and test batch size
        saveto='./ckt/',
        dataset='data_iterator',
        dictionary='',  # word dictionary
        dictionary_src='',  # word dictionary
        use_dropout=False,
        model=False,
        correlation_coeff=0.1,
        clip_c=1.,
        dataset_='opensubs',
        use_context=False,
        dataset_size=-1,
        perplexity=True,
        BLEU=True):

    model_options = locals().copy()
    # Reload previous saved options
    if model:
        with open('{}.npz.pkl'.format(model), 'rb') as f:
            model_options = pkl.load(f)
            for k, v in model_options.items():
                if (k == 'dim_word' or k == 'dim' or k == 'encoder'
                        or k == 'decoder' or k == 'n_words_src'
                        or k == 'n_words' or k == 'optimizer' or k == 'dataset'
                        or k == 'dictionary' or k == 'dictionary_src'
                        or k == 'dataset_' or k == 'use_context'
                        or k == 'dim_context' or k == 'dataset_size'):
                    locals()[k] = v

                if k not in locals().keys():
                    locals()[k] = v
    else:
        raise ValueError('No model specified')

    # ===================
    # LOAD DICTIONARIES
    # ===================
    if dictionary:
        with open(dictionary, 'rb') as f:
            word_dict = pkl.load(f)
    else:
        # Assume dictionary is in the same folder as data
        if dataset_ == 'opensubs':
            dictionary = './data/OpenSubsDS/source_train_dict.pkl'
        elif dataset_ == 'ubuntu':
            dictionary = './data/UbuntuDS/source_train_dict.pkl'
        else:
            raise ValueError('No dictionary specified.')

        with open(dictionary, 'rb') as f:
            word_dict = pkl.load(f)

    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk

    if dictionary_src:
        with open(dictionary_src, 'rb') as f:
            word_dict_src = pkl.load(f)
    else:
        # Assume dictionary is in the same folder as data
        if dataset_ == 'opensubs':
            dictionary_src = './data/OpenSubsDS/source_train_dict.pkl'
        elif dataset_ == 'ubuntu':
            dictionary_src = './data/UbuntuDS/source_train_dict.pkl'
        else:
            raise ValueError('No dictionary specified.')

        with open(dictionary_src, 'rb') as f:
            word_dict_src = pkl.load(f)

    word_idict_src = dict()
    for kk, vv in word_dict_src.iteritems():
        word_idict_src[vv] = kk

    # =======================
    # LOAD MODEL PARAMETERS
    # =======================

    print 'Loading data...'
    load_data, prepare_data = get_dataset(dataset)

    if dataset_ == 'opensubs':
        train, valid, test = load_data(train_batch_size=batch_size,
                                       val_batch_size=valid_batch_size,
                                       test_batch_size=valid_batch_size,
                                       use_context=use_context,
                                       dataset_size=dataset_size)
    elif dataset_ == 'ubuntu':
        train, valid, test = load_data(
            train_source_path='./data/UbuntuDS/source_train_idx',
            train_target_path='./data/UbuntuDS/target_train_idx',
            validation_source_path='./data/UbuntuDS/source_val_idx',
            validation_target_path='./data/UbuntuDS/target_val_idx',
            test_source_path='./data/UbuntuDS/source_test_idx',
            test_target_path='./data/UbuntuDS/target_test_idx',
            train_batch_size=batch_size,
            val_batch_size=valid_batch_size,
            test_batch_size=valid_batch_size,
            use_context=use_context,
            context_path={
                'train': './data/UbuntuDS/context_train_idx',
                'validation': './data/UbuntuDS/context_val_idx',
                'test': './data/UbuntuDS/context_test_idx'
            },
            dataset_size=dataset_size)

    print 'Building model...'
    params = init_params(model_options)
    # reload parameters
    if model:
        params = load_params(model, params)
    else:
        raise ValueError('No model specified')

    tparams = init_tparams(params)

    trng, use_noise, x, x_mask, y, y_mask, conv_context, conv_context_mask, opt_ret, cost = build_model(
        tparams, model_options)

    if use_context:
        inps = [x, x_mask, y, y_mask, conv_context, conv_context_mask]
    else:
        inps = [x, x_mask, y, y_mask]

    # theano.printing.debugprint(cost.mean(), file=open('cost.txt', 'w'))

    print 'Building sampler...'
    f_init, f_next = build_sampler(tparams, model_options, trng)

    # Before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'

    cost = cost.mean()

    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
        cost += alpha_reg

    history_errs = []
    # reload history
    if model and os.path.exists(model):
        history_errs = list(numpy.load(model)['history_errs'])
    best_p = None
    bad_count = 0

    # after any regularizer
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    print 'Done'

    uidx = 0
    estop = False
    save_turn = 0
    ########################
    # Main evaluation loop
    ########################
    if perplexity:
        print('Evaluating on train')
        # train_err, train_perplexity = prediction_scores(f_log_probs,
        #                                                 prepare_data,
        #                                                 model_options,
        #                                                 train)
        # print('Train Cost: {} Train Perplexity: {}'.format(train_err, train_perplexity))

        print('Evaluating on validation')
        valid_err, valid_perplexity = prediction_scores(
            f_log_probs, prepare_data, model_options, valid)
        print('Valid Cost: {} Valid Perplexity: {}'.format(
            valid_err, valid_perplexity))

        print('Evaluating on test')
        test_err, test_perplexity = prediction_scores(f_log_probs,
                                                      prepare_data,
                                                      model_options, test)
        print('Test Cost: {} Test Perplexity: {}'.format(
            test_err, test_perplexity))

    stochastic = False
    if BLEU:
        references = []
        hypotheses = []
        for x, y in valid:

            references.append([[str(i) for i in y]])

            sample, score = gen_sample(tparams,
                                       f_init,
                                       f_next,
                                       x[:, None],
                                       model_options,
                                       trng=trng,
                                       k=1,
                                       maxlen=30,
                                       stochastic=stochastic,
                                       argmax=True)

            hypotheses.append([str(i) for i in sample])

        valid_BLEU = corpus_bleu(references, hypotheses)
        print('Validation BLEU: {}'.format(valid_BLEU))

        references = []
        hypotheses = []

        for x, y, conv_context in test:

            references.append([[str(i) for i in y]])

            sample, score = gen_sample(tparams,
                                       f_init,
                                       f_next,
                                       x[:, None],
                                       model_options,
                                       trng=trng,
                                       k=1,
                                       maxlen=30,
                                       stochastic=stochastic,
                                       argmax=True)

            hypotheses.append([str(i) for i in sample])

        test_BLEU = corpus_bleu(references, hypotheses)
        print('Test BLEU: {}'.format(test_BLEU))

    for i, x in enumerate(source_utterances):
        stochastic = False
        sample, score = gen_sample(tparams,
                                   f_init,
                                   f_next,
                                   x[:, None],
                                   model_options,
                                   trng=trng,
                                   k=1,
                                   maxlen=30,
                                   stochastic=stochastic,
                                   argmax=True)

        print('Source {}: '.format(i) + print_utterance(x, word_idict))
        if stochastic:
            ss = sample
        else:
            score = score / numpy.array([len(s) for s in sample])
            ss = sample[score.argmin()]
        print('Sample {}:'.format(i) + print_utterance(ss, word_idict))
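corpus_bleu above is presumably NLTK's (an assumption); it expects one list of reference token lists per hypothesis, which matches the nested [[str(i) for i in y]] construction in the loops. A minimal sketch with made-up token strings:

from nltk.translate.bleu_score import corpus_bleu

# Two hypotheses, each with a single reference; tokens are word-index strings,
# mirroring how the evaluation loops above build them.
references = [[['12', '7', '4', '9']], [['3', '5', '1', '8']]]
hypotheses = [['12', '7', '4', '9'], ['3', '5', '1', '2']]
score = corpus_bleu(references, hypotheses)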
Example #51
0
    def setup(self,
              params,
              gparams,
              shapes=None,
              max_norm=5.0,
              lr=0.01,
              eps=1e-6,
              rho=0.95,
              method="ADADELTA",
              beta=0.0,
              count=None,
              weight_l2=0):
        # Setup only once
        assert not self.updates

        if not shapes:
            shapes = params

        if not count:
            count = T.constant(1, dtype=FLOATX)
        else:
            count = T.cast(count, FLOATX)

        gcache = [
            theano.shared(np.zeros_like(param.get_value(borrow=True),
                                        dtype=FLOATX),
                          name="gcache_%s" % param.name) for param in shapes
        ]
        gcache_mean = [g / self.batch_counter for g in gcache]

        optimize_updates = optimize_parameters(params,
                                               gcache_mean,
                                               shapes,
                                               max_norm,
                                               lr,
                                               eps,
                                               rho,
                                               method,
                                               beta,
                                               gsum_regularization=0.0001,
                                               weight_l2=weight_l2,
                                               clip=self.clip)
        self.updates.extend(optimize_updates)
        self.caches.extend(gcache)

        if self.realtime:
            # Realtime update
            needs_update = self.batch_counter >= T.constant(self.batch_size)
            update_dict = OrderedDict()
            for param, update_val in optimize_updates:
                update_dict[param] = ifelse(needs_update, update_val, param)
            for cache, g in zip(gcache, gparams):
                update_dict[cache] = ifelse(needs_update, g, cache + g)
            update_dict[self.batch_counter] = ifelse(
                needs_update, count, self.batch_counter + count)
            return update_dict.items()

        else:
            # Manual update, perhaps in the end of one iteration
            gcache_updates = [(c, c + g) for c, g in zip(gcache, gparams)] + [
                (self.batch_counter, self.batch_counter + count)
            ]
            return gcache_updates
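The realtime branch above only applies an update once `batch_counter` reaches `batch_size`; until then it keeps summing gradients into `gcache`. A minimal numpy sketch of that accumulate-then-average pattern (all names here are illustrative, not part of the class):

import numpy as np

param = np.zeros(3)
gcache = np.zeros(3)
batch_counter = 0
lr = 0.01

for g in [np.array([1.0, 2.0, 3.0]), np.array([3.0, 2.0, 1.0])]:
    gcache += g          # accumulate raw gradients across minibatches
    batch_counter += 1

param -= lr * gcache / batch_counter  # one step with the mean gradient
gcache[:] = 0.0                       # reset the cache for the next round
batch_counter = 0
print(param)                          # [-0.02 -0.02 -0.02]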
Example #52
0
            for p, u in updates]


if __name__ == "__main__":
    P = Parameters()
    extract, _ = model.build(P, "vrnn")
    X = T.tensor3('X')
    l = T.ivector('l')
    [Z_prior_mean, Z_prior_std, Z_mean, Z_std, X_mean, X_std] = extract(X, l)

    parameters = P.values()
    batch_cost = model.cost(X, Z_prior_mean, Z_prior_std, Z_mean, Z_std,
                            X_mean, X_std, l)
    print "Calculating gradient..."
    print parameters
    batch_size = T.cast(X.shape[1], 'float32')

    gradients = T.grad(batch_cost, wrt=parameters)
    gradients = [g / batch_size for g in gradients]
    gradients = clip(5, parameters, gradients)

    P_learn = Parameters()
    updates = updates.adam(parameters,
                           gradients,
                           learning_rate=0.00025,
                           P=P_learn)
    updates = normalise_weights(updates)

    print "Compiling..."
    train = theano.function(
        inputs=[X, l],
Example #53
0
def run_cnn(exp_name,
            dataset,
            embedding,
            log_fn,
            perf_fn,
            emb_dm=100,
            batch_size=100,
            filter_hs=[1, 2, 3],
            hidden_units=[200, 100, 11],
            type_hidden_units=[200, 100, 6],
            dropout_rate=0.5,
            shuffle_batch=True,
            n_epochs=300,
            lr_decay=0.95,
            activation=ReLU,
            sqr_norm_lim=9,
            non_static=True,
            print_freq=5,
            sen_reg=False,
            L2=False):
    """
    Train and Evaluate CNN event encoder model
    :dataset: list containing three elements[(train_x, train_y), 
            (valid_x, valid_y), (test_x, test_y)]
    :embedding: word embedding with shape (|V| * emb_dm)
    :filter_hs: filter height for each paralle cnn layer
    :dropout_rate: dropout rate for full connected layers
    :n_epochs: the max number of iterations
    
    """
    start_time = timeit.default_timer()
    rng = np.random.RandomState(1234)

    input_height = len(dataset[0][0][0][0])
    num_sens = len(dataset[0][0][0])
    print "--input height ", input_height
    input_width = emb_dm
    num_maps = hidden_units[0]

    ###################
    # start snippet 1 #
    ###################
    print "start to construct the model ...."
    x = T.tensor3("x")
    type_y = T.ivector("y_type")
    pop_y = T.ivector("y_pop")

    words = shared(value=np.asarray(embedding, dtype=theano.config.floatX),
                   name="embedding",
                   borrow=True)

    # define function to keep padding vector as zero
    zero_vector_tensor = T.vector()
    zero_vec = np.zeros(input_width, dtype=theano.config.floatX)
    set_zero = function([zero_vector_tensor],
                        updates=[(words,
                                  T.set_subtensor(words[0, :],
                                                  zero_vector_tensor))])

    layer0_input = words[T.cast(x.flatten(), dtype="int32")].reshape(
        (x.shape[0] * x.shape[1], 1, x.shape[2], emb_dm))

    #########################
    # Construct Sen Vec #####
    #########################
    conv_layers = []
    filter_shape = (num_maps, 1, filter_hs[0], emb_dm)
    pool_size = (input_height - filter_hs[0] + 1, 1)
    conv_layer = nn.ConvPoolLayer(rng,
                                  input=layer0_input,
                                  input_shape=None,
                                  filter_shape=filter_shape,
                                  pool_size=pool_size,
                                  activation=activation)

    # make the sentence vector matrix
    sen_vecs = conv_layer.output.reshape((x.shape[0] * x.shape[1], num_maps))
    conv_layers.append(conv_layer)

    ########################
    ## Task 1: population ###
    ########################
    pop_layer_sizes = zip(hidden_units, hidden_units[1:])
    pop_layer_input = sen_vecs
    pop_drop_input = sen_vecs
    pop_hidden_outs = []
    pop_drop_outs = []
    pop_hidden_layers = []
    pop_drop_layers = []
    droprate = 0.5
    for layer_size in pop_layer_sizes[:-1]:
        U_value = np.random.random(layer_size).astype(theano.config.floatX)
        b_value = np.zeros((layer_size[-1], ), dtype=theano.config.floatX)

        U = theano.shared(U_value, borrow=True, name="U")
        b = theano.shared(b_value, borrow=True, name="b")

        pop_hidden_layer = nn.HiddenLayer(rng, pop_layer_input, layer_size[0],
                                          layer_size[1], ReLU,
                                          U * (1 - droprate), b)
        pop_drop_hidden_layer = nn.DropoutHiddenLayer(rng, pop_drop_input,
                                                      layer_size[0],
                                                      layer_size[1], ReLU,
                                                      droprate, U, b)

        pop_hidden_layers.append(pop_hidden_layer)
        pop_drop_layers.append(pop_drop_hidden_layer)

        pop_hidden_out = pop_hidden_layer.output
        pop_drop_out = pop_drop_hidden_layer.output

        pop_layer_input = pop_hidden_out
        pop_drop_input = pop_drop_out

        pop_hidden_outs.append(pop_hidden_out)
        pop_drop_outs.append(pop_drop_out)

    # construct pop classifier
    n_in, n_out = pop_layer_sizes[-1]
    W_value = np.random.random((n_in, n_out)).astype(theano.config.floatX)
    b_value = np.zeros((n_out, ), dtype=theano.config.floatX)

    pop_W = theano.shared(W_value, borrow=True, name="pop_W")
    pop_b = theano.shared(b_value, borrow=True, name="pop_b")

    pop_act = T.dot(pop_hidden_outs[-1], pop_W * (1 - droprate)) + pop_b
    pop_drop_act = T.dot(pop_drop_outs[-1], pop_W) + pop_b

    sen_pop_probs = T.nnet.softmax(pop_act)
    sen_drop_pop_probs = T.nnet.softmax(pop_drop_act)

    pop_probs = T.mean(sen_pop_probs.reshape((x.shape[0], x.shape[1], n_out)),
                       axis=1)
    pop_drop_probs = T.mean(sen_drop_pop_probs.reshape(
        (x.shape[0], x.shape[1], n_out)),
                            axis=1)

    pop_y_pred = T.argmax(pop_probs, axis=1)
    pop_drop_y_pred = T.argmax(pop_drop_probs, axis=1)

    pop_neg_loglikelihood = -T.mean(
        T.log(pop_probs)[T.arange(pop_y.shape[0]), pop_y])
    pop_drop_neg_loglikelihood = -T.mean(
        T.log(pop_drop_probs)[T.arange(pop_y.shape[0]), pop_y])

    pop_errors = T.mean(T.neq(pop_y_pred, pop_y))
    pop_errors_detail = T.neq(pop_y_pred, pop_y)

    pop_cost = pop_neg_loglikelihood
    pop_drop_cost = pop_drop_neg_loglikelihood

    ########################
    ## Task 2: event type ###
    ########################
    type_layer_sizes = zip(type_hidden_units, type_hidden_units[1:])
    type_layer_input = sen_vecs
    type_drop_input = sen_vecs
    type_hidden_outs = []
    type_drop_outs = []
    type_hidden_layers = []
    type_drop_layers = []
    droprate = 0.5
    for layer_size in type_layer_sizes[:-1]:
        U_value = np.random.random(layer_size).astype(theano.config.floatX)
        b_value = np.zeros((layer_size[-1], ), dtype=theano.config.floatX)

        U = theano.shared(U_value, borrow=True, name="U")
        b = theano.shared(b_value, borrow=True, name="b")

        type_hidden_layer = nn.HiddenLayer(rng, type_layer_input,
                                           layer_size[0], layer_size[1], ReLU,
                                           U * (1 - droprate), b)
        type_drop_hidden_layer = nn.DropoutHiddenLayer(rng, type_drop_input,
                                                       layer_size[0],
                                                       layer_size[1], ReLU,
                                                       droprate, U, b)

        type_hidden_layers.append(type_hidden_layer)
        type_drop_layers.append(type_drop_hidden_layer)

        type_hidden_out = type_hidden_layer.output
        type_drop_out = type_drop_hidden_layer.output

        type_layer_input = type_hidden_out
        type_drop_input = type_drop_out

        type_hidden_outs.append(type_hidden_out)
        type_drop_outs.append(type_drop_out)

    # construct type classifier
    n_in, n_out = type_layer_sizes[-1]
    W_value = np.random.random((n_in, n_out)).astype(theano.config.floatX)
    b_value = np.zeros((n_out, ), dtype=theano.config.floatX)

    type_W = theano.shared(W_value, borrow=True, name="type_W")
    type_b = theano.shared(b_value, borrow=True, name="type_b")

    type_act = T.dot(type_hidden_outs[-1], type_W * (1 - droprate)) + type_b
    type_drop_act = T.dot(type_drop_outs[-1], type_W) + type_b

    #type_probs = T.nnet.softmax(type_max_act)
    #type_drop_probs = T.nnet.softmax(type_drop_max_act)

    sen_type_probs = T.nnet.softmax(type_act)
    sen_drop_type_probs = T.nnet.softmax(type_drop_act)

    type_probs = T.mean(sen_type_probs.reshape(
        (x.shape[0], x.shape[1], n_out)),
                        axis=1)
    type_drop_probs = T.mean(sen_drop_type_probs.reshape(
        (x.shape[0], x.shape[1], n_out)),
                             axis=1)

    type_y_pred = T.argmax(type_probs, axis=1)
    type_drop_y_pred = T.argmax(type_drop_probs, axis=1)

    type_neg_loglikelihood = -T.mean(
        T.log(type_probs)[T.arange(type_y.shape[0]), type_y])
    type_drop_neg_loglikelihood = -T.mean(
        T.log(type_drop_probs)[T.arange(type_y.shape[0]), type_y])

    type_errors = T.mean(T.neq(type_y_pred, type_y))
    type_errors_detail = T.neq(type_y_pred, type_y)

    type_cost = type_neg_loglikelihood
    type_drop_cost = type_drop_neg_loglikelihood

    ##################################
    # Collect all the parameters #####
    ##################################
    params = []
    # convolution layer params
    for conv_layer in conv_layers:
        params += conv_layer.params

    # params for population task
    for layer in pop_drop_layers:
        params += layer.params

    params.append(pop_W)
    params.append(pop_b)

    # params for event type task
    for layer in type_drop_layers:
        params += layer.params

    params.append(type_W)
    params.append(type_b)

    if non_static:
        params.append(words)

    total_cost = pop_cost + type_cost
    total_drop_cost = pop_drop_cost + type_drop_cost

    if L2:
        l2_norm = 0.1 * T.sum(pop_W**2) + 0.1 * T.sum(type_W**2)
        for drop_layer in type_drop_layers:
            l2_norm += 0.1 * T.sum(drop_layer.W**2)

        for drop_layer in pop_drop_layers:
            l2_norm += 0.1 * T.sum(drop_layer.W**2)
        total_cost += l2_norm
        total_drop_cost += l2_norm

    total_grad_updates = sgd_updates_adadelta(params, total_drop_cost,
                                              lr_decay, 1e-6, sqr_norm_lim)

    total_preds = [pop_y_pred, type_y_pred]
    total_errors_details = [pop_errors_detail, type_errors_detail]
    total_out = total_preds + total_errors_details

    #####################
    # Construct Dataset #
    #####################
    print "Copy data to GPU and constrct train/valid/test func"
    np.random.seed(1234)

    train_x, train_pop_y, train_type_y = shared_dataset(dataset[0])
    valid_x, valid_pop_y, valid_type_y = shared_dataset(dataset[1])
    test_x, test_pop_y, test_type_y = shared_dataset(dataset[2])

    n_train_batches = int(np.ceil(1.0 * len(dataset[0][0]) / batch_size))
    n_valid_batches = int(np.ceil(1.0 * len(dataset[1][0]) / batch_size))
    n_test_batches = int(np.ceil(1.0 * len(dataset[2][0]) / batch_size))

    #####################
    # Train model func #
    #####################
    index = T.iscalar()
    train_func = function(
        [index],
        total_drop_cost,
        updates=total_grad_updates,
        givens={
            x: train_x[index * batch_size:(index + 1) * batch_size],
            pop_y: train_pop_y[index * batch_size:(index + 1) * batch_size],
            type_y: train_type_y[index * batch_size:(index + 1) * batch_size]
        })

    valid_train_func = function(
        [index],
        total_drop_cost,
        updates=total_grad_updates,
        givens={
            x: valid_x[index * batch_size:(index + 1) * batch_size],
            pop_y: valid_pop_y[index * batch_size:(index + 1) * batch_size],
            type_y: valid_type_y[index * batch_size:(index + 1) * batch_size]
        })

    test_pred_detail = function(
        [index],
        total_out,
        givens={
            x: test_x[index * batch_size:(index + 1) * batch_size],
            pop_y: test_pop_y[index * batch_size:(index + 1) * batch_size],
            type_y: test_type_y[index * batch_size:(index + 1) * batch_size]
        })

    # apply early stop strategy
    patience = 100
    patience_increase = 2
    improvement_threshold = 1.005

    n_valid = len(dataset[1][0])
    n_test = len(dataset[2][0])

    epoch = 0
    best_params = None
    best_validation_score = 0.
    test_perf = 0

    done_loop = False

    log_file = open(log_fn, 'w')

    print "Start to train the model....."

    total_score = 0.0
    while (epoch < n_epochs) and not done_loop:
        start_time = timeit.default_timer()
        epoch += 1
        costs = []
        for minibatch_index in np.random.permutation(range(n_train_batches)):
            cost_epoch = train_func(minibatch_index)
            costs.append(cost_epoch)
            set_zero(zero_vec)

        # do validation
        valid_cost = [
            valid_train_func(i)
            for i in np.random.permutation(xrange(n_valid_batches))
        ]

        if epoch % print_freq == 0:
            # do test
            pop_preds = []
            type_preds = []
            pop_errors = []
            type_errors = []
            pop_sens = []
            type_sens = []

            for i in xrange(n_test_batches):
                test_pop_pred, test_type_pred, test_pop_error, test_type_error = test_pred_detail(
                    i)

                pop_preds.append(test_pop_pred)
                type_preds.append(test_type_pred)
                pop_errors.append(test_pop_error)
                type_errors.append(test_type_error)

            pop_preds = np.concatenate(pop_preds)
            type_preds = np.concatenate(type_preds)
            pop_errors = np.concatenate(pop_errors)
            type_errors = np.concatenate(type_errors)

            pop_perf = 1 - np.mean(pop_errors)
            type_perf = 1 - np.mean(type_errors)

            # dump the predictions and the chosen sentences
            with open(
                    os.path.join(perf_fn,
                                 "%s_%d.pop_pred" % (exp_name, epoch)),
                    'w') as epf:
                for p in pop_preds:
                    epf.write("%d\n" % int(p))

            with open(
                    os.path.join(perf_fn,
                                 "%s_%d.type_pred" % (exp_name, epoch)),
                    'w') as epf:
                for p in type_preds:
                    epf.write("%d\n" % int(p))

            message = "Epoch %d test pop perf %f, type perf %f, training_cost %f" % (
                epoch, pop_perf, type_perf, np.mean(costs))
            print message
            log_file.write(message + "\n")
            log_file.flush()

            if (pop_perf + type_perf) > total_score:
                total_score = pop_perf + type_perf
                # save the model
                model_name = os.path.join(
                    perf_fn, "%s_%d.best_model" % (exp_name, epoch))
                with open(model_name, 'wb') as mn:
                    for param in params:
                        cPickle.dump(param.get_value(), mn)

        end_time = timeit.default_timer()
        print "Finish one iteration using %f m" % (
            (end_time - start_time) / 60.)

    # output the final model params
    print "Output the final model"
    model_name = os.path.join(perf_fn, "%s_%d.final_model" % (exp_name, epoch))
    with open(model_name, 'wb') as mn:
        for param in params:
            cPickle.dump(param.get_value(), mn)

    log_file.flush()
    log_file.close()
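The embedding lookup near the top of this example, words[T.cast(x.flatten(), dtype="int32")].reshape(...), is the core use of T.cast here: word ids stored as floats are flattened, cast to int for indexing, gathered from the embedding table, and reshaped for the conv layer. A small numpy illustration with a 2-D id matrix (shapes chosen arbitrarily):

import numpy as np

emb_dm = 4
words = np.random.rand(10, emb_dm)          # |V| x emb_dm embedding table
x = np.array([[1., 3., 0.], [2., 2., 5.]])  # word ids stored as floats
layer0_input = words[x.flatten().astype('int32')].reshape(
    (x.shape[0], 1, x.shape[1], emb_dm))
print(layer0_input.shape)                   # (2, 1, 3, 4)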
Example #54
0
 def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars):
     old_prob_var = old_dist_info_vars["prob"]
     new_prob_var = new_dist_info_vars["prob"]
     x_var = TT.cast(x_var, 'float32')
     # Assume layout is N * A
     return (TT.sum(new_prob_var * x_var, axis=-1) + TINY) / (TT.sum(old_prob_var * x_var, axis=-1) + TINY)
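A small numpy check of what the symbolic ratio above computes, assuming TINY is a small constant such as 1e-8: with a one-hot x_var, the sums pick out the probability of the taken action, so the result is new_prob[action] / old_prob[action].

import numpy as np

TINY = 1e-8
x = np.array([[0., 1., 0.]], dtype='float32')            # one-hot action, N=1, A=3
old_prob = np.array([[0.2, 0.5, 0.3]], dtype='float32')
new_prob = np.array([[0.1, 0.7, 0.2]], dtype='float32')

ratio = (np.sum(new_prob * x, axis=-1) + TINY) / (np.sum(old_prob * x, axis=-1) + TINY)
print(ratio)  # ~[1.4], i.e. 0.7 / 0.5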
Example #55
0
    def step(char_lm1, char_l, trans_probs_l):
        """Probability of going from char_lm1 to char_l using trans_probs_l tensor"""
        char_lm1 = T.cast(char_lm1, 'int32')
        char_l = T.cast(char_l, 'int32')

        return trans_probs_l[T.arange(N), char_lm1, char_l]  # N
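The indexing trans_probs_l[T.arange(N), char_lm1, char_l] picks, for each sequence n, the transition probability from char_lm1[n] to char_l[n]. A numpy equivalent (N and D chosen arbitrarily for illustration):

import numpy as np

N, D = 2, 3
trans_probs_l = np.random.rand(N, D, D)
char_lm1 = np.array([0, 2])
char_l = np.array([1, 1])
print(trans_probs_l[np.arange(N), char_lm1, char_l])  # shape (N,)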
Example #56
0
import theano
import theano.tensor as T
import numpy as np

if __name__ == '__main__':
    x = np.asarray([[1, 2], [1, 2]], dtype='float32')
    lens = x.shape[0]
    y = np.zeros(lens, dtype='float32')
    z = np.full(lens, 1, dtype='float32')
    ll = np.asarray([1, 0], dtype='int32')
    lll = np.asarray([[1, 2, 5], [2, 3, 4]], dtype='float32')

    zero = T.vector('zero', dtype='float32')
    margin = T.vector('margin', dtype='float32')
    cos12 = T.matrix(dtype='float32')
    label = T.vector(dtype='int32')
    # T.reshape(label,(label.shape[0],1))

    diff = T.cast(T.maximum(zero, margin - cos12[:, label]), dtype='float32')

    cost = T.sum(diff, acc_dtype='float32')

    f = theano.function([cos12, zero, margin, label], diff)

    print f(x, y, z, ll)

    print x[:, 0]
Example #57
0
def train_qacnn(
        datasets,
        U,  # pre-trained word embeddings
        filter_hs=[2],  # filter width
        hidden_units=[100, 2],
        shuffle_batch=True,
        n_epochs=25,
        lam=0,
        batch_size=20,
        lr_decay=0.95,  # for AdaDelta
        sqr_norm_lim=9):  # for optimization
    """
    return: a list of dicts of lists, each list contains (ansId, groundTruth, prediction) for a question
    """
    rng = np.random.RandomState(3435)
    img_h = (len(datasets[0][0]) - 3) / 2
    img_w = U.shape[1]
    lsize, rsize = img_h, img_h
    filter_w = img_w
    feature_maps = hidden_units[0]
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))
    parameters = [("image shape", img_h, img_w),
                  ("filter shape", filter_shapes),
                  ("hidden_units", hidden_units), ("batch_size", batch_size),
                  ("lambda", lam), ("learn_decay", lr_decay),
                  ("sqr_norm_lim", sqr_norm_lim),
                  ("shuffle_batch", shuffle_batch)]
    print parameters

    # define model architecture
    index = T.lscalar()
    lx = T.matrix('lx')
    rx = T.matrix('rx')
    y = T.ivector('y')
    Words = theano.shared(value=U, name="Words")
    llayer0_input = Words[T.cast(lx.flatten(), dtype="int32")].reshape(
        (lx.shape[0], 1, lx.shape[1],
         Words.shape[1]))  # input: word embeddings of the mini batch
    rlayer0_input = Words[T.cast(rx.flatten(), dtype="int32")].reshape(
        (rx.shape[0], 1, rx.shape[1],
         Words.shape[1]))  # input: word embeddings of the mini batch

    conv_layers = []  # layer number = filter number
    llayer1_inputs = []  # layer number = filter number
    rlayer1_inputs = []  # layer number = filter number
    for i in xrange(len(filter_hs)):
        filter_shape = filter_shapes[i]
        pool_size = pool_sizes[i]
        conv_layer = QALeNetConvPoolLayer(rng,
                                          linp=llayer0_input,
                                          rinp=rlayer0_input,
                                          filter_shape=filter_shape,
                                          poolsize=pool_size)
        llayer1_input = conv_layer.loutput.flatten(2)
        rlayer1_input = conv_layer.routput.flatten(2)
        conv_layers.append(conv_layer)
        llayer1_inputs.append(llayer1_input)
        rlayer1_inputs.append(rlayer1_input)
    llayer1_input = T.concatenate(
        llayer1_inputs, 1)  # concatenate representations of different filters
    rlayer1_input = T.concatenate(
        rlayer1_inputs, 1)  # concatenate representations of different filters
    hidden_units[0] = feature_maps * len(filter_hs)

    classifier = BilinearLR(llayer1_input, rlayer1_input, hidden_units[0],
                            hidden_units[0])
    params = classifier.params
    for conv_layer in conv_layers:
        params += conv_layer.params
    L2_sqr = 0.
    for param in params:
        L2_sqr += (param**2).sum()
    cost = classifier.get_cost(y) + lam * L2_sqr
    grad_updates = sgd_updates_adadelta(params, cost, lr_decay, 1e-6,
                                        sqr_norm_lim)

    # shuffle the dataset and assign it to mini batches. If the dataset size is not a
    # multiple of the batch size, replicate extra data (chosen at random)
    np.random.seed(3435)
    if datasets[0].shape[0] % batch_size > 0:
        extra_data_num = batch_size - datasets[0].shape[0] % batch_size
        train_set = np.random.permutation(datasets[0])
        extra_data = train_set[:extra_data_num]
        new_data = np.append(datasets[0], extra_data, axis=0)
    else:
        new_data = datasets[0]
    new_data = np.random.permutation(new_data)
    n_train_batches = new_data.shape[0] / batch_size

    train_set, train_set_orig, val_set, test_set = (
        new_data, datasets[0], datasets[1], datasets[2])
    train_set_lx = theano.shared(np.asarray(train_set[:, :lsize],
                                            dtype=theano.config.floatX),
                                 borrow=True)
    train_set_rx = theano.shared(np.asarray(train_set[:, lsize:lsize + rsize],
                                            dtype=theano.config.floatX),
                                 borrow=True)
    train_set_y = theano.shared(np.asarray(train_set[:, -1], dtype="int32"),
                                borrow=True)

    train_set_lx_orig = train_set_orig[:, :lsize]
    train_set_rx_orig = train_set_orig[:, lsize:lsize + rsize]
    train_set_qid_orig = np.asarray(train_set_orig[:, -3], dtype="int32")
    train_set_aid_orig = np.asarray(train_set_orig[:, -2], dtype="int32")
    train_set_y_orig = np.asarray(train_set_orig[:, -1], dtype="int32")
    val_set_lx = val_set[:, :lsize]
    val_set_rx = val_set[:, lsize:lsize + rsize]
    val_set_qid = np.asarray(val_set[:, -3], dtype="int32")
    val_set_aid = np.asarray(val_set[:, -2], dtype="int32")
    val_set_y = np.asarray(val_set[:, -1], dtype="int32")
    test_set_lx = test_set[:, :lsize]
    test_set_rx = test_set[:, lsize:lsize + rsize]
    test_set_qid = np.asarray(test_set[:, -3], dtype="int32")
    test_set_aid = np.asarray(test_set[:, -2], dtype="int32")
    test_set_y = np.asarray(test_set[:, -1], dtype="int32")

    train_model = theano.function(
        [index],
        cost,
        updates=grad_updates,
        givens={
            lx: train_set_lx[index * batch_size:(index + 1) * batch_size],
            rx: train_set_rx[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    test_lpred_layers = []
    test_rpred_layers = []
    test_llayer0_input = Words[T.cast(lx.flatten(), dtype="int32")].reshape(
        (lx.shape[0], 1, img_h, Words.shape[1]))
    test_rlayer0_input = Words[T.cast(rx.flatten(), dtype="int32")].reshape(
        (rx.shape[0], 1, img_h, Words.shape[1]))
    for conv_layer in conv_layers:
        test_llayer0_output, test_rlayer0_output = conv_layer.predict(
            test_llayer0_input, test_rlayer0_input)
        test_lpred_layers.append(test_llayer0_output.flatten(2))
        test_rpred_layers.append(test_rlayer0_output.flatten(2))
    test_llayer1_input = T.concatenate(test_lpred_layers, 1)
    test_rlayer1_input = T.concatenate(test_rpred_layers, 1)
    test_y_pred = classifier.predict(test_llayer1_input, test_rlayer1_input)
    test_model = theano.function([lx, rx], test_y_pred)

    #start training over mini-batches
    print '... training'
    epoch = 0
    cost_epoch = 0
    train_preds_epos, dev_preds_epos, test_preds_epos = [], [], []
    while (epoch < n_epochs):
        epoch = epoch + 1
        total_cost = 0
        if shuffle_batch:
            for minibatch_index in np.random.permutation(
                    range(n_train_batches)):
                cost_epoch = train_model(minibatch_index)
                total_cost += cost_epoch
        else:
            for minibatch_index in xrange(n_train_batches):
                cost_epoch = train_model(minibatch_index)
                total_cost += cost_epoch
        print "epoch = %d, cost = %f" % (epoch, total_cost)

        train_preds, dev_preds, test_preds = defaultdict(list), defaultdict(
            list), defaultdict(list)
        ypred = test_model(train_set_lx_orig, train_set_rx_orig)
        for i, pr in enumerate(ypred):
            qid, aid, y = train_set_qid_orig[i], train_set_aid_orig[
                i], train_set_y_orig[i]
            train_preds[qid].append((aid, y, pr))
        ypred = test_model(val_set_lx, val_set_rx)
        for i, pr in enumerate(ypred):
            qid, aid, y = val_set_qid[i], val_set_aid[i], val_set_y[i]
            dev_preds[qid].append((aid, y, pr))
        ypred = test_model(test_set_lx, test_set_rx)
        for i, pr in enumerate(ypred):
            qid, aid, y = test_set_qid[i], test_set_aid[i], test_set_y[i]
            test_preds[qid].append((aid, y, pr))
        train_preds_epos.append(train_preds)
        dev_preds_epos.append(dev_preds)
        test_preds_epos.append(test_preds)
    return train_preds_epos, dev_preds_epos, test_preds_epos
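Note how the example pads the training set with randomly drawn extra rows so its size becomes an exact multiple of batch_size before sharding into minibatches. A tiny numpy sketch of that trick (data and sizes here are made up):

import numpy as np

batch_size = 20
data = np.arange(47).reshape(47, 1)
if data.shape[0] % batch_size > 0:
    extra = batch_size - data.shape[0] % batch_size
    data = np.append(data, np.random.permutation(data)[:extra], axis=0)
print(data.shape[0] % batch_size)  # 0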
Example #58
0
def viterbi(trans_probs):
    """Using the canvas C generate the most probable sequence

    using the usual viterbi algorithm and an input x as the observed
    data, generate the most probable latent sequence using the viterbi
    updates given the transition tensor trans_probs over all of the
    N given sentences

    :param trans_probs: N * max(L) * D * D tensor
    :return: characters and the probabilities for each of these
    """

    N = trans_probs.shape[0]
    D = trans_probs.shape[-1]

    # T1_0 has to be computed differently since it has no preceding character
    # to index the row in the transition tensor
    T1_0 = trans_probs[:, 0, 0]  # N * D matrix
    T2_0 = T.zeros((N, D))  # N * D matrix

    # forward step in viterbi algorithm
    def step_forward(trans_probs_l, T1_lm1):
        T1_l = T.max(T.shape_padright(T1_lm1) * trans_probs_l,
                     axis=1)  # N * D matrix
        T2_l = T.argmax(T.shape_padright(T1_lm1) * trans_probs_l,
                        axis=1)  # N * D matrix

        return T.cast(T1_l, 'float32'), T.cast(T2_l, 'float32')

    ([T1, T2], _) = theano.scan(
        step_forward,
        sequences=trans_probs[:, 1:].dimshuffle((1, 0, 2, 3)),
        outputs_info=[T1_0, None],
    )
    # (max(L)-1) * N * D tensors

    # concatenate initial sample with the rest to get full path
    T1 = T.concatenate([T.shape_padleft(T1_0), T1], axis=0)  # max(L) * N * D
    T2 = T.concatenate([T.shape_padleft(T2_0), T2], axis=0)  # max(L) * N * D

    char_L = T.cast(T.argmax(T1[-1], axis=1), 'float32')  # N

    # backward step in viterbi algorithm to find the actual sequence
    def step_backward(T2_lp1, char_lp1):
        char_l = T2_lp1[T.arange(N), T.cast(char_lp1, 'int32')]  # N

        return T.cast(char_l, 'float32')

    chars, _ = theano.scan(
        step_backward,
        sequences=T2[1:][::-1],
        outputs_info=[char_L],
    )
    # (max(L)-1) * N

    chars = chars[::-1]  # (max(L)-1) * N
    chars = T.concatenate([chars, T.shape_padleft(char_L)],
                          axis=0).T  # N * max(L)
    probs = get_probs(chars, trans_probs)  # N * max(L)

    return chars, probs  # N * max(L) and N * max(L)
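A minimal numpy sketch of the same forward/backward recursion for a single sequence (so the N axis is dropped); trans_probs has shape (L, D, D) here and its first row trans_probs[0, 0] plays the role of the initial distribution, matching T1_0 = trans_probs[:, 0, 0] above:

import numpy as np

def viterbi_np(trans_probs):
    L, D, _ = trans_probs.shape
    T1 = np.zeros((L, D))             # best path score ending in each state
    T2 = np.zeros((L, D), dtype=int)  # argmax back-pointers
    T1[0] = trans_probs[0, 0]
    for l in range(1, L):
        scores = T1[l - 1][:, None] * trans_probs[l]  # scores[prev, next]
        T1[l] = scores.max(axis=0)
        T2[l] = scores.argmax(axis=0)
    chars = np.zeros(L, dtype=int)
    chars[-1] = T1[-1].argmax()
    for l in range(L - 2, -1, -1):    # follow the back-pointers
        chars[l] = T2[l + 1, chars[l + 1]]
    return chars, T1

chars, T1 = viterbi_np(np.random.rand(5, 3, 3))
print(chars)  # most probable state index at each of the 5 positions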
Example #59
0
def train(learning_rate=0.1, n_epochs=100, batch_size=320, batch_type='fast',
          mynet='one', representation='raw', momentum=0, history=0):

    rng = numpy.random.RandomState(42)

    trainP = 0.8
    validP = 0.1
    testP  = 0.1   
    
#    print "... Reading cached values ..."
#    (trainCumLengths,validCumLengths,testCumLengths,filenames) = pickle.load(open("results/5x5.cache",'r'))
    
    print "... Getting filenames ..."
    datasetMY = "../MC player/20kgames9"
    fn1 = readGame.getFilenames(datasetMY,1,0,1)[0]
    random.shuffle(fn1)    
    filenames = fn1
    n = len(filenames)
    print "... Learning set contains " + str(n) + " games"
    
    print "... Computing cumulative game lengths ..."
    trainNames = filenames[:int(trainP*n)]
    validNames = filenames[int(trainP*n):int(trainP*n+validP*n)]
    testNames  = filenames[int(trainP*n+validP*n):int(trainP*n+validP*n+testP*n)]
    
    random.shuffle(trainNames)
    
    trainCumLengths = readGame.getCumGameLengths(trainNames,ftype="game")
    validCumLengths = readGame.getCumGameLengths(validNames,ftype="game")
    testCumLengths = readGame.getCumGameLengths(testNames,ftype="game")
    
    fw = open("results/"+str(gs)+"x"+str(gs)+".cache","wb")
    pickle.dump((trainCumLengths,validCumLengths,testCumLengths,filenames),fw)
    fw.close()
    print "... Preprocessing initial batches ..."
    minn = batch_size / 10 +1
    temp = time.time()
    test_batch_x, test_batch_y = utils.shared_dataset(readGame.processGAMEs(testNames[:minn],representation,gs=gs),batch_size=batch_size,board_size=gs)
    train_batch_x, train_batch_y = utils.shared_dataset(readGame.processGAMEs(trainNames[:minn],representation,gs=gs),batch_size=batch_size,board_size=gs)
    valid_batch_x, valid_batch_y = utils.shared_dataset(readGame.processGAMEs(validNames[:minn],representation,gs=gs),batch_size=batch_size,board_size=gs)
    print "    average processing time per game: " + str((time.time()-temp)/18.0) + " seconds, per epoch: " + str(int((time.time()-temp)/18*n/60/60)) + " hours" 

    # compute number of minibatches for training, validation and testing
    n_train_batches = trainCumLengths[-1]
    n_valid_batches = validCumLengths[-1]
    n_test_batches =  testCumLengths[-1]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # allocate symbolic variables for the data
    iteration = T.lscalar()  # iteration number of a minibatch
    x = T.matrix('x')   # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels

    ishape = (gs, gs)  # this is the size of the input images

    fw = open("results/"+mynet+"_"+str(learning_rate)+"_"+".res","w")
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... Building the model ...'
   
    nc = 2 if representation == 'raw' else 6  # number of input channels
    nc *= 1 + history

    if mynet == "zero":
        layer0_input = x.reshape((batch_size, nc, gs, gs))
        layer0 = LogisticRegression(input=layer0_input.flatten(2), n_in=nc*gs*gs, n_out=gs*gs)
        cost = layer0.negative_log_likelihood(y)
    
        params = layer0.params

    if mynet == "one":
        nHiddens = 500
        layer1_input = x.reshape((batch_size, nc, gs, gs))
        layer1 = HiddenLayer(rng, input=layer1_input.flatten(2), n_in=nc * gs * gs,
                           n_out=nHiddens, activation=T.tanh)
        layer0 = LogisticRegression(input=layer1.output, n_in=nHiddens, n_out=gs*gs)
        cost = layer0.negative_log_likelihood(y)
    
        params = layer0.params + layer1.params
        
    # create a function to compute the mistakes that are made by the model
    test_model = theano.function([], layer0.errors(y),
             givens={
                x: test_batch_x,
                y: T.cast(test_batch_y, 'int32')})

    validate_model = theano.function([], layer0.errors(y),
             givens={
                x: valid_batch_x,
                y: T.cast(valid_batch_y, 'int32')})

    predictions = theano.function([], layer0.get_predictions(),
            givens={
                x: valid_batch_x})
                
    conditional_dist = theano.function([], layer0.get_conditional_dist(),
            givens={
                x: valid_batch_x})

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i],grads[i]) pairs.
    updates = []
    #adjusted_rate = learning_rate - iteration*(learning_rate/(float(n_epochs) * n_train_batches))
    # use a graph-level conditional: a plain Python `if` on the symbolic
    # comparison would always take the first branch
    adjusted_rate = T.switch(T.lt(iteration, 3000 * 200),
                             learning_rate, 0.1 * learning_rate)
    
    for param_i, grad_i in zip(params, grads):#, prev_grad_i   , prevGrads):
        updates.append((param_i, param_i - adjusted_rate * grad_i))# - momentum * prev_grad_i))
    
    #for i,grad in enumerate(grads):
    #    updates.append((prevGrads[i], grad))
    
    train_model = theano.function([iteration], cost, updates=updates,
         givens={
            x: train_batch_x,
            y: T.cast(train_batch_y, 'int32')},on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... Training ...'
    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.999  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = 2000         # min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False
    stime = time.time()

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 500 == 0:
                print 'training @ iter = ', iter
                pickle.dump((updates,cost,layer0,test_model,predictions,conditional_dist),open("results/"+str(batch_size)+representation+str(history)+".model","w"))
            if iter ==5:
                print 'estimated train time per epoch = '+ str((time.time() - stime) * n_train_batches/60.0/iter/60.0) + " hours"
            ax,ay = getBatch(trainNames, minibatch_index, trainCumLengths, batch_size,representation,batchType=batch_type,history=history)
            train_batch_x.set_value(ax)
            train_batch_y.set_value(ay)
            cost_ij = train_model(iter)

            if (iter + 1) % validation_frequency == 0 or iter==5:

                # compute zero-one loss on validation set
                validation_losses = []
                for i in xrange(n_valid_batches):
                    vx,vy = getBatch(validNames, i, validCumLengths, batch_size,representation,batchType='fast',history=history)
                    valid_batch_x.set_value(vx)
                    valid_batch_y.set_value(vy)
                    validation_losses.append(validate_model())
                this_validation_loss = numpy.mean(validation_losses)
        
                print('epoch %i, minibatch %i/%i, validation error %f %%' % \
                      (epoch, minibatch_index + 1, n_train_batches, \
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses=[]
                    for i in xrange(n_test_batches):
                        tx,ty = getBatch(testNames, i, testCumLengths, batch_size,representation,batchType='fast',history=history)
                        test_batch_x.set_value(tx)
                        test_batch_y.set_value(ty)
                        test_losses.append(test_model())
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

        #fw.write("Epoch "+str(epoch) + ": " +str((1-this_validation_loss)*100.)+ "%\n")
        pickle.dump((updates,cost,layer0,test_model,predictions,conditional_dist),open("results/"+str(batch_size)+representation+str(history)+".model","w"))
        
            #if patience <= iter:
            #    done_looping = True
            #    break

    fw.close()
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
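The decayed learning-rate schedule in this example needs a graph-level conditional: a plain Python if on the symbolic T.lt(iteration, ...) always takes the first branch, whereas T.switch keeps the comparison inside the compiled function. A minimal sketch of that difference (the threshold and rates here are illustrative):

import theano
import theano.tensor as T

it = T.lscalar('it')
base_lr = 0.1
rate = T.switch(T.lt(it, 5), base_lr, 0.1 * base_lr)  # evaluated at run time
f = theano.function([it], rate)
print('{} {}'.format(f(1), f(10)))  # ~0.1 then ~0.01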
Example #60
0
    def step_backward(T2_lp1, char_lp1):
        char_l = T2_lp1[T.arange(N), T.cast(char_lp1, 'int32')]  # N

        return T.cast(char_l, 'float32')