Example #1
def make_functions(input_size,
                   output_size,
                   mem_size,
                   mem_width,
                   hidden_sizes=[100]):

    start_time = time.time()

    input_seqs = T.btensor3('input_sequences')
    output_seqs = T.btensor3('output_sequences')

    P = Parameters()
    process = model.build(P, input_size, output_size, mem_size, mem_width,
                          hidden_sizes[0])
    outputs = process(T.cast(input_seqs, 'float32'))
    output_length = (input_seqs.shape[1] - 2) // 2

    Y = output_seqs[:, -output_length:, :-2]
    Y_hat = T.nnet.sigmoid(outputs[:, -output_length:, :-2])

    cross_entropy = T.mean(T.nnet.binary_crossentropy(Y_hat, Y))
    bits_loss = cross_entropy * (Y.shape[1] * Y.shape[2]) / T.log(2)

    params = P.values()

    cost = cross_entropy  # + 1e-5 * sum(T.sum(T.sqr(w)) for w in params)

    print "Computing gradients",
    grads = T.grad(cost, wrt=params)
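    # clip_length is assumed to be a module-level constant in the original
    # file; it is not defined within this snippet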
    grads = updates.clip_deltas(grads, np.float32(clip_length))

    print "Done. (%0.3f s)" % (time.time() - start_time)
    start_time = time.time()
    print "Compiling function",
    P_learn = Parameters()

    update_pairs = updates.rmsprop(params,
                                   grads,
                                   learning_rate=1e-4,
                                   P=P_learn)

    train = theano.function(
        inputs=[input_seqs, output_seqs],
        outputs=cross_entropy,
        updates=update_pairs,
    )

    test = theano.function(inputs=[input_seqs, output_seqs], outputs=bits_loss)

    print "Done. (%0.3f s)" % (time.time() - start_time)
    print P.parameter_count()
    return P, P_learn, train, test
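
# Hypothetical driver for make_functions (not part of the original listing);
# sizes are illustrative, and sequences follow the batch x time x bits, int8
# layout implied by T.btensor3 above.
import numpy as np
P, P_learn, train_fn, test_fn = make_functions(input_size=10, output_size=10,
                                               mem_size=128, mem_width=20)
x = np.random.randint(0, 2, size=(16, 22, 10)).astype(np.int8)
y = np.random.randint(0, 2, size=(16, 22, 10)).astype(np.int8)
print train_fn(x, y)   # mean binary cross-entropy for the batch
print test_fn(x, y)    # the same loss expressed in bits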
Example #2
def make_train(input_size,
               output_size,
               mem_size,
               mem_width,
               hidden_sizes=[100]):
    P = Parameters()
    ctrl = controller.build(P, input_size, output_size, mem_size, mem_width,
                            hidden_sizes)
    predict = model.build(P, mem_size, mem_width, hidden_sizes[-1], ctrl)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    seqs = predict(input_seq)
    output_seq_pred = seqs[-1]
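    # squash predictions into [5e-6, 1 - 5e-6] so binary_crossentropy
    # never takes log(0)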
    cross_entropy = T.sum(T.nnet.binary_crossentropy(
        5e-6 + (1 - 2 * 5e-6) * output_seq_pred, output_seq),
                          axis=1)
    cost = T.sum(cross_entropy)  # + 1e-3 * l2
    params = P.values()
    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]

    train = theano.function(inputs=[input_seq, output_seq],
                            outputs=T.sum(cross_entropy),
                            updates=updates.adadelta(params, grads))

    return P, train
Example #3
def make_train(input_size, output_size, mem_size, mem_width, hidden_size=100):
    P = Parameters()

    # Build controller. ctrl is a network that takes an external and read input
    # and returns the output of the network and its hidden layer
    ctrl = controller.build(P, input_size, output_size, mem_size, mem_width,
                            hidden_size)

    # Build model that predicts output sequence given input sequence
    predict = model.build(P, mem_size, mem_width, hidden_size, ctrl)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    [M, weights, output_seq_pred] = predict(input_seq)

    # Setup for adadelta updates
    cross_entropy = T.sum(T.nnet.binary_crossentropy(
        5e-6 + (1 - 2 * 5e-6) * output_seq_pred, output_seq),
                          axis=1)
    params = P.values()
    l2 = sum(T.sum(T.sqr(p)) for p in params)
    cost = T.sum(cross_entropy) + 1e-3 * l2
    # clip gradients
    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]

    train = theano.function(inputs=[input_seq, output_seq],
                            outputs=cost,
                            updates=updates.adadelta(params, grads))

    return P, train
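
# Hypothetical training loop (not in the original listing); sequences are
# time x bits float32 matrices, matching the T.matrix variables above, and
# the copy-style target here is purely illustrative.
import numpy as np
P, train_fn = make_train(input_size=10, output_size=10,
                         mem_size=128, mem_width=20)
for step in xrange(1000):
    seq = np.random.randint(0, 2, size=(8, 10)).astype(np.float32)
    print step, train_fn(seq, seq)   # copy task: target equals input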
Example #4
def prepare_functions(input_size, hidden_size, latent_size, step_count,
                      batch_size, train_X, valid_X):
    P = Parameters()
    encode_decode = model.build(P,
                                input_size=input_size,
                                hidden_size=hidden_size,
                                latent_size=latent_size)
    P.W_decoder_input_0.set_value(P.W_decoder_input_0.get_value() * 10)

    X = T.matrix('X')
    parameters = P.values()

    cost_symbs = []
    for s in xrange(step_count):
        Z_means, Z_stds, alphas, \
            X_mean, log_pi_samples = encode_decode(X, step_count=s + 1)
        batch_recon_loss, log_p = model.recon_loss(X, X_mean, log_pi_samples)
        recon_loss = T.mean(batch_recon_loss, axis=0)
        reg_loss = T.mean(model.reg_loss(Z_means, Z_stds, alphas), axis=0)
        vlb = recon_loss + reg_loss
        corr = T.mean(T.eq(T.argmax(log_p, axis=0),
                           T.argmax(log_pi_samples, axis=0)),
                      axis=0)
        cost_symbs.append(vlb)

    avg_cost = sum(cost_symbs) / step_count
    cost = avg_cost + 1e-3 * sum(T.sum(T.sqr(w)) for w in parameters)

    gradients = updates.clip_deltas(T.grad(cost, wrt=parameters), 5)

    print "Updated parameters:"
    pprint(parameters)
    idx = T.iscalar('idx')

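    # vlb, recon_loss, reg_loss, log_pi_samples and corr below come from the
    # last loop iteration, i.e. the graph unrolled for the full step_count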
    train = theano.function(
        inputs=[idx],
        outputs=[
            vlb, recon_loss, reg_loss,
            T.max(T.argmax(log_pi_samples, axis=0)), corr
        ],
        updates=updates.adam(parameters, gradients, learning_rate=1e-4),
        givens={X: train_X[idx * batch_size:(idx + 1) * batch_size]})

    validate = theano.function(inputs=[], outputs=vlb, givens={X: valid_X})

    sample = theano.function(inputs=[],
                             outputs=[
                                 X, X_mean,
                                 T.argmax(log_pi_samples, axis=0),
                                 T.exp(log_pi_samples)
                             ],
                             givens={X: valid_X[:10]})

    return train, validate, sample
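
# Hypothetical driver for prepare_functions (names and sizes are illustrative,
# not from the original file); the data must live in theano.shared float32
# matrices because of the givens used above.
import numpy as np
import theano

train_X = theano.shared(np.random.rand(1000, 784).astype(np.float32))
valid_X = theano.shared(np.random.rand(100, 784).astype(np.float32))
train_fn, validate_fn, sample_fn = prepare_functions(
    input_size=784, hidden_size=256, latent_size=32,
    step_count=10, batch_size=100, train_X=train_X, valid_X=valid_X)
for epoch in xrange(10):
    for i in xrange(1000 // 100):
        train_fn(i)
    print epoch, validate_fn()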
Example #5
def make_model(input_size=8,
               output_size=8,
               mem_size=128,
               mem_width=20,
               hidden_sizes=[100]):
    P = Parameters()
    ctrl = controller.build(P, input_size, output_size, mem_size, mem_width,
                            hidden_sizes)
    predict = model.build(P, mem_size, mem_width, hidden_sizes[-1], ctrl)
    input_seq = T.matrix('input_sequence')
    [M_curr, weights, output] = predict(input_seq)

    test_fun = theano.function(inputs=[input_seq], outputs=[weights, output])
    return P, test_fun
Example #6
def build_network(input_size, hidden_size, constraint_adj=False):
    P = Parameters()
    X = T.bmatrix('X')

    P.W_input_hidden = U.initial_weights(input_size, hidden_size)
    P.b_hidden = U.initial_weights(hidden_size)
    P.b_output = U.initial_weights(input_size)
    hidden_lin = T.dot(X, P.W_input_hidden) + P.b_hidden
    hidden = T.nnet.sigmoid(hidden_lin)
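    # decoder reuses the encoder weights transposed (tied weights), so a
    # single weight matrix serves both directions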
    output = T.nnet.softmax(T.dot(hidden, P.W_input_hidden.T) + P.b_output)
    parameters = P.values()
    cost = build_error(X, output, P)
    if constraint_adj:
        cost = cost + adjacency_constraint(hidden_lin)  # defined elsewhere in the original module

    return X, output, cost, P
Example #7
def make_train_functions():
    P = Parameters()
    X = T.bvector('X')
    Y = T.ivector('Y')
    aux = {}

    predict = model.build(
        P,
        input_size=128,
        embedding_size=64,
        controller_size=256,
        stack_size=256,
        output_size=128,
    )

    output = predict(X, aux=aux)
    error = -T.log(output[T.arange(Y.shape[0]), ((128 + 1 + Y) % (128 + 1))])
    error = error[-(Y.shape[0] // 2):]
    parameters = P.values()
    gradients = T.grad(T.sum(error), wrt=parameters)
    shapes = [p.get_value().shape for p in parameters]
    count = theano.shared(np.float32(0))
    acc_grads = [theano.shared(np.zeros(s, dtype=np.float32)) for s in shapes]

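    # accumulate raw gradients across calls to acc(); update() later applies
    # the clipped average and resets the accumulators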
    acc_update = [ (a,a+g) for a,g in zip(acc_grads,gradients) ] +\
                 [ (count,count + np.float32(1)) ]
    acc_clear = [ (a,np.float32(0) * a) for a in acc_grads ] +\
                [ (count,np.float32(0)) ]
    avg_grads = [(g / count) for g in acc_grads]
    avg_grads = [clip(g, 1) for g in avg_grads]

    acc = theano.function(
        inputs=[X, Y],
        outputs=T.mean(error),
        updates=acc_update,
    )
    update = theano.function(
        inputs=[],
        updates=updates.adadelta(parameters, avg_grads, learning_rate=1e-8) +
        acc_clear)

    test = theano.function(
        inputs=[X],
        outputs=T.argmax(output, axis=1)[-(X.shape[0] // 2):],
    )
    return acc, update, test
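
# Hypothetical use of the accumulate-then-update pattern above (data is
# illustrative, not from the original file): gradients are averaged over a
# few sequences before each parameter step.
import numpy as np
acc_fn, update_fn, test_fn = make_train_functions()
for _ in xrange(4):
    X = np.random.randint(0, 128, size=20).astype(np.int8)
    Y = X.astype(np.int32)
    print acc_fn(X, Y)    # accumulates gradients, returns mean error
update_fn()               # applies the clipped average, clears buffers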
Example #8
def make_model(input_size=8,
               output_size=8,
               mem_size=128,
               mem_width=20,
               hidden_size=100):
    """
	Given the model parameters, return a Theano function for the NTM's model
	"""

    P = Parameters()

    # Build the controller
    ctrl = controller.build(P, input_size, output_size, mem_size, mem_width,
                            hidden_size)
    predict = model.build(P, mem_size, mem_width, hidden_size, ctrl)
    input_seq = T.matrix('input_sequence')
    [M_curr, weights, output] = predict(input_seq)

    # Return a Theano function for the NTM
    test_fun = theano.function(inputs=[input_seq], outputs=[weights, output])
    return P, test_fun
Example #9
def create_model(ids, vocab2id, size):
    word_vector_size = size
    hidden_state_size = size

    P = Parameters()
    P.V = create_vocab_vectors(P, vocab2id, word_vector_size)
    P.W_predict = np.zeros(P.V.get_value().shape, dtype=np.float32).T
    P.b_predict = np.zeros((P.V.get_value().shape[0], ), dtype=np.float32)
    X = P.V[ids]

    step = build_lstm_step(P, word_vector_size, hidden_state_size)

    [states, _], _ = theano.scan(step,
                                 sequences=[X],
                                 outputs_info=[P.init_h, P.init_c])

    scores = T.dot(states, P.W_predict) + P.b_predict
    scores = T.nnet.softmax(scores)

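    # next-token objective: the score at position t is matched against the
    # token at position t + 1, hence scores[:-1] vs ids[1:]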
    log_likelihood, cross_ent = word_cost(scores[:-1], ids[1:])
    cost = log_likelihood  #+ 1e-4 * sum( T.sum(abs(w)) for w in P.values() )
    obv_cost = cross_ent
    return scores, cost, obv_cost, P
Example #10
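# (fragment: the enclosing function definition, which supplies P, name,
# input_sizes, output_size, weights and activation, is not shown)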
    acc_size = 0
    for i, size in enumerate(input_sizes):
        P["W_%s_%d" % (name, i)] = weights[acc_size:acc_size + size]
        Ws.append(P["W_%s_%d" % (name, i)])
        acc_size += size
    P["b_%s" % name] = np.zeros((output_size, ), dtype=np.float32)
    b = P["b_%s" % name]

    def transform(Xs):
        acc = 0.
        for X, W in zip(Xs, Ws):
            if X.dtype.startswith('int'):
                acc += W[X]
            else:
                acc += T.dot(X, W)
        output = activation(acc + b)
        output.name = name
        return output

    return transform
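
# Why W[X] for integer inputs: indexing rows of W is the same linear map as
# multiplying a one-hot encoding by W, without materialising the one-hot
# matrix. A small standalone numpy check of that equivalence:
import numpy as np
W = np.random.randn(5, 3).astype(np.float32)
idx = np.array([0, 2, 4])
one_hot = np.eye(5, dtype=np.float32)[idx]
assert np.allclose(W[idx], one_hot.dot(W))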


if __name__ == "__main__":
    import vae
    P = Parameters()
    inferer = build_classifier(P, "z1_latent", [10, 5], [5, 5, 5, 5], 5)

    print inferer(
        [T.constant(np.arange(5)),
         T.constant(np.eye(5, dtype=np.float32))]).eval()
Example #11
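# (fragment: this snippet starts inside the definition of the target
# log-density U(z); lse is assumed to be a log-sum-exp helper)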
        
        B1 = -0.5*(((z2-w1)/0.4)**2) - 0.1 * w4
        B2 = -0.5*(((z2-w1+w3)/0.35)**2) - 0.1 * w4
        B3 = -0.5*(z1**2 + z2**2/5.)
        return lse(lse(B1,B2),B3)
    
    
    from theano_toolkit.parameters import Parameters
    from theano_toolkit import updates
    from pprint import pprint
    floatX = theano.config.floatX


    print 'building model'
    z0 = T.matrix('z0')    
    P = Parameters() 
    iaf, masks = iaf_made_wn(P,L=8,num_units=64,
                             num_hids=1,nonl=T.nnet.elu,
                             cond_bias=False)
    zT, ss = iaf(z0,cond_bias=None)
    parameters = P.values()
    pprint(parameters)
    

    logp = U(zT)
    logq = - ss
    losses = logq - logp
    loss = losses.mean()
    gradients = updates.clip_deltas(T.grad(loss, wrt=parameters), 5)    
    P_train = Parameters()    
    fupdates = updates.adam(parameters, gradients,
Example #12
def crossentropy(output, Y):
    if output.owner.op == T.nnet.softmax_op:
        x = output.owner.inputs[0]
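        # stable log-sum-exp: log sum_j exp(x_j) = k + log sum_j exp(x_j - k),
        # with k the row max, so exp never overflows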
        k = T.max(x, axis=1, keepdims=True)
        sum_x = T.log(T.sum(T.exp(x - k), axis=1)) + k
        return -x[T.arange(x.shape[0]), Y] + sum_x
    else:
        return T.nnet.categorical_crossentropy(output, Y)

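# Hypothetical standalone check (plain numpy, not from the original file)
# that the stabilised expression survives inputs where naive exp overflows:
import numpy as np
x = np.array([[1000., 0., -1000.]])
k = x.max(axis=1, keepdims=True)
lse = np.log(np.exp(x - k).sum(axis=1)) + k[:, 0]
print -x[0, 0] + lse[0]   # cross-entropy for target class 0: 0.0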

if __name__ == "__main__":
    config.parse_args()
    total_frames = sum(x.shape[0]
                       for x, _ in frame_label_data.training_stream())
    logging.info("Total frames: %d" % total_frames)
    P = Parameters()
    predict = model.build(P)

    X = T.matrix('X')
    Y = T.ivector('Y')
    _, outputs = predict(X)
    cross_entropy = T.mean(crossentropy(outputs, Y))
    parameters = P.values()
    loss = cross_entropy + \
            (0.5/total_frames) * sum(T.sum(T.sqr(w)) for w in parameters)

    gradients = T.grad(loss, wrt=parameters)
    logging.info("Parameters to tune:" +
                 ', '.join(sorted(w.name for w in parameters)))

    update_vars = Parameters()
Example #13
    def __init__(self, input_size, output_size, mem_size, mem_width,
                 hidden_sizes, num_heads, max_epochs, momentum, learning_rate,
                 grad_clip, l2_norm):

        self.input_size = input_size
        self.output_size = output_size
        self.mem_size = mem_size
        self.mem_width = mem_width
        self.hidden_sizes = hidden_sizes
        self.num_heads = num_heads
        self.max_epochs = max_epochs
        self.momentum = momentum
        self.learning_rate = learning_rate
        self.grad_clip = grad_clip
        self.l2_norm = l2_norm

        self.best_train_cost = np.inf
        self.best_valid_cost = np.inf
        #self.train = None
        #self.cost = None

        self.train_his = []

        P = Parameters()
        ctrl = controller.build(P, self.input_size, self.output_size,
                                self.mem_size, self.mem_width,
                                self.hidden_sizes)
        predict = model.build(P, self.mem_size, self.mem_width,
                              self.hidden_sizes[-1], ctrl, self.num_heads)

        input_seq = T.matrix('input_sequence')
        output_seq = T.matrix('output_sequence')

        [M_curr, weights, output] = predict(input_seq)
        # output_seq_pred = seqs[-1]

        cross_entropy = T.sum(T.nnet.binary_crossentropy(
            5e-6 + (1 - 2 * 5e-6) * output, output_seq),
                              axis=1)

        self.params = P.values()

        l2 = sum(T.sum(T.sqr(p)) for p in self.params)

        cost = T.sum(cross_entropy) + self.l2_norm * l2
        #     cost = T.sum(cross_entropy) + 1e-3*l2

        grads = [
            T.clip(g, grad_clip[0], grad_clip[1])
            for g in T.grad(cost, wrt=self.params)
        ]
        #     grads  = [ T.clip(g,-100,100) for g in T.grad(cost,wrt=params) ]
        #     grads  = [ T.clip(g,1e-9, 0.2) for g in T.grad(cost,wrt=params) ]

        self.train = theano.function(
            inputs=[input_seq, output_seq],
            outputs=cost,
            #             updates=updates.adadelta(params,grads)
            updates=updates.rmsprop(self.params,
                                    grads,
                                    momentum=self.momentum,
                                    learning_rate=self.learning_rate))

        self.predict_cost = theano.function(inputs=[input_seq, output_seq],
                                            outputs=cost)

        self.predict = theano.function(inputs=[input_seq],
                                       outputs=[weights, output])


def weight_norm(u, norm=1.9356):
    in_norm = T.sqrt(T.sum(T.sqr(u), axis=0))
    ratio = T.minimum(norm, in_norm) / (in_norm + 1e-8)
    return ratio * u


def normalise_weights(updates):
    return [(p, weight_norm(u) if p.name.startswith('W') else u)
            for p, u in updates]
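
# Hypothetical numpy check of the max-norm rule above (not from the original
# file): columns with norm <= 1.9356 pass through unchanged, larger columns
# are rescaled down to exactly that norm.
import numpy as np
u = np.array([[3.0, 0.1], [4.0, 0.1]])          # column norms: 5.0, ~0.14
norms = np.sqrt((u ** 2).sum(axis=0))
ratio = np.minimum(1.9356, norms) / (norms + 1e-8)
print np.sqrt(((u * ratio) ** 2).sum(axis=0))   # -> [1.9356, 0.1414...]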


if __name__ == "__main__":
    P = Parameters()
    extract, _ = model.build(P, "vrnn")
    X = T.tensor3('X')
    l = T.ivector('l')
    [Z_prior_mean, Z_prior_std, Z_mean, Z_std, X_mean, X_std] = extract(X, l)

    parameters = P.values()
    batch_cost = model.cost(X, Z_prior_mean, Z_prior_std, Z_mean, Z_std,
                            X_mean, X_std, l)
    print "Calculating gradient..."
    print parameters
    batch_size = T.cast(X.shape[1], 'float32')

    gradients = T.grad(batch_cost, wrt=parameters)
    gradients = [g / batch_size for g in gradients]
    gradients = clip(5, parameters, gradients)