Example #1
def make_train(input_size, output_size, mem_size, mem_width, hidden_size=100):
    P = Parameters()

    # Build controller. ctrl is a network that takes an external and read input
    # and returns the output of the network and its hidden layer
    ctrl = controller.build(P, input_size, output_size, mem_size, mem_width,
                            hidden_size)

    # Build model that predicts output sequence given input sequence
    predict = model.build(P, mem_size, mem_width, hidden_size, ctrl)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    [M, weights, output_seq_pred] = predict(input_seq)

    # Setup for adadelta updates
    cross_entropy = T.sum(T.nnet.binary_crossentropy(
        5e-6 + (1 - 2 * 5e-6) * output_seq_pred, output_seq),
                          axis=1)
    params = P.values()
    l2 = T.sum(0)
    for p in params:
        l2 = l2 + (p**2).sum()
    cost = T.sum(cross_entropy) + 1e-3 * l2
    # clip gradients
    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]

    train = theano.function(inputs=[input_seq, output_seq],
                            outputs=cost,
                            updates=updates.adadelta(params, grads))

    return P, train
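
# A minimal usage sketch for the function above; make_copy_example is a
# hypothetical stand-in for whatever task generator produces matching
# input/output float32 matrices.
P, train = make_train(input_size=8, output_size=8, mem_size=128, mem_width=20)
for i in xrange(100000):
    input_seq, output_seq = make_copy_example()
    score = train(input_seq, output_seq)
    if i % 1000 == 0:
        print i, score
P.save('model.pkl')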
Example #2
def make_train(input_size,
               output_size,
               mem_size,
               mem_width,
               hidden_sizes=[100]):
    P = Parameters()
    ctrl = controller.build(P, input_size, output_size, mem_size, mem_width,
                            hidden_sizes)
    predict = model.build(P, mem_size, mem_width, hidden_sizes[-1], ctrl)

    input_seq = T.matrix('input_sequence')
    output_seq = T.matrix('output_sequence')
    seqs = predict(input_seq)
    output_seq_pred = seqs[-1]
    cross_entropy = T.sum(T.nnet.binary_crossentropy(
        5e-6 + (1 - 2 * 5e-6) * output_seq_pred, output_seq),
                          axis=1)
    cost = T.sum(cross_entropy)  # + 1e-3 * l2
    params = P.values()
    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]

    train = theano.function(inputs=[input_seq, output_seq],
                            outputs=T.sum(cross_entropy),
                            updates=updates.adadelta(params, grads))

    return P, train
Example #3
def create_model(ids,vocab2id,size):
	word_vector_size  = size
	hidden_state_size = size
	
	P = Parameters()
	P.V = create_vocab_vectors(P,vocab2id,word_vector_size)
	P.W_predict = np.zeros(P.V.get_value().shape).T
	P.b_predict = np.zeros((P.V.get_value().shape[0],))
	X = P.V[ids]

	step = build_lstm_step(P,word_vector_size,hidden_state_size)

	[states,_],_ = theano.scan(
			step,
			sequences    = [X],
			outputs_info = [P.init_h,P.init_c]
		)

	scores = T.dot(states,P.W_predict) + P.b_predict
	scores = T.nnet.softmax(scores)

	log_likelihood, cross_ent = word_cost(scores[:-1],ids[1:])
	cost = log_likelihood #+ 1e-4 * sum( T.sum(abs(w)) for w in P.values() )
	obv_cost = cross_ent
	return scores, cost, obv_cost, P
Example #4
def make_train(input_size,output_size,mem_size,mem_width,hidden_sizes=[100]):
	P = Parameters()
	ctrl = controller.build(P,input_size,output_size,mem_size,mem_width,hidden_sizes)
	predict = model.build(P,mem_size,mem_width,hidden_sizes[-1],ctrl)
	
	input_seq = T.matrix('input_sequence')
	output_seq = T.matrix('output_sequence')
	seqs = predict(input_seq)
	output_seq_pred = seqs[-1]
	cross_entropy = T.sum(T.nnet.binary_crossentropy(5e-6 + (1 - 2*5e-6)*output_seq_pred,output_seq),axis=1)
	params = P.values()
	l2 = T.sum(0)
	for p in params:
		l2 = l2 + (p ** 2).sum()
	cost = T.sum(cross_entropy) + 1e-4*l2
	grads  = [ T.clip(g,-10,10) for g in T.grad(cost,wrt=params) ]
	
	train = theano.function(
			inputs=[input_seq,output_seq],
			outputs=cost,
			# updates=updates.adadelta(params,grads)
			updates = updates.rmsprop(params,grads,learning_rate = 1e-5)
		)

	return P,train
Example #5
def make_train(input_size,output_size,mem_size,mem_width,hidden_size=100):
	P = Parameters()

	# Build controller. ctrl is a network that takes an external and read input
	# and returns the output of the network and its hidden layer
	ctrl = controller.build(P,input_size,output_size,mem_size,mem_width,hidden_size)

	# Build model that predicts output sequence given input sequence
	predict = model.build(P,mem_size,mem_width,hidden_size,ctrl)

	input_seq = T.matrix('input_sequence')
	output_seq = T.matrix('output_sequence')
	[M,weights,output_seq_pred] = predict(input_seq)

	# Setup for adadelta updates
	cross_entropy = T.sum(T.nnet.binary_crossentropy(5e-6 + (1 - 2*5e-6)*output_seq_pred,output_seq),axis=1)
	params = P.values()
	l2 = T.sum(0)
	for p in params:
		l2 = l2 + (p ** 2).sum()
	cost = T.sum(cross_entropy) + 1e-3*l2
	# clip gradients
	grads  = [ T.clip(g,-100,100) for g in T.grad(cost,wrt=params) ]

	train = theano.function(
			inputs=[input_seq,output_seq],
			outputs=cost,
			updates=updates.adadelta(params,grads)
		)

	return P,train
Example #6
def prepare_functions(input_size, hidden_size, latent_size, step_count,
                      batch_size, train_X, valid_X):
    P = Parameters()
    encode_decode = model.build(P,
                                input_size=input_size,
                                hidden_size=hidden_size,
                                latent_size=latent_size)
    P.W_decoder_input_0.set_value(P.W_decoder_input_0.get_value() * 10)

    X = T.matrix('X')
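    # NOTE: this overrides the step_count argument with a fixed value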
    step_count = 10
    parameters = P.values()

    cost_symbs = []
    for s in xrange(step_count):
        Z_means, Z_stds, alphas, \
            X_mean, log_pi_samples = encode_decode(X, step_count=s + 1)
        batch_recon_loss, log_p = model.recon_loss(X, X_mean, log_pi_samples)
        recon_loss = T.mean(batch_recon_loss, axis=0)
        reg_loss = T.mean(model.reg_loss(Z_means, Z_stds, alphas), axis=0)
        vlb = recon_loss + reg_loss
        corr = T.mean(T.eq(T.argmax(log_p, axis=0),
                           T.argmax(log_pi_samples, axis=0)),
                      axis=0)
        cost_symbs.append(vlb)

    avg_cost = sum(cost_symbs) / step_count
    cost = avg_cost + 1e-3 * sum(T.sum(T.sqr(w)) for w in parameters)

    gradients = updates.clip_deltas(T.grad(cost, wrt=parameters), 5)

    print "Updated parameters:"
    pprint(parameters)
    idx = T.iscalar('idx')

    train = theano.function(
        inputs=[idx],
        outputs=[
            vlb, recon_loss, reg_loss,
            T.max(T.argmax(log_pi_samples, axis=0)), corr
        ],
        updates=updates.adam(parameters, gradients, learning_rate=1e-4),
        givens={X: train_X[idx * batch_size:(idx + 1) * batch_size]})

    validate = theano.function(inputs=[], outputs=vlb, givens={X: valid_X})

    sample = theano.function(inputs=[],
                             outputs=[
                                 X, X_mean,
                                 T.argmax(log_pi_samples, axis=0),
                                 T.exp(log_pi_samples)
                             ],
                             givens={X: valid_X[:10]})

    return train, validate, sample
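
# A plausible driver loop for the three returned functions; the sizes and
# epoch count are placeholder assumptions, and train_X must be a
# theano.shared matrix, as the `givens` above require.
train, validate, sample = prepare_functions(input_size=784, hidden_size=256,
                                            latent_size=32, step_count=10,
                                            batch_size=100,
                                            train_X=train_X, valid_X=valid_X)
batches = train_X.get_value().shape[0] // 100
for epoch in xrange(20):
    for i in xrange(batches):
        vlb, recon, reg, max_idx, corr = train(i)
    print epoch, validate()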
Example #7
def make_functions(
        input_size, output_size, mem_size, mem_width, hidden_sizes=[100]):

    start_time = time.time()

    input_seqs  = T.btensor3('input_sequences')
    output_seqs = T.btensor3('output_sequences')

    P = Parameters()
    process = model.build(P,
            input_size, output_size, mem_size, mem_width, hidden_sizes[0])
    outputs = process(T.cast(input_seqs,'float32'))
    output_length = (input_seqs.shape[1] - 2) // 2

    Y = output_seqs[:,-output_length:,:-2]
    Y_hat = T.nnet.sigmoid(outputs[:,-output_length:,:-2])

    cross_entropy = T.mean(T.nnet.binary_crossentropy(Y_hat,Y))
    bits_loss = cross_entropy * (Y.shape[1] * Y.shape[2]) / T.log(2)

    params = P.values()

    cost = cross_entropy # + 1e-5 * sum(T.sum(T.sqr(w)) for w in params)

    print "Computing gradients",
    grads = T.grad(cost, wrt=params)
    grads = updates.clip_deltas(grads, np.float32(clip_length))

    print "Done. (%0.3f s)"%(time.time() - start_time)
    start_time = time.time()
    print "Compiling function",
    P_learn = Parameters()

    update_pairs = updates.rmsprop(
                params, grads,
                learning_rate=1e-4,
                P=P_learn
            )

    train = theano.function(
            inputs=[input_seqs, output_seqs],
            outputs=cross_entropy,
            updates=update_pairs,
        )

    test = theano.function(
            inputs=[input_seqs, output_seqs],
            outputs=bits_loss
        )

    print "Done. (%0.3f s)"%(time.time() - start_time)
    print P.parameter_count()
    return P, P_learn, train, test
Example #8
def make_train_functions():
    P = Parameters()
    X = T.bvector('X')
    Y = T.ivector('Y')
    aux = {}

    predict = model.build(
        P,
        input_size=128,
        embedding_size=64,
        controller_size=256,
        stack_size=256,
        output_size=128,
    )

    output = predict(X,aux=aux)
    error = - T.log(output[T.arange(Y.shape[0]),((128+1 + Y)%(128+1))])
    error = error[-(Y.shape[0]/2):]
    parameters = P.values()
    gradients = T.grad(T.sum(error),wrt=parameters)
    shapes = [ p.get_value().shape for p in parameters ]
    count = theano.shared(np.float32(0))
    acc_grads  = [
        theano.shared(np.zeros(s,dtype=np.float32))
        for s in shapes
    ]

    acc_update = [ (a,a+g) for a,g in zip(acc_grads,gradients) ] +\
                 [ (count,count + np.float32(1)) ]
    acc_clear = [ (a,np.float32(0) * a) for a in acc_grads ] +\
                [ (count,np.int32(0)) ]
    avg_grads = [ (g / count) for g in acc_grads ]
    avg_grads = [ clip(g,1) for g in avg_grads ]


    acc = theano.function(
            inputs=[X,Y],
            outputs=T.mean(error),
            updates = acc_update,
        )
    update = theano.function(
            inputs=[],
            updates=updates.adadelta(parameters,avg_grads,learning_rate=1e-8) + acc_clear
        )

    test = theano.function(
            inputs=[X],
            outputs=T.argmax(output,axis=1)[-(X.shape[0]/2):],
        )
    return acc,update,test
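
# The two-phase pattern above accumulates gradients: acc() adds the gradient
# for one (X, Y) pair into shared accumulators, update() applies a single
# adadelta step from their (clipped) average and clears them. A sketch,
# where `examples` is a hypothetical list of (input, label) sequences.
acc, update, test = make_train_functions()
for X, Y in examples:
    mean_error = acc(X, Y)  # accumulate gradients, returns mean error
update()                    # one parameter update; accumulators are reset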
Example #9
def build_network(input_size,hidden_size,constraint_adj=False):
	P = Parameters()
	X = T.bmatrix('X')
	
	P.W_input_hidden = U.initial_weights(input_size,hidden_size)
	P.b_hidden       = U.initial_weights(hidden_size)
	P.b_output       = U.initial_weights(input_size)
	hidden_lin = T.dot(X,P.W_input_hidden)+P.b_hidden
	hidden = T.nnet.sigmoid(hidden_lin)
	output = T.nnet.softmax(T.dot(hidden,P.W_input_hidden.T) + P.b_output)
	parameters = P.values() 
	cost = build_error(X,output,P) 
	if constraint_adj: pass
		#cost = cost + adjacency_constraint(hidden_lin)

	return X,output,cost,P
Example #10
def make_train_functions():
    P = Parameters()
    X = T.bvector('X')
    Y = T.ivector('Y')
    aux = {}

    predict = model.build(
        P,
        input_size=128,
        embedding_size=64,
        controller_size=256,
        stack_size=256,
        output_size=128,
    )

    output = predict(X, aux=aux)
    error = -T.log(output[T.arange(Y.shape[0]), ((128 + 1 + Y) % (128 + 1))])
    error = error[-(Y.shape[0] / 2):]
    parameters = P.values()
    gradients = T.grad(T.sum(error), wrt=parameters)
    shapes = [p.get_value().shape for p in parameters]
    count = theano.shared(np.float32(0))
    acc_grads = [theano.shared(np.zeros(s, dtype=np.float32)) for s in shapes]

    acc_update = [ (a,a+g) for a,g in zip(acc_grads,gradients) ] +\
                 [ (count,count + np.float32(1)) ]
    acc_clear = [ (a,np.float32(0) * a) for a in acc_grads ] +\
                [ (count,np.int32(0)) ]
    avg_grads = [(g / count) for g in acc_grads]
    avg_grads = [clip(g, 1) for g in avg_grads]

    acc = theano.function(
        inputs=[X, Y],
        outputs=T.mean(error),
        updates=acc_update,
    )
    update = theano.function(
        inputs=[],
        updates=updates.adadelta(parameters, avg_grads, learning_rate=1e-8) +
        acc_clear)

    test = theano.function(
        inputs=[X],
        outputs=T.argmax(output, axis=1)[-(X.shape[0] / 2):],
    )
    return acc, update, test
Example #11
def make_train(input_size,output_size,mem_size,mem_width,hidden_sizes=[100]):
	P = Parameters()
	ctrl = controller.build(P,input_size,output_size,mem_size,mem_width,hidden_sizes)
	predict = model.build(P,mem_size,mem_width,hidden_sizes[-1],ctrl)
	
	input_seq = T.matrix('input_sequence')
	output_seq = T.matrix('output_sequence')
	seqs = predict(input_seq)
	output_seq_pred = seqs[-1]
	cross_entropy = T.sum(T.nnet.binary_crossentropy(5e-6 + (1 - 2*5e-6)*output_seq_pred,output_seq),axis=1)
	cost = T.sum(cross_entropy) # + 1e-3 * l2
	params = P.values()
	grads  = [ T.clip(g,-100,100) for g in T.grad(cost,wrt=params) ]

	response_length = input_seq.shape[0]/2
	train = theano.function(
			inputs=[input_seq,output_seq],
			outputs=T.mean(cross_entropy[-response_length:]),
			updates=updates.adadelta(params,grads)
		)

	return P,train
Example #12
def create_model(ids, vocab2id, size):
    word_vector_size = size
    hidden_state_size = size

    P = Parameters()
    P.V = create_vocab_vectors(P, vocab2id, word_vector_size)
    P.W_predict = np.zeros(P.V.get_value().shape).T
    P.b_predict = np.zeros((P.V.get_value().shape[0], ))
    X = P.V[ids]

    step = build_lstm_step(P, word_vector_size, hidden_state_size)

    [states, _], _ = theano.scan(step,
                                 sequences=[X],
                                 outputs_info=[P.init_h, P.init_c])

    scores = T.dot(states, P.W_predict) + P.b_predict
    scores = T.nnet.softmax(scores)

    log_likelihood, cross_ent = word_cost(scores[:-1], ids[1:])
    cost = log_likelihood  #+ 1e-4 * sum( T.sum(abs(w)) for w in P.values() )
    obv_cost = cross_ent
    return scores, cost, obv_cost, P
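
# create_model only builds the graph; compiling a training function from the
# returned cost follows the same pattern as the other examples here. A
# sketch, with vocab2id and size assumed to be in scope.
ids = T.ivector('ids')
scores, cost, obv_cost, P = create_model(ids, vocab2id, size)
params = P.values()
grads = T.grad(cost, wrt=params)
train = theano.function(inputs=[ids],
                        outputs=obv_cost,
                        updates=updates.adadelta(params, grads))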
Example #13
def make_model(input_size=8,
               output_size=8,
               mem_size=128,
               mem_width=20,
               hidden_sizes=[100]):
    P = Parameters()
    ctrl = controller.build(P, input_size, output_size, mem_size, mem_width,
                            hidden_sizes)
    predict = model.build(P, mem_size, mem_width, hidden_sizes[-1], ctrl)
    input_seq = T.matrix('input_sequence')
    [M_curr, weights, output] = predict(input_seq)

    test_fun = theano.function(inputs=[input_seq], outputs=[weights, output])
    return P, test_fun
Example #14
	def __init__(self, hidden_size, input_size, vocab_size, entropy_reg = 0.001, key_entropy_reg = 0.001, stack_size=1, celltype=LSTM):

		# core layer in RNN/LSTM
		self.model = StackedCells(input_size, celltype=celltype, layers =[hidden_size] * stack_size)

		# add an embedding
		self.model.layers.insert(0, Embedding(vocab_size, input_size))

		# add a classifier:
		self.model.layers.append(Layer(hidden_size, vocab_size, activation = softmax))

		self.entropy_reg     = entropy_reg
		self.key_entropy_reg = key_entropy_reg

		self.turing_params = Parameters()
		#init turing machine model
		self.turing_updates , self.turing_predict = turing_model.build(self.turing_params , hidden_size , vocab_size)
		self.hidden_size = hidden_size         
		# inputs are matrices of indices,
		# each row is a sentence, each column a timestep
		self._stop_word   = theano.shared(np.int32(999999999), name="stop word")
		self.for_how_long = T.ivector()
		self.mask_matrix = T.imatrix()
		self.input_mat = T.imatrix()
		self.priming_word = T.iscalar()
		self.srng = T.shared_randomstreams.RandomStreams(np.random.randint(0, 1024))

		# create symbolic variables for prediction:
		#change by darong #issue : what is greedy
		self.lstm_predictions = self.create_lstm_prediction()
		self.final_predictions,self.entropy,self.key_entropy = self.create_final_prediction()

		# create symbolic variable for greedy search:
		self.greedy_predictions = self.create_lstm_prediction(greedy=True)

		# create gradient training functions:
		self.create_cost_fun()#create 2 cost func(lstm final)

		self.lstm_lr = 0.01
		self.turing_lr = 0.01
		self.all_lr = 0.01
		self.create_training_function()#create 3 functions(lstm turing all)
		self.create_predict_function()#create 2 predictions(lstm final)

		# create ppl
		self.lstm_ppl = self.create_lstm_ppl()
		self.final_ppl = self.create_final_ppl()
		self.create_ppl_function()
Example #15
def make_functions(input_size,
                   output_size,
                   mem_size,
                   mem_width,
                   hidden_sizes=[100]):

    start_time = time.time()

    input_seqs = T.btensor3('input_sequences')
    output_seqs = T.btensor3('output_sequences')

    P = Parameters()
    process = model.build(P, input_size, output_size, mem_size, mem_width,
                          hidden_sizes[0])
    outputs = process(T.cast(input_seqs, 'float32'))
    output_length = (input_seqs.shape[1] - 2) // 2

    Y = output_seqs[:, -output_length:, :-2]
    Y_hat = T.nnet.sigmoid(outputs[:, -output_length:, :-2])

    cross_entropy = T.mean(T.nnet.binary_crossentropy(Y_hat, Y))
    bits_loss = cross_entropy * (Y.shape[1] * Y.shape[2]) / T.log(2)

    params = P.values()

    cost = cross_entropy  # + 1e-5 * sum(T.sum(T.sqr(w)) for w in params)

    print "Computing gradients",
    grads = T.grad(cost, wrt=params)
    grads = updates.clip_deltas(grads, np.float32(clip_length))

    print "Done. (%0.3f s)" % (time.time() - start_time)
    start_time = time.time()
    print "Compiling function",
    P_learn = Parameters()

    update_pairs = updates.rmsprop(params,
                                   grads,
                                   learning_rate=1e-4,
                                   P=P_learn)

    train = theano.function(
        inputs=[input_seqs, output_seqs],
        outputs=cross_entropy,
        updates=update_pairs,
    )

    test = theano.function(inputs=[input_seqs, output_seqs], outputs=bits_loss)

    print "Done. (%0.3f s)" % (time.time() - start_time)
    print P.parameter_count()
    return P, P_learn, train, test
Example #16
def make_model(input_size=8,
               output_size=8,
               mem_size=128,
               mem_width=20,
               hidden_size=100):
    """
	Given the model parameters, return a Theano function for the NTM's model
	"""

    P = Parameters()

    # Build the controller
    ctrl = controller.build(P, input_size, output_size, mem_size, mem_width,
                            hidden_size)
    predict = model.build(P, mem_size, mem_width, hidden_size, ctrl)
    input_seq = T.matrix('input_sequence')
    [M_curr, weights, output] = predict(input_seq)

    # Return a Theano function for the NTM
    test_fun = theano.function(inputs=[input_seq], outputs=[weights, output])
    return P, test_fun
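
# Running the compiled NTM function on one sequence might look like this;
# the saved parameter file and the random bit-sequence input are assumptions.
P, test_fun = make_model()
P.load('model.pkl')
input_seq = np.random.binomial(1, 0.5, (20, 8)).astype(np.float32)
weights, output = test_fun(input_seq)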
Example #17
def build_network(input_size, hidden_size, constraint_adj=False):
    P = Parameters()
    X = T.bmatrix('X')

    P.W_input_hidden = U.initial_weights(input_size, hidden_size)
    P.b_hidden = U.initial_weights(hidden_size)
    P.b_output = U.initial_weights(input_size)
    hidden_lin = T.dot(X, P.W_input_hidden) + P.b_hidden
    hidden = T.nnet.sigmoid(hidden_lin)
    output = T.nnet.softmax(T.dot(hidden, P.W_input_hidden.T) + P.b_output)
    parameters = P.values()
    cost = build_error(X, output, P)
    if constraint_adj: pass
    #cost = cost + adjacency_constraint(hidden_lin)

    return X, output, cost, P
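
# build_network returns only the symbolic graph; a training function can be
# compiled from the returned cost in the usual way (a sketch, assuming
# build_error above defines the reconstruction cost).
X, output, cost, P = build_network(input_size=100, hidden_size=20)
params = P.values()
grads = T.grad(cost, wrt=params)
train = theano.function(inputs=[X],
                        outputs=cost,
                        updates=updates.adadelta(params, grads))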
Example #18
    acc_size = 0
    for i, size in enumerate(input_sizes):
        P["W_%s_%d" % (name, i)] = weights[acc_size:acc_size + size]
        Ws.append(P["W_%s_%d" % (name, i)])
        acc_size += size
    P["b_%s" % name] = np.zeros((output_size, ), dtype=np.float32)
    b = P["b_%s" % name]

    def transform(Xs):
        acc = 0.
        for X, W in zip(Xs, Ws):
            if X.dtype.startswith('int'):
                acc += W[X]
            else:
                acc += T.dot(X, W)
        output = activation(acc + b)
        output.name = name
        return output

    return transform


if __name__ == "__main__":
    import vae
    P = Parameters()
    inferer = build_classifier(P, "z1_latent", [10, 5], [5, 5, 5, 5], 5)

    print inferer(
        [T.constant(np.arange(5)),
         T.constant(np.eye(5, dtype=np.float32))]).eval()
Example #19
    P.b_output = np.zeros((output_size, ))

    def model(X):
        hidden = lstm_layer(X)[1]
        return T.nnet.softmax(T.dot(hidden, P.W_output) + P.b_output)

    return model


def label_seq(string):
    idxs = font.indexify(string)
    return idxs


if __name__ == "__main__":
    P = Parameters()
    X = T.matrix('X')
    Y = T.ivector('Y')

    predict = build_model(P, 8, 512, len(font.chars) + 1)

    probs = predict(X)
    alpha = 0.5
    params = P.values()
    cost = ctc.cost(probs, Y)  #+ 1e-8 * sum(T.sum(T.sqr(w)) for w in params)
    gradients = T.grad(cost, wrt=params)

    gradient_acc = [theano.shared(0 * p.get_value()) for p in params]
    counter = theano.shared(np.float32(0.))
    acc = theano.function(inputs=[X, Y],
                          outputs=cost,
Example #20
		)
	return acc,update

if __name__ == "__main__":
	training_file = sys.argv[1]
	compute_tree_exists = False

	vocab_in = vocab.load("qa2.pkl")
	vocab_size = len(vocab_in)
	print "Vocab size is:", vocab_size
	evidence_count = 2
	if compute_tree_exists:
		inputs,outputs,params,grads = pickle.load(open("compute_tree.pkl"))
	else:
		print "Creating compute tree...",
		P = Parameters()
		story = T.ivector('story')
		idxs  = T.ivector('idxs')
		qstn  = T.ivector('qstn')
		ans_evds = T.ivector('ans_evds')
		ans_lbl = T.iscalar('ans_lbl')

		attention = model.build(P,
				word_rep_size = 128,
				stmt_hidden_size = 128,
				diag_hidden_size = 128,
				vocab_size  = vocab_size,
				output_size = vocab_size,
				map_fun_size = 128,
				evidence_count = evidence_count
				)
Example #21
    return acc, update


if __name__ == "__main__":
    training_file = sys.argv[1]
    compute_tree_exists = False

    vocab_in = vocab.load("qa2.pkl")
    vocab_size = len(vocab_in)
    print "Vocab size is:", vocab_size
    evidence_count = 2
    if compute_tree_exists:
        inputs, outputs, params, grads = pickle.load(open("compute_tree.pkl"))
    else:
        print "Creating compute tree...",
        P = Parameters()
        story = T.ivector('story')
        idxs = T.ivector('idxs')
        qstn = T.ivector('qstn')
        ans_evds = T.ivector('ans_evds')
        ans_lbl = T.iscalar('ans_lbl')

        attention = model.build(P,
                                word_rep_size=128,
                                stmt_hidden_size=128,
                                diag_hidden_size=128,
                                vocab_size=vocab_size,
                                output_size=vocab_size,
                                map_fun_size=128,
                                evidence_count=evidence_count)
Example #22
    predict = T.nnet.softmax(T.dot(hidden, W_hidden_output) + b_output)

    return X, predict


def label_seq(string):
    idxs = font.indexify(string)
    result = np.ones((len(idxs) * 2 + 1,), dtype=np.int32) * -1
    result[np.arange(len(idxs)) * 2 + 1] = idxs
    print result
    return result


if __name__ == "__main__":
    P = Parameters()
    X = T.matrix('X')
    Y = T.ivector('Y')
    X, predict = build_model(P, X, 10, 10, 10)

    cost = ctc.cost(predict, Y)
    params = P.values()
    grad = T.grad(cost, wrt=params)
    train = theano.function(
        inputs=[X, Y],
        outputs=cost,
        updates=updates.adadelta(params, grad)
    )

    for _ in xrange(10):
        print train(np.eye(10, dtype=np.float32)[::-1], np.arange(10, dtype=np.int32))
Example #23
    P.W_output = np.zeros((hidden_size,output_size))
    P.b_output = np.zeros((output_size,))

    def model(X):
        hidden = lstm_layer(X)[1]
        return T.nnet.softmax(T.dot(hidden,P.W_output) + P.b_output)
    return model


def label_seq(string):
    idxs = font.indexify(string)
    return idxs 


if __name__ == "__main__":
    P = Parameters()
    X = T.matrix('X')
    Y = T.ivector('Y')

    predict = build_model(P,8,512,len(font.chars)+1)


    probs = predict(X)
    alpha = 0.5
    params = P.values()
    cost = ctc.cost(probs, Y) #+ 1e-8 * sum(T.sum(T.sqr(w)) for w in params)
    gradients = T.grad(cost, wrt=params)

    gradient_acc = [ theano.shared(0 * p.get_value()) for p in params ]
    counter = theano.shared(np.float32(0.))
    acc = theano.function(
Example #24
        
        B1 = -0.5*(((z2-w1)/0.4)**2) - 0.1 * w4
        B2 = -0.5*(((z2-w1+w3)/0.35)**2) - 0.1 * w4
        B3 = -0.5*(z1**2 + z2**2/5.)
        return lse(lse(B1,B2),B3)
    
    
    from theano_toolkit.parameters import Parameters
    from theano_toolkit import updates
    from pprint import pprint
    floatX = theano.config.floatX


    print 'building model'
    z0 = T.matrix('z0')    
    P = Parameters() 
    iaf, masks = iaf_made_wn(P,L=8,num_units=64,
                             num_hids=1,nonl=T.nnet.elu,
                             cond_bias=False)
    zT, ss = iaf(z0,cond_bias=None)
    parameters = P.values()
    pprint(parameters)
    

    logp = U(zT)
    logq = - ss
    losses = logq - logp
    loss = losses.mean()
    gradients = updates.clip_deltas(T.grad(loss, wrt=parameters), 5)    
    P_train = Parameters()    
    fupdates = updates.adam(parameters, gradients,
Example #25
if __name__ == "__main__":
    batch_size = 256
    validation = 0.1

    all_X, all_W, all_Y = data.load('data/training.csv')
    validation_count = int(math.ceil(all_X.shape[0] * validation))
    train_X, train_W, train_Y = (all_X[:-validation_count],
                                 all_W[:-validation_count],
                                 all_Y[:-validation_count])

    valid_X, valid_W, valid_Y = (all_X[-validation_count:],
                                 all_W[-validation_count:],
                                 all_Y[-validation_count:])

    P = Parameters()
    data_X = theano.shared(train_X)
    data_W = theano.shared(train_W)
    data_Y = theano.shared(train_Y)

    train, test = get_train_test_fn(P, data_X, data_W, data_Y)
    batches = int(math.ceil(train_X.shape[0] / float(batch_size)))
    best_score = -np.inf
    for epoch in xrange(20):
        for i in xrange(batches):
            train(i, batch_size)
        scores = test(valid_X, valid_W, valid_Y)
        print scores,
        if scores[0] > best_score:
            P.save('model.pkl')
            best_score = scores[0]
Example #26
import theano.tensor as T
import numpy as np
from theano_toolkit import utils as U
from theano_toolkit import hinton
from theano_toolkit import updates
from theano_toolkit.parameters import Parameters

import ctc
import font
import lstm
from ocr import *

if __name__ == "__main__":
    import sys
    test_word = sys.argv[1]

    P = Parameters()
    X = T.matrix('X')

    predict = build_model(P,8,512,len(font.chars)+1)
    probs = predict(X)
    test = theano.function(inputs=[X],outputs=probs)
    P.load('model.pkl')
    image = font.imagify(test_word)
    hinton.plot(image.astype(np.float32).T[::-1])
    y_seq = label_seq(test_word)
    probs = test(image)
    print " ", ' '.join(font.chars[i] if i < len(font.chars) else "_" for i in np.argmax(probs,axis=1))
    hinton.plot(probs[:,y_seq].T,max_arr=1.)

Example #27
        forget_gate = T.nnet.sigmoid(forget_lin)
        cell_updates = T.tanh(cell_lin)

        cell = forget_gate * prev_cell + in_gate * cell_updates

        out_lin = x_o + h_o + b_o + T.dot(cell, V_o)
        out_gate = T.nnet.sigmoid(out_lin)

        hid = out_gate * T.tanh(cell)
        return cell, hid

    return step


if __name__ == "__main__":
    P = Parameters()
    X = T.ivector('X')
    P.V = np.zeros((8, 8), dtype=np.int32)

    X_rep = P.V[X]
    P.W_output = np.zeros((15, 8), dtype=np.int32)
    lstm_layer = build(P, name="test", input_size=8, hidden_size=15)

    _, hidden = lstm_layer(X_rep)
    output = T.nnet.softmax(T.dot(hidden, P.W_output))
    delay = 5
    label = X[:-delay]
    predicted = output[delay:]

    cost = -T.sum(T.log(predicted[T.arange(predicted.shape[0]), label]))
    params = P.values()
Example #28
		in_gate      = T.nnet.sigmoid(in_lin)
		forget_gate  = T.nnet.sigmoid(forget_lin)
		cell_updates = T.tanh(cell_lin)

		cell = forget_gate * prev_cell + in_gate * cell_updates

		out_lin = x_o + h_o + b_o + T.dot(cell,V_o)
		out_gate = T.nnet.sigmoid(out_lin)

		hid = out_gate * T.tanh(cell)
		return cell,hid
	return step


if __name__ == "__main__":
	P = Parameters()
	X = T.ivector('X')
	P.V = np.zeros((8,8),dtype=np.int32)

	X_rep = P.V[X]
	P.W_output = np.zeros((15,8),dtype=np.int32)
	lstm_layer = build(P,
			name = "test",
			input_size = 8,
			hidden_size =15 
		)

	_,hidden = lstm_layer(X_rep)
	output = T.nnet.softmax(T.dot(hidden,P.W_output))
	delay = 5
	label = X[:-delay]
Example #29
    def __init__(self, 
                 input_size, output_size, mem_size, mem_width, hidden_sizes, num_heads,
                 max_epochs, momentum, learning_rate ,grad_clip, l2_norm):
        
        self.input_size = input_size
        self.output_size = output_size
        self.mem_size = mem_size
        self.mem_width = mem_width
        self.hidden_sizes = hidden_sizes
        self.num_heads = num_heads
        self.max_epochs = max_epochs
        self.momentum = momentum
        self.learning_rate = learning_rate
        self.grad_clip = grad_clip
        self.l2_norm = l2_norm
        
        self.best_train_cost = np.inf
        self.best_valid_cost = np.inf
        #self.train = None
        #self.cost = None
        
        self.train_his = []
        
        P = Parameters()
        ctrl = controller.build( P, self.input_size, self.output_size, self.mem_size, self.mem_width, self.hidden_sizes)
        predict = model.build( P, self.mem_size, self.mem_width, self.hidden_sizes[-1], ctrl, self.num_heads)

        input_seq = T.matrix('input_sequence')
        output_seq = T.matrix('output_sequence')
        
        [M_curr,weights,output] = predict(input_seq)
        # output_seq_pred = seqs[-1]
        
        cross_entropy = T.sum(T.nnet.binary_crossentropy(5e-6 + (1 - 2*5e-6)*output, output_seq),axis=1)
        
        self.params = P.values()
        
        l2 = T.sum(0)
        for p in self.params:
            l2 = l2 + (p ** 2).sum()
            
        cost = T.sum(cross_entropy) + self.l2_norm * l2
    #     cost = T.sum(cross_entropy) + 1e-3*l2
        
        grads  = [ T.clip(g, grad_clip[0], grad_clip[1]) for g in T.grad(cost, wrt=self.params) ]
    #     grads  = [ T.clip(g,-100,100) for g in T.grad(cost,wrt=params) ]
    #     grads  = [ T.clip(g,1e-9, 0.2) for g in T.grad(cost,wrt=params) ]

        self.train = theano.function(
                inputs=[input_seq,output_seq],
                outputs=cost,
    #             updates=updates.adadelta(params,grads)
                updates = updates.rmsprop(self.params, grads, momentum=self.momentum, learning_rate=self.learning_rate )
            )
        
        self.predict_cost = theano.function(
            inputs=[input_seq,output_seq],
            outputs= cost
        )
        
        self.predict = theano.function(
            inputs=[input_seq],
            outputs= [ weights, output]
        )
Example #30
    def __init__(self, input_size, output_size, mem_size, mem_width,
                 hidden_sizes, num_heads, max_epochs, momentum, learning_rate,
                 grad_clip, l2_norm):

        self.input_size = input_size
        self.output_size = output_size
        self.mem_size = mem_size
        self.mem_width = mem_width
        self.hidden_sizes = hidden_sizes
        self.num_heads = num_heads
        self.max_epochs = max_epochs
        self.momentum = momentum
        self.learning_rate = learning_rate
        self.grad_clip = grad_clip
        self.l2_norm = l2_norm

        self.best_train_cost = np.inf
        self.best_valid_cost = np.inf
        #self.train = None
        #self.cost = None

        self.train_his = []

        P = Parameters()
        ctrl = controller.build(P, self.input_size, self.output_size,
                                self.mem_size, self.mem_width,
                                self.hidden_sizes)
        predict = model.build(P, self.mem_size, self.mem_width,
                              self.hidden_sizes[-1], ctrl, self.num_heads)

        input_seq = T.matrix('input_sequence')
        output_seq = T.matrix('output_sequence')

        [M_curr, weights, output] = predict(input_seq)
        # output_seq_pred = seqs[-1]

        cross_entropy = T.sum(T.nnet.binary_crossentropy(
            5e-6 + (1 - 2 * 5e-6) * output, output_seq),
                              axis=1)

        self.params = P.values()

        l2 = T.sum(0)
        for p in self.params:
            l2 = l2 + (p**2).sum()

        cost = T.sum(cross_entropy) + self.l2_norm * l2
        #     cost = T.sum(cross_entropy) + 1e-3*l2

        grads = [
            T.clip(g, grad_clip[0], grad_clip[1])
            for g in T.grad(cost, wrt=self.params)
        ]
        #     grads  = [ T.clip(g,-100,100) for g in T.grad(cost,wrt=params) ]
        #     grads  = [ T.clip(g,1e-9, 0.2) for g in T.grad(cost,wrt=params) ]

        self.train = theano.function(
            inputs=[input_seq, output_seq],
            outputs=cost,
            #             updates=updates.adadelta(params,grads)
            updates=updates.rmsprop(self.params,
                                    grads,
                                    momentum=self.momentum,
                                    learning_rate=self.learning_rate))

        self.predict_cost = theano.function(inputs=[input_seq, output_seq],
                                            outputs=cost)

        self.predict = theano.function(inputs=[input_seq],
                                       outputs=[weights, output])
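
# A usage sketch for the class this constructor belongs to; the class name
# NTM is hypothetical, since the snippet only shows __init__.
ntm = NTM(input_size=8, output_size=8, mem_size=128, mem_width=20,
          hidden_sizes=[100], num_heads=1, max_epochs=10000,
          momentum=0.9, learning_rate=1e-4,
          grad_clip=(-10, 10), l2_norm=1e-3)
cost = ntm.train(input_seq, output_seq)   # one rmsprop step
weights, output = ntm.predict(input_seq)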
Example #31
def crossentropy(output,Y):
    if output.owner.op == T.nnet.softmax_op:
        x = output.owner.inputs[0]
        k = T.max(x,axis=1,keepdims=True)
        sum_x = T.log(T.sum(T.exp(x - k),axis=1)) + k
        return - x[T.arange(x.shape[0]),Y] + sum_x
    else:
        return T.nnet.categorical_crossentropy(output,Y)



if __name__ == "__main__":
    config.parse_args()
    total_frames = sum(x.shape[0] for x,_ in frame_label_data.training_stream())
    logging.info("Total frames: %d"%total_frames)
    P = Parameters()
    predict = model.build(P)

    X = T.matrix('X')
    Y = T.ivector('Y')
    _,outputs = predict(X)
    cross_entropy = T.mean(crossentropy(outputs,Y))
    parameters = P.values() 
    loss = cross_entropy + \
            (0.5/total_frames) * sum(T.sum(T.sqr(w)) for w in parameters)

    gradients = T.grad(loss,wrt=parameters)
    logging.info("Parameters to tune:" + ', '.join(sorted(w.name for w in parameters)))

    update_vars = Parameters()
    logging.debug("Compiling functions...")    
Example #32
def make_train(image_size , word_size , first_hidden_size , proj_size , reg_lambda) :
    #initialize model
    P = Parameters()
    image_projecting = image_project.build(P, image_size, proj_size)
    batched_triplet_encoding , vector_triplet_encoding = triplet_encoding.build(P , word_size , first_hidden_size , proj_size)   

    image_vector = T.vector()

    #training
    correct_triplet =  [T.vector(dtype='float32') , T.vector(dtype='float32') , T.vector(dtype='float32')] #[E,R,E]
    negative_triplet = [T.matrix(dtype='float32') , T.matrix(dtype='float32') , T.matrix(dtype='float32')]

    image_projection_vector = image_projecting(image_vector)
    image_projection_matrix = repeat(image_projection_vector.dimshuffle(('x',0)) , negative_triplet[0].shape[0] , axis=0)
    correct_triplet_encoding_vector = vector_triplet_encoding(correct_triplet[0] , correct_triplet[1] , correct_triplet[2])
    negative_triplet_encoding_matrix = batched_triplet_encoding(negative_triplet[0] , negative_triplet[1] , negative_triplet[2])

    correct_cross_dot_scalar = T.dot(image_projection_vector , correct_triplet_encoding_vector)
    negative_cross_dot_vector = T.batched_dot(image_projection_matrix , negative_triplet_encoding_matrix)

    #margin cost
    zero_cost = T.zeros_like(negative_cross_dot_vector)
    margin_cost = 1 - correct_cross_dot_scalar + negative_cross_dot_vector
    cost_vector = T.switch(T.gt(zero_cost , margin_cost) , zero_cost , margin_cost)

    #regulizar cost
    params = P.values()
    l2 = T.sum(0)
    for p in params:
        l2 = l2 + (p ** 2).sum()        
    cost = T.sum(cost_vector)/T.shape(negative_triplet[0])[0] + reg_lambda * l2 #assume word vector has been put into P #unsolved
    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]

    lr = T.scalar(name='learning rate',dtype='float32')
    train = theano.function(
        inputs=[image_vector, correct_triplet[0], correct_triplet[1], correct_triplet[2], negative_triplet[0], negative_triplet[1], negative_triplet[2], lr],
        outputs=cost,
        updates=updates.rmsprop(params, grads, learning_rate=lr),
        allow_input_downcast=True
    )

    #valid
    valid = theano.function(
        inputs=[image_vector, correct_triplet[0], correct_triplet[1], correct_triplet[2], negative_triplet[0], negative_triplet[1], negative_triplet[2]],
        outputs=cost,
        allow_input_downcast=True

    )
    #testing
    all_triplet = [T.matrix(dtype='float32') , T.matrix(dtype='float32') , T.matrix(dtype='float32')]
    image_projection_matrix_test = repeat(image_projection_vector.dimshuffle(('x',0)) , all_triplet[0].shape[0] , axis=0)
    all_triplet_encoding_matrix = batched_triplet_encoding(all_triplet[0] , all_triplet[1] , all_triplet[2])
    all_cross_dot_vector = T.batched_dot(image_projection_matrix_test , all_triplet_encoding_matrix)

    test = theano.function(
        inputs=[image_vector, all_triplet[0], all_triplet[1], all_triplet[2]],
        outputs=all_cross_dot_vector,
        allow_input_downcast=True

    )

    #default
    P_default = Parameters()
    P_default['left']     = 2 * (np.random.rand(word_size) - 0.5)
    P_default['right']    = 2 * (np.random.rand(word_size) - 0.5)
    P_default['relation'] = 2 * (np.random.rand(word_size) - 0.5)

    correct_triplet_d =  [T.vector(dtype='float32') , T.vector(dtype='float32') , T.vector(dtype='float32')] #[E,R,E]
    negative_triplet_d = [T.matrix(dtype='float32') , T.matrix(dtype='float32') , T.matrix(dtype='float32')]    

    correct_triplet_d_train = [correct_triplet_d,correct_triplet_d,correct_triplet_d]
    negative_triplet_d_train = [negative_triplet_d,negative_triplet_d,negative_triplet_d]

    cost = 0
    for i in range(3) :
        if i == 0 :
            correct_triplet_d_train[0]  = [correct_triplet_d[0],P_default['relation'],P_default['right']]
            negative_triplet_d_train[0] = [negative_triplet_d[0],repeat(P_default['relation'].dimshuffle(('x',0)),negative_triplet_d[0].shape[0] , axis=0),repeat(P_default['right'].dimshuffle(('x',0)),negative_triplet_d[0].shape[0] , axis=0)]
        elif i == 1 :
            correct_triplet_d_train[1]  = [P_default['left'],correct_triplet_d[1],P_default['right']]
            negative_triplet_d_train[1] = [repeat(P_default['left'].dimshuffle(('x',0)),negative_triplet_d[1].shape[0] , axis=0),negative_triplet_d[1],repeat(P_default['right'].dimshuffle(('x',0)),negative_triplet_d[1].shape[0] , axis=0)]
        elif i == 2 :
            correct_triplet_d_train[2]  = [P_default['left'],P_default['relation'],correct_triplet_d[2]]
            negative_triplet_d_train[2] = [repeat(P_default['left'].dimshuffle(('x',0)),negative_triplet_d[2].shape[0] , axis=0),repeat(P_default['relation'].dimshuffle(('x',0)),negative_triplet_d[2].shape[0] , axis=0),negative_triplet_d[2]]

        image_projection_matrix_d = repeat(image_projection_vector.dimshuffle(('x',0)) , negative_triplet_d[i].shape[0] , axis=0)
        correct_triplet_encoding_vector_d = vector_triplet_encoding(correct_triplet_d_train[i][0] , correct_triplet_d_train[i][1] , correct_triplet_d_train[i][2])
        negative_triplet_encoding_matrix_d = batched_triplet_encoding(negative_triplet_d_train[i][0] , negative_triplet_d_train[i][1] , negative_triplet_d_train[i][2])

        correct_cross_dot_scalar_d = T.dot(image_projection_vector , correct_triplet_encoding_vector_d)
        negative_cross_dot_vector_d = T.batched_dot(image_projection_matrix_d , negative_triplet_encoding_matrix_d)

        #margin cost
        zero_cost_d = T.zeros_like(negative_cross_dot_vector_d)
        margin_cost_d = 1 - correct_cross_dot_scalar_d + negative_cross_dot_vector_d
        cost_vector_d = T.switch(T.gt(zero_cost_d , margin_cost_d) , zero_cost_d , margin_cost_d)        

        cost = cost + T.sum(cost_vector_d)/T.shape(negative_triplet_d[i])[0]

    params_d = P_default.values()
    l2 = T.sum(0)
    for p in params_d:
        l2 = l2 + (p ** 2).sum()
    cost = cost + 0.01*l2

    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params_d)]

    train_default = theano.function(
        inputs=[image_vector, correct_triplet_d[0], correct_triplet_d[1], correct_triplet_d[2], negative_triplet_d[0], negative_triplet_d[1], negative_triplet_d[2], lr],
        outputs=cost,
        updates=updates.rmsprop(params_d, grads, learning_rate=lr),
        allow_input_downcast=True
    )

    all_triplet_d = [T.matrix(dtype='float32') , T.matrix(dtype='float32') , T.matrix(dtype='float32')]
    all_triplet_d_test = [all_triplet_d,all_triplet_d,all_triplet_d]
    result = [[],[],[]]
    for i in range(3) :
        image_projection_matrix_test_d = repeat(image_projection_vector.dimshuffle(('x',0)) , all_triplet_d[i].shape[0] , axis=0)
        if i == 0 :
            all_triplet_d_test[0] = [all_triplet_d[0],repeat(P_default['relation'].dimshuffle(('x',0)),all_triplet_d[0].shape[0] , axis=0),repeat(P_default['right'].dimshuffle(('x',0)),all_triplet_d[0].shape[0] , axis=0)]
        elif i == 1 :
            all_triplet_d_test[1] = [repeat(P_default['left'].dimshuffle(('x',0)),all_triplet_d[1].shape[0] , axis=0),all_triplet_d[1],repeat(P_default['right'].dimshuffle(('x',0)),all_triplet_d[1].shape[0] , axis=0)]
        elif i == 2 :
            all_triplet_d_test[2] = [repeat(P_default['left'].dimshuffle(('x',0)),all_triplet_d[2].shape[0] , axis=0),repeat(P_default['relation'].dimshuffle(('x',0)),all_triplet_d[2].shape[0] , axis=0),all_triplet_d[2]]

        all_triplet_encoding_matrix_d = batched_triplet_encoding(all_triplet_d_test[i][0] , all_triplet_d_test[i][1] , all_triplet_d_test[i][2])
        result[i] = T.batched_dot(image_projection_matrix_test_d , all_triplet_encoding_matrix_d)

    test_default = theano.function(
        inputs=[image_vector, all_triplet_d[0], all_triplet_d[1], all_triplet_d[2]],
        outputs=result,
        allow_input_downcast=True

    )


    return P , P_default , train , valid , test , train_default , test_default
Example #33
        forget_gate = T.nnet.sigmoid(forget_lin)
        cell_updates = T.tanh(cell_lin)

        cell = forget_gate * prev_cell + in_gate * cell_updates

        out_lin = x_o + h_o + b_o + T.dot(cell, V_o)
        out_gate = T.nnet.sigmoid(out_lin)

        hid = out_gate * T.tanh(cell)
        return cell, hid

    return step


if __name__ == "__main__":
    P = Parameters()
    X = T.ivector("X")
    P.V = np.zeros((8, 8), dtype=np.int32)

    X_rep = P.V[X]
    P.W_output = np.zeros((15, 8), dtype=np.int32)
    lstm_layer = build(P, name="test", input_size=8, hidden_size=15)

    _, hidden = lstm_layer(X_rep)
    output = T.nnet.softmax(T.dot(hidden, P.W_output))
    delay = 5
    label = X[:-delay]
    predicted = output[delay:]

    cost = -T.sum(T.log(predicted[T.arange(predicted.shape[0]), label]))
    params = P.values()
Example #34
import theano.tensor as T
import numpy as np
from theano_toolkit.parameters import Parameters
import data_io
import model
import vae
import matplotlib
import sys
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation

if __name__ == "__main__":
    model.SAMPLED_LAYERS = [int(s) for s in sys.argv[1:]]
    print model.SAMPLED_LAYERS
    P = Parameters()
    autoencoder, inpaint = model.build(P)

    parameters = P.values()
    X = T.itensor4('X')
    X_hat, posteriors, priors = \
        autoencoder(T.cast(X, 'float32') / np.float32(255.))
    latent_kls = [
        T.mean(vae.kl_divergence(po_m, po_s, pr_m, pr_s), axis=0)
        for (po_m, po_s), (pr_m, pr_s) in zip(posteriors, priors)
    ]
    recon_loss = model.cost(X_hat, X[:, :, 16:-16, 16:-16])
    val_loss = (recon_loss + sum(latent_kls)) / (32**2)

    X_recon = inpaint(T.cast(X, 'float32') / np.float32(255.))
    Y = model.predict(X_recon)
Example #35
import theano
import theano.tensor as T
import numpy as np
import sys

import data
import model
from theano_toolkit.parameters import Parameters
from theano_toolkit import updates


if __name__ == '__main__':
    model_filename = sys.argv[1]
    test_filename = sys.argv[2]
    train_filename = sys.argv[3]
    P = Parameters()
    data_X, df = data.load_test(test_filename, train_filename)
    f = model.build(P,
        input_size=data_X.shape[1],
        hidden_sizes=[256, 128, 64, 32]
    )
    X = T.matrix('X')
    predict = theano.function(
        inputs=[X],
        outputs=f(X, test=True) > 0.5,
    )
    P.load(model_filename)
    output = predict(data_X) 
    print data_X.shape
    print output.shape
    print df.values.shape
Example #36
import theano.tensor as T
import numpy as np
from theano_toolkit import utils as U
from theano_toolkit import hinton
from theano_toolkit import updates
from theano_toolkit.parameters import Parameters

import ctc
import font
import lstm
from ocr import *

if __name__ == "__main__":
    import sys
    test_word = sys.argv[1]

    P = Parameters()
    X = T.matrix('X')

    predict = build_model(P, 8, 512, len(font.chars) + 1)
    probs = predict(X)
    test = theano.function(inputs=[X], outputs=probs)
    P.load('model.pkl')
    image = font.imagify(test_word)
    hinton.plot(image.astype(np.float32).T[::-1])
    y_seq = label_seq(test_word)
    probs = test(image)
    print " ", ' '.join(font.chars[i] if i < len(font.chars) else "_"
                        for i in np.argmax(probs, axis=1))
    hinton.plot(probs[:, y_seq].T, max_arr=1.)
Example #37
import theano
import theano.tensor as T
import numpy as np
import pickle

import vocab
import model
from theano_toolkit.parameters import Parameters

if __name__ == "__main__":
    model_file = args.model_file
    temp_input = args.temperature
    id2char = pickle.load(args.vocab_file)
    char2id = vocab.load(args.vocab_file.name)
    prime_str = args.prime

    P = Parameters()
    sampler = model.build_sampler(P,
                                  character_count=len(char2id) + 1,
                                  embedding_size=20,
                                  hidden_size=100)
    P.load(model_file)
    temp = T.scalar('temp')
    char = T.iscalar('char')
    p_cell_1, p_hidden_1, p_cell_2, p_hidden_2 = T.vector(
        "p_cell_1"), T.vector("p_hidden_1"), T.vector("p_cell_2"), T.vector(
            "p_hidden_2")

    output, cell_1, hidden_1, cell_2, hidden_2 = sampler(
        temp, char, p_cell_1, p_hidden_1, p_cell_2, p_hidden_2)
    sample = theano.function(
        inputs=[temp, char, p_cell_1, p_hidden_1, p_cell_2, p_hidden_2],
Example #38
def crossentropy(output, Y):
    if output.owner.op == T.nnet.softmax_op:
        x = output.owner.inputs[0]
        k = T.max(x, axis=1, keepdims=True)
        sum_x = T.log(T.sum(T.exp(x - k), axis=1)) + k
        return -x[T.arange(x.shape[0]), Y] + sum_x
    else:
        return T.nnet.categorical_crossentropy(output, Y)


if __name__ == "__main__":
    config.parse_args()
    total_frames = sum(x.shape[0]
                       for x, _ in frame_label_data.training_stream())
    logging.info("Total frames: %d" % total_frames)
    P = Parameters()
    predict = model.build(P)

    X = T.matrix('X')
    Y = T.ivector('Y')
    _, outputs = predict(X)
    cross_entropy = T.mean(crossentropy(outputs, Y))
    parameters = P.values()
    loss = cross_entropy + \
            (0.5/total_frames) * sum(T.sum(T.sqr(w)) for w in parameters)

    gradients = T.grad(loss, wrt=parameters)
    logging.info("Parameters to tune:" +
                 ', '.join(sorted(w.name for w in parameters)))

    update_vars = Parameters()
Example #39
                ) for p,g in zip(parameters,gradients) ]

def weight_norm(u,norm=1.9356):
    in_norm = T.sqrt(T.sum(T.sqr(u),axis=0))
    ratio = T.minimum(norm,in_norm) / (in_norm + 1e-8)
    return ratio * u


def normalise_weights(updates):
    return [ (
            p,
            weight_norm(u) if p.name.startswith('W') else u
        ) for p,u in updates ]

if __name__ == "__main__":
    P = Parameters()
    extract,_ = model.build(P, "vrnn")
    X = T.tensor3('X')
    l = T.ivector('l')
    [Z_prior_mean, Z_prior_std,
        Z_mean, Z_std, X_mean, X_std] = extract(X,l)

    parameters = P.values()
    batch_cost = model.cost(X, Z_prior_mean, Z_prior_std,
                      Z_mean, Z_std, X_mean, X_std,l)
    print "Calculating gradient..."
    print parameters
    batch_size = T.cast(X.shape[1],'float32')

    gradients = T.grad(batch_cost,wrt=parameters)
    gradients = [ g / batch_size for g in gradients ]
Example #40
                            pool_factor,
                            activation=activation):
    conv = build_conv_layer(P, name, input_size, output_size, rfield_size,
                            activation)

    def upsample(X):
        upsamp_X = T.nnet.abstract_conv.bilinear_upsampling(X, pool_factor)
        Y = conv(upsamp_X)
        return Y

    return upsample


if __name__ == "__main__":
    from theano_toolkit.parameters import Parameters
    P = Parameters()
    image_size = 10
    rfield_size = 5
    input_size = 1
    X = T.as_tensor_variable(
        np.random.randn(1, input_size, 4, 4).astype(np.float32))
    P.W = conv_weight_init(2, 1, 5)
    P.b = np.zeros(2)
    W = P.W
    b = P.b.dimshuffle('x', 0, 'x', 'x')

    upsamp_X_1 = T.zeros(
        (X.shape[0], X.shape[1], 2 * X.shape[2], 2 * X.shape[3]))
    upsamp_X_1 = T.set_subtensor(upsamp_X_1[:, :, ::2, ::2], X)
    upsamp_X_1 = T.inc_subtensor(upsamp_X_1[:, :, 1:-1:2, ::2],
                                 0.5 * (X[:, :, :-1] + X[:, :, 1:]))
Example #41
    predict = T.nnet.softmax(T.dot(hidden, W_hidden_output) + b_output)

    return X, predict


def label_seq(string):
    idxs = font.indexify(string)
    result = np.ones((len(idxs) * 2 + 1, ), dtype=np.int32) * -1
    result[np.arange(len(idxs)) * 2 + 1] = idxs
    print result
    return result


if __name__ == "__main__":
    P = Parameters()
    X = T.matrix('X')
    Y = T.ivector('Y')
    X, predict = build_model(P, X, 10, 10, 10)

    cost = ctc.cost(predict, Y)
    params = P.values()
    grad = T.grad(cost, wrt=params)
    train = theano.function(inputs=[X, Y],
                            outputs=cost,
                            updates=updates.adadelta(params, grad))

    for _ in xrange(10):
        print train(
            np.eye(10, dtype=np.float32)[::-1], np.arange(10, dtype=np.int32))
Example #42
import theano
import theano.tensor as T
import numpy as np
import pickle

import vocab
import model
from theano_toolkit.parameters import Parameters

if __name__ == "__main__":
    model_file = args.model_file
    temp_input = args.temperature
    id2char = pickle.load(args.vocab_file)
    char2id = vocab.load(args.vocab_file.name)
    prime_str = args.prime

    P = Parameters()
    sampler = model.build_sampler(P,
                                  character_count=len(char2id) + 1,
                                  embedding_size=20,
                                  hidden_size=100
                                  )
    P.load(model_file)
    temp = T.scalar('temp')
    char = T.iscalar('char')
    p_cell_1, p_hidden_1, p_cell_2, p_hidden_2 = T.vector("p_cell_1"), T.vector("p_hidden_1"), T.vector("p_cell_2"), T.vector("p_hidden_2")

    output, cell_1, hidden_1, cell_2, hidden_2 = sampler(temp, char, p_cell_1, p_hidden_1, p_cell_2, p_hidden_2)
    sample = theano.function(
        inputs=[temp, char, p_cell_1, p_hidden_1, p_cell_2, p_hidden_2],
        outputs=[output, cell_1, hidden_1, cell_2, hidden_2]
    )
Example #43
class Model:
	"""
	Simple predictive model for forecasting words from
	sequence using LSTMs. Choose how many LSTMs to stack
	what size their memory should be, and how many
	words can be predicted.
	"""
	def __init__(self, hidden_size, input_size, vocab_size, stack_size=1, celltype=LSTM):

		# core layer in RNN/LSTM
		self.model = StackedCells(input_size, celltype=celltype, layers =[hidden_size] * stack_size)

		# add an embedding
		self.model.layers.insert(0, Embedding(vocab_size, input_size))

		# add a classifier:
		self.model.layers.append(Layer(hidden_size, vocab_size, activation = softmax))

		self.turing_params = Parameters()
		#init turing machine model
		self.turing_updates , self.turing_predict = turing_model.build(self.turing_params , hidden_size , vocab_size)

		# inputs are matrices of indices,
		# each row is a sentence, each column a timestep
		self._stop_word   = theano.shared(np.int32(999999999), name="stop word")
		self.for_how_long = T.ivector()
		self.input_mat = T.imatrix()
		self.priming_word = T.iscalar()
		self.srng = T.shared_randomstreams.RandomStreams(np.random.randint(0, 1024))

		# create symbolic variables for prediction:
		#change by darong #issue : what is greedy
		self.lstm_predictions = self.create_lstm_prediction()
		self.final_predictions = self.create_final_prediction()

		# create symbolic variable for greedy search:
		self.greedy_predictions = self.create_lstm_prediction(greedy=True)

		# create gradient training functions:
		self.create_cost_fun()#create 2 cost func(lstm final)

		self.lstm_lr = 0.01
		self.turing_lr = 0.01
		self.all_lr = 0.01
		self.create_training_function()#create 3 functions(lstm turing all)
		self.create_predict_function()#create 2 predictions(lstm final)

		# create ppl
		self.lstm_ppl = self.create_lstm_ppl()
		self.final_ppl = self.create_final_ppl()
		self.create_ppl_function()


	def save(self, save_file, vocab):
		pickle.dump(self.model, open(save_file, "wb")) # pickle is for lambda function, cPickle cannot
		pickle.dump(vocab, open(save_file+'.vocab', "wb")) # pickle is for lambda function, cPickle cannot
	def save_turing(self, save_file):
		self.turing_params.save(save_file + '.turing')


	def load(self, load_file, lr):
		self.model = pickle.load(open(load_file, "rb"))
		if os.path.isfile(load_file + '.turing') :
			self.turing_params.load(load_file + '.turing')			
		else :
			print "no turing model!!!! pretrain with lstm param"
			self.turing_params['W_input_hidden'] = self.model.layers[-1].params[0].get_value().T #not sure
			self.turing_params['W_read_hidden']  = self.model.layers[-1].params[0].get_value().T
			self.turing_params['b_hidden_0'] = self.model.layers[-1].params[1].get_value()
			temp = self.model.layers[1].initial_hidden_state.get_value()[self.hidden_size:]
			self.turing_params['memory_init'] = temp.reshape((1,)+temp.shape)

		# need to compile again for calculating predictions after loading lstm
		self.srng = T.shared_randomstreams.RandomStreams(np.random.randint(0, 1024))
		self.lstm_predictions = self.create_lstm_prediction()
		self.final_predictions = self.create_final_prediction()
		self.greedy_predictions = self.create_lstm_prediction(greedy=True)#can change to final
		self.create_cost_fun()#create 2 cost func(lstm final)
		self.lstm_lr = lr
		self.turing_lr = lr#change this
		self.all_lr = lr
		self.create_training_function()#create 3 functions(lstm turing all)
		self.create_predict_function()#create 2 predictions(lstm final)
		self.lstm_ppl = self.create_lstm_ppl()
		self.final_ppl = self.create_final_ppl()
		self.create_ppl_function()
		print "done loading model"
#		print "done compile"


	def stop_on(self, idx):
		self._stop_word.set_value(idx)
		
	@property
	def params(self):
		return self.model.params
								 
	def create_lstm_prediction(self, greedy=False):
		def step(idx, *states):
			# new hiddens are the states we need to pass to LSTMs
			# from past. Because the StackedCells also include
			# the embeddings, and those have no state, we pass
			# a "None" instead:
			new_hiddens = [None] + list(states)
			
			new_states = self.model.forward(idx, prev_hiddens = new_hiddens)
			if greedy:
				new_idxes = new_states[-1]
				new_idx   = new_idxes.argmax()
				# provide a stopping condition for greedy search:
				return ([new_idx.astype(self.priming_word.dtype)] + new_states[1:-1]), theano.scan_module.until(T.eq(new_idx,self._stop_word))
			else:
				return new_states[1:]

		# in the sequence-forecasting scenario we take everything
		# up to the second-to-last step and predict the next token
		# at each position, i.e. inputs 0 ... n-1, hence:
		inputs = self.input_mat[:, 0:-1]
		num_examples = inputs.shape[0]
		# pass this to Theano's recurrence relation function:
		
		# choose what gets outputted at each timestep:
		if greedy:
			outputs_info = [dict(initial=self.priming_word, taps=[-1])] + [initial_state_with_taps(layer) for layer in self.model.layers[1:-1]]
			result, _ = theano.scan(fn=step,
								n_steps=200,
								outputs_info=outputs_info)
		else:
			outputs_info = [initial_state_with_taps(layer, num_examples) for layer in self.model.layers[1:]]
			result, _ = theano.scan(fn=step,
								sequences=[inputs.T],
								outputs_info=outputs_info)
								 
		if greedy:
			return result[0]
		# softmaxes are the last layer of our network,
		# and are at the end of our results list:
		return result[-1].transpose((2,0,1))
		# we reorder the predictions to be:
		# 1. what row / example
		# 2. what timestep
		# 3. softmax dimension

	def create_final_prediction(self, greedy=False):
		def step(idx, *states):
			# new hiddens are the states we need to pass to LSTMs
			# from past. Because the StackedCells also include
			# the embeddings, and those have no state, we pass
			# a "None" instead:
			new_hiddens = [None] + list(states)
			
			new_states = self.model.forward(idx, prev_hiddens = new_hiddens)
			if greedy:
				new_idxes = new_states[-1]
				new_idx   = new_idxes.argmax()
				# provide a stopping condition for greedy search:
				return ([new_idx.astype(self.priming_word.dtype)] + new_states[1:-1]), theano.scan_module.until(T.eq(new_idx,self._stop_word))
			else:
				return new_states[1:]

		# in the sequence-forecasting scenario we take everything
		# up to the second-to-last step and predict the next token
		# at each position, i.e. inputs 0 ... n-1, hence:
		inputs = self.input_mat[:, 0:-1]
		num_examples = inputs.shape[0]
		# pass this to Theano's recurrence relation function:
		
		# choose what gets outputted at each timestep:
		if greedy:
			outputs_info = [dict(initial=self.priming_word, taps=[-1])] + [initial_state_with_taps(layer) for layer in self.model.layers[1:-1]]
			result, _ = theano.scan(fn=step,
								n_steps=200,
								outputs_info=outputs_info)
		else:
			outputs_info = [initial_state_with_taps(layer, num_examples) for layer in self.model.layers[1:]]
			result, _ = theano.scan(fn=step,
								sequences=[inputs.T],
								outputs_info=outputs_info)
								 
		if greedy:
			return result[0]
		# softmaxes are the last layer of our network,
		# and are at the end of our results list:
		hidden_size = result[-2].shape[2] // 2
		turing_result = self.turing_predict(result[-2][:,:,hidden_size:])
		# turing_result is (timestep, example, softmax); reorder below
		return turing_result.transpose((1,0,2))
		# we reorder the predictions to be:
		# 1. what row / example
		# 2. what timestep
		# 3. softmax dimension	
								 
	def create_cost_fun (self):

		# create a cost function that
		# takes each prediction at every timestep
		# and guesses next timestep's value:
		what_to_predict = self.input_mat[:, 1:]
		# because some sentences are shorter, we
		# place masks where the sentences end:
		# (for_how_long is zero-indexed, e.g. an example spanning [2,3)
		# has this value set to 0, so we subtract 1):
		for_how_long = self.for_how_long - 1
		# all sentences start at T=0:
		starting_when = T.zeros_like(self.for_how_long)
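		# masked_loss (from theano_lstm, assumed semantics) computes the
		# negative log-likelihood of each target token and zeroes out
		# timesteps outside [starting_when, for_how_long], so padding after
		# short sentences never contributes to the cost.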
								 
		self.lstm_cost = masked_loss(self.lstm_predictions,
								what_to_predict,
								for_how_long,
								starting_when).sum()

		self.final_cost = masked_loss(self.final_predictions,
								what_to_predict,
								for_how_long,
								starting_when).sum()
		
	def create_predict_function(self):
		self.lstm_pred_fun = theano.function(
			inputs=[self.input_mat],
			outputs=self.lstm_predictions,
			allow_input_downcast=True
		)
		self.final_pred_fun = theano.function(
			inputs=[self.input_mat],
			outputs=self.final_predictions,
			allow_input_downcast=True
		)
		
		self.greedy_fun = theano.function(
			inputs=[self.priming_word],
			outputs=T.concatenate([T.shape_padleft(self.priming_word), self.greedy_predictions]),
			allow_input_downcast=True
		)
								 
	def create_training_function(self):
		updates, _, _, _, _ = create_optimization_updates(self.lstm_cost, self.params, method="SGD", lr=self.lstm_lr)
#		updates, _, _, _, _ = create_optimization_updates(self.cost, self.params, method="adadelta", lr=self.lr)
		self.lstm_update_fun = theano.function(
			inputs=[self.input_mat, self.for_how_long],
			outputs=self.lstm_cost,
			updates=updates,
			allow_input_downcast=True)

		updates_turing = self.turing_updates(self.final_cost , lr=self.turing_lr)
#		updates, _, _, _, _ = create_optimization_updates(self.cost, self.params, method="adadelta", lr=self.lr)
		self.turing_update_fun = theano.function(
			inputs=[self.input_mat, self.for_how_long],
			outputs=self.final_cost,
			updates=updates_turing,
			mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True),
			allow_input_downcast=True)

		all_updates_lstm, _, _, _, _ = create_optimization_updates(self.final_cost, self.params, method="SGD", lr=self.all_lr,part=True)
		all_updates_turing_temp = self.turing_updates(self.final_cost , lr=self.all_lr)
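		# create_optimization_updates returns a dict-like mapping of
		# parameter -> update, while turing_updates is assumed to return
		# (parameter, update) pairs; merge the pairs into the dict so one
		# theano.function applies both update sets jointly: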
		updates_all = all_updates_lstm
		for param, update in all_updates_turing_temp:
			updates_all[param] = update

		self.all_update_fun = theano.function(
			inputs=[self.input_mat, self.for_how_long],
			outputs=self.final_cost,
			updates=updates_all,
			allow_input_downcast=True)

	def create_lstm_ppl(self):
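		# perplexity restricted to in-vocabulary tokens:
		#   ppl = exp( (sum_t log 1/p(w_t)) / (total token count - OOV count) )
		# label id 0 is assumed to be the OOV marker and is masked out below.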

		def timestep(predictions, label, len_example, total_len_example):
			# mask out label 0, treated as the out-of-vocabulary id
			label_binary = T.gt(label[0:len_example-1], 0)
			oov_count = T.shape(label_binary)[0] - T.sum(label_binary)
			# total_len_example is carried as a non-sequence but unused here
			return T.sum(T.log(1. / predictions[T.arange(len_example-1), label[0:len_example-1]]) * label_binary), oov_count


		result, _ = theano.scan(fn=timestep,
						   sequences=[ self.lstm_predictions, self.input_mat[:, 1:], self.for_how_long ],
						   non_sequences=T.sum(self.for_how_long))

		oov_count_total = T.sum(result[1])
		log_inv_prob_total = T.sum(result[0]).astype(theano.config.floatX)
		in_vocab_count = (T.sum(self.for_how_long) - oov_count_total).astype(theano.config.floatX)
		return T.exp(log_inv_prob_total / in_vocab_count).astype(theano.config.floatX)

	def create_final_ppl(self):
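		# identical to create_lstm_ppl, but evaluated on the NTM-augmented
		# final predictions: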

		def timestep(predictions, label, len_example, total_len_example):
			# mask out label 0, treated as the out-of-vocabulary id
			label_binary = T.gt(label[0:len_example-1], 0)
			oov_count = T.shape(label_binary)[0] - T.sum(label_binary)
			# total_len_example is carried as a non-sequence but unused here
			return T.sum(T.log(1. / predictions[T.arange(len_example-1), label[0:len_example-1]]) * label_binary), oov_count


		result, _ = theano.scan(fn=timestep,
						   sequences=[ self.final_predictions, self.input_mat[:, 1:], self.for_how_long ],
						   non_sequences=T.sum(self.for_how_long))

		oov_count_total = T.sum(result[1])
		log_inv_prob_total = T.sum(result[0]).astype(theano.config.floatX)
		in_vocab_count = (T.sum(self.for_how_long) - oov_count_total).astype(theano.config.floatX)
		return T.exp(log_inv_prob_total / in_vocab_count).astype(theano.config.floatX)

	def create_ppl_function(self):
		self.lstm_ppl_fun = theano.function(
			inputs=[self.input_mat, self.for_how_long],
			outputs=self.lstm_ppl,
			allow_input_downcast=True)

		self.final_ppl_fun = theano.function(
			inputs=[self.input_mat, self.for_how_long],
			outputs=self.final_ppl,
			allow_input_downcast=True)
		
		
	def __call__(self, x):
		return self.lstm_pred_fun(x) # no generic pred_fun exists; default to the LSTM predictions
Example #44
0
    max_epochs = args.max_epochs
    batch_size = args.batch_size
    improvement_threshold = args.improvement_threshold
    validation_percent = args.validation_percent
    patience = args.patience
    checkpoint = args.checkpoint

    embedding_size = args.embedding_size
    hidden_size = args.hidden_size

    l2_coefficient = args.l2

    id2char = pickle.load(open(vocab_file,'r'))
    char2id = vocab.load(vocab_file)
    P = Parameters()
    lang_model = model.build(P,
                             character_count=len(char2id) + 1,
                             embedding_size=embedding_size,
                             hidden_size=hidden_size
                             )

    def cost(X, P): # batch_size x time
        eps = 1e-3
        X = X.T                                         # time x batch_size
        char_prob_dist = lang_model(X[:-1])                # time x batch_size x output_size
        char_prob_dist = (1 - 2 * eps) * char_prob_dist + eps
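        # squash predicted probabilities into [eps, 1 - eps] so the
        # cross-entropy's log never sees an exact 0 or 1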
        label_prob = char_prob_dist[
            T.arange(X.shape[0] - 1).dimshuffle(0, 'x'),
            T.arange(X.shape[1]).dimshuffle('x', 0),
            X[1:]
        ]
Example #45
0
def make_train(image_size , word_size , first_hidden_size , proj_size , reg_lambda) :
    #initialize model
    P = Parameters()
    image_projecting = image_project.build(P, image_size, proj_size)
    batched_triplet_encoding , vector_triplet_encoding = triplet_encoding.build(P , word_size , first_hidden_size , proj_size)   

    image_vector = T.vector()

    #training
    correct_triplet =  [T.vector(dtype='float32') , T.vector(dtype='float32') , T.vector(dtype='float32')] # [entity, relation, entity]
    negative_triplet = [T.matrix(dtype='float32') , T.matrix(dtype='float32') , T.matrix(dtype='float32')]

    image_projection_vector = image_projecting(image_vector)
    image_projection_matrix = repeat(image_projection_vector.dimshuffle(('x',0)) , negative_triplet[0].shape[0] , axis=0)
    correct_triplet_encoding_vector = vector_triplet_encoding(correct_triplet[0] , correct_triplet[1] , correct_triplet[2])
    negative_triplet_encoding_matrix = batched_triplet_encoding(negative_triplet[0] , negative_triplet[1] , negative_triplet[2])

    correct_cross_dot_scalar = T.dot(image_projection_vector , correct_triplet_encoding_vector)
    negative_cross_dot_vector = T.batched_dot(image_projection_matrix , negative_triplet_encoding_matrix)

    #margin cost
    zero_cost = T.zeros_like(negative_cross_dot_vector)
    margin_cost = 1 - correct_cross_dot_scalar + negative_cross_dot_vector
    cost_vector = T.switch(T.gt(zero_cost , margin_cost) , zero_cost , margin_cost)
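    # this is a margin ranking loss: for each negative triplet i,
    #   cost_i = max(0, 1 - s(image, correct) + s(image, negative_i))
    # negatives scoring within a margin of 1 of the correct triplet are
    # penalised; all others contribute zero.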

    # regularizer cost
    params = P.values()
    l2 = T.sum(0)
    for p in params:
        l2 = l2 + (p ** 2).sum()
    cost = T.sum(cost_vector)/T.shape(negative_triplet[0])[0] + reg_lambda * l2 # assumes the word vectors have already been added to P (unresolved)
    grads = [T.clip(g, -100, 100) for g in T.grad(cost, wrt=params)]

    lr = T.scalar(name='learning rate',dtype='float32')
    train = theano.function(
        inputs=[image_vector, correct_triplet[0], correct_triplet[1], correct_triplet[2], negative_triplet[0], negative_triplet[1], negative_triplet[2], lr],
        outputs=cost,
        updates=updates.rmsprop(params, grads, learning_rate=lr),
        allow_input_downcast=True
    )

    #valid
    valid = theano.function(
        inputs=[image_vector, correct_triplet[0], correct_triplet[1], correct_triplet[2], negative_triplet[0], negative_triplet[1], negative_triplet[2]],
        outputs=cost,
        allow_input_downcast=True

    )
    #visualize
    image_project_fun = theano.function(
        inputs=[image_vector],
        outputs=image_projection_vector,
        allow_input_downcast=True
    )
    #testing
    all_triplet = [T.matrix(dtype='float32') , T.matrix(dtype='float32') , T.matrix(dtype='float32')]
    image_projection_matrix_test = repeat(image_projection_vector.dimshuffle(('x',0)) , all_triplet[0].shape[0] , axis=0)
    all_triplet_encoding_matrix = batched_triplet_encoding(all_triplet[0] , all_triplet[1] , all_triplet[2])
    all_cross_dot_vector = T.batched_dot(image_projection_matrix_test , all_triplet_encoding_matrix)
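    # at test time the image embedding is scored against every candidate
    # triplet at once; larger dot products are assumed to mean better matches
    # for ranking/retrieval.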

    test = theano.function(
        inputs=[image_vector, all_triplet[0], all_triplet[1], all_triplet[2]],
        outputs=all_cross_dot_vector,
        allow_input_downcast=True

    )

    return P , train , valid , image_project_fun , test
Example #46
0
    # TODO: fix these magic numbers (especially the 800)
    def f(X):
        layer0 = X.reshape((X.shape[0], 1, 28, 28))
        layer1 = _build_conv_pool(P, 1, layer0, 20,  1, 5, 2)
        layer2_= _build_conv_pool(P, 2, layer1, 50, 20, 5, 2)
        layer2 = layer2_.flatten(2)
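        # the 800 in build() comes from these shapes (28x28 input assumed, as
        # in the reshape above): conv 5x5 maps 28 -> 24, pool 2 -> 12;
        # conv 5x5 -> 8, pool 2 -> 4; 50 feature maps flatten to 50*4*4 = 800.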
        output = T.nnet.softmax(T.dot(layer2, P.W_hidden_output) + P.b_output)
        return output

    return f

def cost(P, Y_hat, Y, l2 = 0):
    return (T.mean(T.nnet.categorical_crossentropy(Y_hat, Y)) +
           l2 * sum(T.mean(p**2) for p in P.values()))

if __name__ == "__main__":
    import datasets
    x,y = datasets.mnist()
    x,y = x[0:1000],y[0:1000]

    P = Parameters()
    X = T.matrix('X')
    Y = T.ivector('Y')
    net = build(P, 784, 800, 10)
    Y_hat = net(X)
    
    f = theano.function(inputs = [X], outputs = Y_hat)
    J = cost(P, Y_hat, Y)
    grad = T.grad(J, wrt=P.values())
Example #47
0
    ]


def weight_norm(u, norm=1.9356):
    in_norm = T.sqrt(T.sum(T.sqr(u), axis=0))
    ratio = T.minimum(norm, in_norm) / (in_norm + 1e-8)
    return ratio * u
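# e.g. a weight column with L2 norm 4.0 gets ratio 1.9356 / 4.0 ≈ 0.48 and is
# scaled down to the norm cap, while a column with norm 1.0 gets ratio ≈ 1.0
# (up to the 1e-8 stabiliser) and passes through essentially unchanged.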


def normalise_weights(updates):
    return [(p, weight_norm(u) if p.name.startswith('W') else u)
            for p, u in updates]


if __name__ == "__main__":
    P = Parameters()
    extract, _ = model.build(P, "vrnn")
    X = T.tensor3('X')
    l = T.ivector('l')
    [Z_prior_mean, Z_prior_std, Z_mean, Z_std, X_mean, X_std] = extract(X, l)

    parameters = P.values()
    batch_cost = model.cost(X, Z_prior_mean, Z_prior_std, Z_mean, Z_std,
                            X_mean, X_std, l)
    print "Calculating gradient..."
    print parameters
    batch_size = T.cast(X.shape[1], 'float32')

    gradients = T.grad(batch_cost, wrt=parameters)
    gradients = [g / batch_size for g in gradients]
    gradients = clip(5, parameters, gradients)
Example #48
0
        forget_gate = T.nnet.sigmoid(forget_lin)
        cell_updates = T.tanh(cell_lin)

        cell = forget_gate * prev_cell + in_gate * cell_updates

        out_lin = x_o + h_o + b_o + T.dot(cell, V_o)
        out_gate = T.nnet.sigmoid(out_lin)

        hid = out_gate * T.tanh(cell)
        return cell, hid

    return step


if __name__ == "__main__":
    P = Parameters()
    X = T.ivector('X')
    P.V = np.zeros((8, 8), dtype=np.int32)

    X_rep = P.V[X]
    P.W_output = np.zeros((15, 8), dtype=np.int32)
    lstm_layer = build(P, name="test", input_size=8, hidden_size=15)

    _, hidden = lstm_layer(X_rep)
    output = T.nnet.softmax(T.dot(hidden, P.W_output))
    delay = 5
    label = X[:-delay]
    predicted = output[delay:]

    cost = -T.sum(T.log(predicted[T.arange(predicted.shape[0]), label]))
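    # the cost trains the network to reproduce the symbol seen `delay` steps
    # earlier: output at time t is scored against X[t - delay], so the LSTM
    # must carry information across 5 timesteps.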
    params = P.values()