Example #1
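The excerpt omits its imports. A plausible preamble, assuming the layer classes and helpers come from the theano_lstm package, while turing_model, initial_state_with_taps, and softmax are project-local (a lambda-based softmax would also explain the pickle comment in save()):

import os
import pickle
import numpy as np
import theano
import theano.tensor as T
import theano.tensor.shared_randomstreams  # makes T.shared_randomstreams available
from theano.compile.nanguardmode import NanGuardMode
from theano_lstm import (LSTM, StackedCells, Layer, Embedding,
                         masked_loss, create_optimization_updates)
import turing_model                                  # project-local module
from util import initial_state_with_taps, softmax   # hypothetical project-local helpers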
class Model:
	"""
	Simple predictive model for forecasting words from
	sequence using LSTMs. Choose how many LSTMs to stack
	what size their memory should be, and how many
	words can be predicted.
	"""
	def __init__(self, hidden_size, input_size, vocab_size, stack_size=1, celltype=LSTM):
		self.hidden_size = hidden_size  # stored for use in load() below

		# core stack of recurrent cells (RNN/LSTM):
		self.model = StackedCells(input_size, celltype=celltype, layers=[hidden_size] * stack_size)

		# add an embedding
		self.model.layers.insert(0, Embedding(vocab_size, input_size))

		# add a classifier:
		self.model.layers.append(Layer(hidden_size, vocab_size, activation=softmax))

		# initialise the turing machine model:
		self.turing_params = Parameters()
		self.turing_updates, self.turing_predict = turing_model.build(self.turing_params, hidden_size, vocab_size)

		# inputs are matrices of indices,
		# each row is a sentence, each column a timestep
		self._stop_word   = theano.shared(np.int32(999999999), name="stop word")
		self.for_how_long = T.ivector()
		self.input_mat = T.imatrix()
		self.priming_word = T.iscalar()
		self.srng = T.shared_randomstreams.RandomStreams(np.random.randint(0, 1024))

		# create symbolic variables for prediction
		# ("greedy" here means argmax decoding at each step; see create_lstm_prediction):
		self.lstm_predictions = self.create_lstm_prediction()
		self.final_predictions = self.create_final_prediction()

		# create symbolic variable for greedy search:
		self.greedy_predictions = self.create_lstm_prediction(greedy=True)

		# create gradient training functions:
		self.create_cost_fun()  # creates two cost functions (lstm, final)

		self.lstm_lr = 0.01
		self.turing_lr = 0.01
		self.all_lr = 0.01
		self.create_training_function()  # creates three update functions (lstm, turing, all)
		self.create_predict_function()  # creates two prediction functions (lstm, final)

		# create perplexity (ppl) measures:
		self.lstm_ppl = self.create_lstm_ppl()
		self.final_ppl = self.create_final_ppl()
		self.create_ppl_function()


	def save(self, save_file, vocab):
		# pickle (rather than cPickle) is used because the model contains lambda functions:
		pickle.dump(self.model, open(save_file, "wb"))
		pickle.dump(vocab, open(save_file + '.vocab', "wb"))
	def save_turing(self, save_file):
		self.turing_params.save(save_file + '.turing')


	def load(self, load_file, lr):
		self.model = pickle.load(open(load_file, "rb"))
		if os.path.isfile(load_file + '.turing'):
			self.turing_params.load(load_file + '.turing')
		else:
			print "no turing model found; pretraining it from the lstm parameters"
			# approximate initialisation from the LSTM's output layer
			# (the original author marked this "not sure"):
			self.turing_params['W_input_hidden'] = self.model.layers[-1].params[0].get_value().T
			self.turing_params['W_read_hidden'] = self.model.layers[-1].params[0].get_value().T
			self.turing_params['b_hidden_0'] = self.model.layers[-1].params[1].get_value()
			temp = self.model.layers[1].initial_hidden_state.get_value()[self.hidden_size:]
			self.turing_params['memory_init'] = temp.reshape((1,) + temp.shape)

		# need to compile again for calculating predictions after loading lstm
		self.srng = T.shared_randomstreams.RandomStreams(np.random.randint(0, 1024))
		self.lstm_predictions = self.create_lstm_prediction()
		self.final_predictions = self.create_final_prediction()
		self.greedy_predictions = self.create_lstm_prediction(greedy=True)  # could also be based on the final prediction
		self.create_cost_fun()  # creates two cost functions (lstm, final)
		self.lstm_lr = lr
		self.turing_lr = lr  # could be given its own rate
		self.all_lr = lr
		self.create_training_function()  # creates three update functions (lstm, turing, all)
		self.create_predict_function()  # creates two prediction functions (lstm, final)
		self.lstm_ppl = self.create_lstm_ppl()
		self.final_ppl = self.create_final_ppl()
		self.create_ppl_function()
		print "done loading model"
#		print "done compile"


	def stop_on(self, idx):
		self._stop_word.set_value(idx)
		
	@property
	def params(self):
		return self.model.params
								 
	def create_lstm_prediction(self, greedy=False):
		def step(idx, *states):
			# new hiddens are the states we need to pass to the
			# LSTMs from the previous timestep. Because the
			# StackedCells also include the embedding layer, which
			# has no state, we pass a "None" in its place:
			new_hiddens = [None] + list(states)

			new_states = self.model.forward(idx, prev_hiddens=new_hiddens)
			if greedy:
				new_idxes = new_states[-1]
				new_idx   = new_idxes.argmax()
				# provide a stopping condition for greedy search:
				return ([new_idx.astype(self.priming_word.dtype)] + new_states[1:-1]), theano.scan_module.until(T.eq(new_idx,self._stop_word))
			else:
				return new_states[1:]

		# in the sequence-forecasting scenario we take everything
		# up to the second-to-last step and predict the subsequent
		# steps, i.e. we feed timesteps 0 ... n - 1, hence:
		inputs = self.input_mat[:, 0:-1]
		num_examples = inputs.shape[0]
		# pass this to Theano's recurrence relation function:
		
		# choose what gets outputted at each timestep:
		if greedy:
			outputs_info = [dict(initial=self.priming_word, taps=[-1])] + [initial_state_with_taps(layer) for layer in self.model.layers[1:-1]]
			result, _ = theano.scan(fn=step,
								n_steps=200,
								outputs_info=outputs_info)
		else:
			outputs_info = [initial_state_with_taps(layer, num_examples) for layer in self.model.layers[1:]]
			result, _ = theano.scan(fn=step,
								sequences=[inputs.T],
								outputs_info=outputs_info)
								 
		if greedy:
			return result[0]
		# softmaxes are the last layer of our network,
		# and are at the end of our results list:
		return result[-1].transpose((2,0,1))
		# we reorder the predictions to be:
		# 1. what row / example
		# 2. what timestep
		# 3. softmax dimension

	def create_final_prediction(self, greedy=False):
		def step(idx, *states):
			# new hiddens are the states we need to pass to the
			# LSTMs from the previous timestep. Because the
			# StackedCells also include the embedding layer, which
			# has no state, we pass a "None" in its place:
			new_hiddens = [None] + list(states)

			new_states = self.model.forward(idx, prev_hiddens=new_hiddens)
			if greedy:
				new_idxes = new_states[-1]
				new_idx   = new_idxes.argmax()
				# provide a stopping condition for greedy search:
				return ([new_idx.astype(self.priming_word.dtype)] + new_states[1:-1]), theano.scan_module.until(T.eq(new_idx,self._stop_word))
			else:
				return new_states[1:]

		# in the sequence-forecasting scenario we take everything
		# up to the second-to-last step and predict the subsequent
		# steps, i.e. we feed timesteps 0 ... n - 1, hence:
		inputs = self.input_mat[:, 0:-1]
		num_examples = inputs.shape[0]
		# pass this to Theano's recurrence relation function:
		
		# choose what gets outputted at each timestep:
		if greedy:
			outputs_info = [dict(initial=self.priming_word, taps=[-1])] + [initial_state_with_taps(layer) for layer in self.model.layers[1:-1]]
			result, _ = theano.scan(fn=step,
								n_steps=200,
								outputs_info=outputs_info)
		else:
			outputs_info = [initial_state_with_taps(layer, num_examples) for layer in self.model.layers[1:]]
			result, _ = theano.scan(fn=step,
								sequences=[inputs.T],
								outputs_info=outputs_info)
								 
		if greedy:
			return result[0]
		# rather than the softmax layer, feed the second half of the
		# last recurrent layer's state into the turing model:
		hidden_size = result[-2].shape[2] // 2
		turing_result = self.turing_predict(result[-2][:, :, hidden_size:])
		# the turing layer transposes before computing, hence this reordering:
		return turing_result.transpose((1, 0, 2))
		# we reorder the predictions to be:
		# 1. what row / example
		# 2. what timestep
		# 3. softmax dimension	
								 
	def create_cost_fun(self):

		# create a cost function that takes each prediction at
		# every timestep and scores it against the next
		# timestep's true value:
		what_to_predict = self.input_mat[:, 1:]
		# because some sentences are shorter, we place masks
		# where the sentences end. for_how_long is zero-indexed
		# (e.g. an example spanning [2, 3) has this value set to 0),
		# hence we subtract 1:
		for_how_long = self.for_how_long - 1
		# all sentences start at t = 0:
		starting_when = T.zeros_like(self.for_how_long)
								 
		self.lstm_cost = masked_loss(self.lstm_predictions,
								what_to_predict,
								for_how_long,
								starting_when).sum()

		self.final_cost = masked_loss(self.final_predictions,
								what_to_predict,
								for_how_long,
								starting_when).sum()
		
	def create_predict_function(self):
		self.lstm_pred_fun = theano.function(
			inputs=[self.input_mat],
			outputs=self.lstm_predictions,
			allow_input_downcast=True
		)
		self.final_pred_fun = theano.function(
			inputs=[self.input_mat],
			outputs=self.final_predictions,
			allow_input_downcast=True
		)
		
		self.greedy_fun = theano.function(
			inputs=[self.priming_word],
			outputs=T.concatenate([T.shape_padleft(self.priming_word), self.greedy_predictions]),
			allow_input_downcast=True
		)
								 
	def create_training_function(self):
		updates, _, _, _, _ = create_optimization_updates(self.lstm_cost, self.params, method="SGD", lr=self.lstm_lr)
		self.lstm_update_fun = theano.function(
			inputs=[self.input_mat, self.for_how_long],
			outputs=self.lstm_cost,
			updates=updates,
			allow_input_downcast=True)

		updates_turing = self.turing_updates(self.final_cost, lr=self.turing_lr)
		self.turing_update_fun = theano.function(
			inputs=[self.input_mat, self.for_how_long],
			outputs=self.final_cost,
			updates=updates_turing,
			mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=True),
			allow_input_downcast=True)

		all_updates_lstm, _, _, _, _ = create_optimization_updates(self.final_cost, self.params, method="SGD", lr=self.all_lr, part=True)
		all_updates_turing_temp = self.turing_updates(self.final_cost, lr=self.all_lr)
		# merge the two update sets (the turing updates come as (param, expr) pairs):
		updates_all = all_updates_lstm
		for pair in all_updates_turing_temp:
			updates_all[pair[0]] = pair[1]

		self.all_update_fun = theano.function(
			inputs=[self.input_mat, self.for_how_long],
			outputs=self.final_cost,
			updates=updates_all,
			allow_input_downcast=True)

	def create_lstm_ppl(self):

		def timestep(predictions, label, len_example, total_len_example):
			# mask out out-of-vocabulary labels (index 0) so they
			# do not contribute to the perplexity:
			label_binary = T.gt(label[0:len_example - 1], 0)
			oov_count = T.shape(label_binary)[0] - T.sum(label_binary)
			return T.sum(T.log(1. / predictions[T.arange(len_example - 1), label[0:len_example - 1]]) * label_binary), oov_count


		result, _ = theano.scan(fn=timestep,
						   sequences=[ self.lstm_predictions, self.input_mat[:, 1:], self.for_how_long ],
						   non_sequences=T.sum(self.for_how_long))

		oov_count_total = T.sum(result[1])
		log_inv_prob_total = T.sum(result[0]).astype(theano.config.floatX)
		word_count = (T.sum(self.for_how_long) - oov_count_total).astype(theano.config.floatX)
		return T.exp(log_inv_prob_total / word_count).astype(theano.config.floatX)

	def create_final_ppl(self):

		def timestep(predictions, label, len_example, total_len_example):
			# mask out out-of-vocabulary labels (index 0) so they
			# do not contribute to the perplexity:
			label_binary = T.gt(label[0:len_example - 1], 0)
			oov_count = T.shape(label_binary)[0] - T.sum(label_binary)
			return T.sum(T.log(1. / predictions[T.arange(len_example - 1), label[0:len_example - 1]]) * label_binary), oov_count


		result, _ = theano.scan(fn=timestep,
						   sequences=[ self.final_predictions, self.input_mat[:, 1:], self.for_how_long ],
						   non_sequences=T.sum(self.for_how_long))

		oov_count_total = T.sum(result[1])
		log_inv_prob_total = T.sum(result[0]).astype(theano.config.floatX)
		word_count = (T.sum(self.for_how_long) - oov_count_total).astype(theano.config.floatX)
		return T.exp(log_inv_prob_total / word_count).astype(theano.config.floatX)

	def create_ppl_function(self):
		self.lstm_ppl_fun = theano.function(
			inputs=[self.input_mat, self.for_how_long],
			outputs=self.lstm_ppl,
			allow_input_downcast=True)

		self.final_ppl_fun = theano.function(
			inputs=[self.input_mat, self.for_how_long],
			outputs=self.final_ppl,
			allow_input_downcast=True)
		
		
	def __call__(self, x):
		# the original called self.pred_fun, which is never defined;
		# lstm_pred_fun is the closest match:
		return self.lstm_pred_fun(x)
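A minimal usage sketch for the class above (names, shapes, and hyperparameters are illustrative, not from the original): sentences is assumed to be an int32 matrix of word indices padded to equal length, and lengths the true sentence lengths.

# hypothetical driver code for the Model class:
model = Model(hidden_size=128, input_size=64, vocab_size=10000, stack_size=2)
model.stop_on(0)  # index that terminates greedy decoding
for epoch in xrange(10):
    cost = model.lstm_update_fun(sentences, lengths)  # one SGD step on the lstm cost
    print "epoch", epoch, "cost", cost
print "lstm perplexity:", model.lstm_ppl_fun(sentences, lengths)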
Example #2
    gradients = T.grad(cost, wrt=params)

    # shared accumulators: one per parameter for the gradients, plus a call counter:
    gradient_acc = [theano.shared(0 * p.get_value()) for p in params]
    counter = theano.shared(np.float32(0.))
    acc = theano.function(inputs=[X, Y],
                          outputs=cost,
                          updates=[(a, a + g)
                                   for a, g in zip(gradient_acc, gradients)] +
                          [(counter, counter + np.float32(1.))])
    update = theano.function(
            inputs=[], outputs=[],
            updates=updates.momentum(params, [g / counter for g in gradient_acc])
                    + [(a, np.float32(0) * a) for a in gradient_acc]
                    + [(counter, np.float32(0.))]
        )

    test = theano.function(inputs=[X, Y], outputs=probs[:, Y])

    training_examples = [word.strip() for word in open('dictionary.txt')]
    import random
    for _ in xrange(1500):
        random.shuffle(training_examples)
        for i, string in enumerate(training_examples):
            print acc(font.imagify(string), label_seq(string))
            if i % 20 == 0: update()
            if i % 100 == 0:
                hinton.plot(test(font.imagify("test"), label_seq("test")).T,
                            max_arr=1.)
                hinton.plot(font.imagify("test").T[::-1].astype('float32'))
        P.save('model.pkl')
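Examples #2 and #3 accumulate gradients over many examples before taking a single momentum step. A self-contained sketch of that accumulate-then-update pattern (all names are local to this sketch; the original's X, Y, probs, and updates.momentum are defined elsewhere in its file):

import numpy as np
import theano
import theano.tensor as T

x = T.fvector('x')
w = theano.shared(np.float32(1.0), name='w')
cost = T.sum((w * x - np.float32(1.0)) ** 2)
grad = T.grad(cost, wrt=w)

g_acc = theano.shared(np.float32(0.))    # running gradient sum
counter = theano.shared(np.float32(0.))  # number of accumulated calls
accumulate = theano.function(
        inputs=[x], outputs=cost,
        updates=[(g_acc, g_acc + grad),
                 (counter, counter + np.float32(1.))],
        allow_input_downcast=True)
# one plain-SGD step using the averaged gradient, then reset the accumulators:
step = theano.function(
        inputs=[], outputs=[],
        updates=[(w, w - np.float32(0.1) * g_acc / counter),
                 (g_acc, np.float32(0.) * g_acc),
                 (counter, np.float32(0.) * counter)])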
Example #3
    acc = theano.function(
            inputs=[X, Y],
            outputs=cost,
            updates=[
                (a, a + g) for a, g in zip(gradient_acc, gradients)
            ] + [(counter, counter + np.float32(1.))]
        )
    update = theano.function(
            inputs=[], outputs=[],
            updates=updates.momentum(params, [g / counter for g in gradient_acc])
                    + [(a, np.float32(0) * a) for a in gradient_acc]
                    + [(counter, np.float32(0.))]
        )

    test = theano.function(
            inputs=[X, Y],
            outputs=probs[:, Y]
        )

    training_examples = [word.strip() for word in open('dictionary.txt')]
    import random
    for _ in xrange(1500):
        random.shuffle(training_examples)
        for i, string in enumerate(training_examples):
            print acc(font.imagify(string), label_seq(string))
            if i % 20 == 0: update()
            if i % 100 == 0:
                hinton.plot(test(font.imagify("test"), label_seq("test")).T, max_arr=1.)
                hinton.plot(font.imagify("test").T[::-1].astype('float32'))
        P.save('model.pkl')
Example #4
		test_group_answers = islice(group_answers, test_instance_count)
		test_data = data_io.story_question_answer_idx(
						test_group_answers,
						vocab_in
					)
		test_data = (x for x in test_data if x[1].shape[0] <= length_limit)
		tests = [np.array(
					test(input_data, idxs, question_data, ans_w, ans_evds),
					dtype=np.float32
				)
				for input_data, idxs, question_data, ans_w, ans_evds in test_data]
		errors = sum(tests) / len(tests)
		print "Error rate:", errors
		print "Starting epoch", epoch
		if errors < best_error * 0.9 :
			P.save('model.pkl')
			print "Wrote model."
			best_error = errors
			length_limit += 2
		else:
#			learning_rate = learning_rate / 2
#			batch_size = max(1,batch_size//2)
#			print "Learning rate:",learning_rate
			P.save('tmp.model.pkl')
		buffer_size = 256 / batch_size

		train_group_answers = data_io.randomise(group_answers)
		training_data = data_io.story_question_answer_idx(train_group_answers, vocab_in)
		training_data = (x for x in training_data if x[1].shape[0] <= length_limit)
		training_data = data_io.sortify(training_data, key=lambda x: x[1].shape[0])
		batched_training_data = data_io.batch(
Example #5
    best_cost = np.inf
    increase_count = 0
    seen = 0
    for epoch in xrange(max_epochs):
        print "Epoch:", epoch + 1
        print "Batch size:", batch_size

        # Run test on validation set
        data_stream = data_io.stream(data_file, char2id)
        test_stream = islice(data_stream, validation_count)
        test_cost = test(test_stream)
        print "Perplexity:", test_cost

        if test_cost < improvement_threshold * best_cost:
            best_cost = test_cost
            P.save(output_file)
            increase_count = 0
        else:
            increase_count += 1
            if increase_count > patience:
                break

        # Run training
        data_stream = data_io.randomise(data_stream, buffer_size=1024)
        data_stream = data_io.sortify(data_stream, key=lambda x: len(x), buffer_size=512)
        batch_data_stream = data_io.batch(data_stream, batch_size=batch_size)
        batch_data_stream = data_io.randomise(batch_data_stream)

        for batch in batch_data_stream:
            avg_cost = train(batch)
            if np.isnan(avg_cost):
Example #6
    _, hidden = lstm_layer(X_rep)
    output = T.nnet.softmax(T.dot(hidden, P.W_output))
    delay = 5
    label = X[:-delay]
    predicted = output[delay:]

    cost = -T.sum(T.log(predicted[T.arange(predicted.shape[0]), label]))
    params = P.values()
    gradients = T.grad(cost, wrt=params)

    update_methods = {
        'standard': [(p, p - 0.001 * g) for p, g in zip(params, gradients)],
        #			'rmsprop' : updates.rmsprop(params,gradients),
        #			'adadelta': updates.rmsprop(params,gradients),
    }
    P.save('init.pkl')
    for update_method in update_methods:
        print "Using update method:", update_method
        with open('train.%s.smart_init.log' % update_method, 'w') as log:

            train = theano.function(
                inputs=[X],
                outputs=cost,
                updates=update_methods[update_method],
            )

            P.load('init.pkl')

            while True:
                cost_val = train(
                    np.random.randint(0, 8, size=20).astype(np.int32))
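The cost above is the negative log-likelihood of the delayed labels under the softmax output: predicted[T.arange(n), label] selects, at each timestep, the probability assigned to the correct symbol. The same indexing trick in plain NumPy, as a quick sanity check:

import numpy as np

probs = np.array([[0.7, 0.2, 0.1],   # timestep 0
                  [0.1, 0.8, 0.1]])  # timestep 1
label = np.array([0, 1])             # correct symbol at each timestep
picked = probs[np.arange(2), label]  # -> array([0.7, 0.8])
nll = -np.sum(np.log(picked))        # ~0.580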
Example #7
        total = 0
        count = 0
        for chunk in stream:
            vals = test(chunk)
            loss = vals[0]
            # the original compared chunk.shape (a tuple) to 1, which is
            # always True in Python 2; the first dimension is what was meant:
            if chunk.shape[0] > 1:
                total += chunk.shape[0] * loss
                count += chunk.shape[0]
        print ' '.join(str(v) for v in vals),
        return total / count

    best_cost = np.inf
    for epoch in xrange(50):
        print "Epoch %d" % epoch,
        cost = validation()
        print cost,
        if cost < best_cost:
            print "Saving...."
            P.save('model.pkl')
            best_cost = cost
        else:
            print
        for chunk in data_stream():
            chunk_X.set_value(chunk)
            batches = int(math.floor(chunk.shape[0] / float(batch_size)))
            for i in xrange(batches):
                loss = train(i)
                print ' '.join(str(v) for v in loss)  # , show_betas()
                # pprint({p.name: g for p, g in zip(parameters, grad_norms)})
        P.save('unval_model.pkl')
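The validation loop above weights each chunk's average loss by the chunk's size, which recovers the true per-example mean when chunks vary in length. A quick numeric check of the weighting (sketch only):

losses = [0.5, 1.0]  # per-chunk average losses
sizes = [10, 30]     # examples per chunk
mean = sum(l * n for l, n in zip(losses, sizes)) / float(sum(sizes))
# (0.5 * 10 + 1.0 * 30) / 40 = 0.875, not the naive mean 0.75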
Example #8
	output = T.nnet.softmax(T.dot(hidden, P.W_output))
	delay = 5
	label = X[:-delay]
	predicted = output[delay:]

	cost = -T.sum(T.log(predicted[T.arange(predicted.shape[0]), label]))
	params = P.values()
	gradients = T.grad(cost, wrt=params)

	update_methods = {
			'standard': [ (p, p - 0.001 * g) for p,g in zip(params,gradients) ],
#			'rmsprop' : updates.rmsprop(params,gradients),
#			'adadelta': updates.rmsprop(params,gradients),
		}
	P.save('init.pkl')
	for update_method in update_methods:
		print "Using update method:", update_method
		with open('train.%s.smart_init.log' % update_method, 'w') as log:

			train = theano.function(
					inputs = [X],
					outputs = cost,
					updates = update_methods[update_method],
				)

			P.load('init.pkl')

			while True:
				cost_val = train(np.random.randint(0, 8, size=20).astype(np.int32))
				log.write("%0.5f\n" % cost_val)
Example #9
    while True:
        group_answers = data_io.group_answers(training_file)
        test_group_answers = islice(group_answers, test_instance_count)
        test_data = data_io.story_question_answer_idx(test_group_answers,
                                                      vocab_in)
        test_data = (x for x in test_data if x[1].shape[0] <= length_limit)
        tests = [
            np.array(test(input_data, idxs, question_data, ans_w, ans_evds),
                     dtype=np.float32)
            for input_data, idxs, question_data, ans_w, ans_evds in test_data
        ]
        errors = sum(tests) / len(tests)
        print "Error rate:", errors
        print "Starting epoch ", epoch
        if errors < best_error * 0.9:
            P.save('model.pkl')
            print "Wrote model."
            best_error = errors
            length_limit += 2
        else:
            #			learning_rate = learning_rate / 2
            #			batch_size = max(1,batch_size//2)
            #			print "Learning rate:",learning_rate
            P.save('tmp.model.pkl')
        buffer_size = 256 / batch_size

        train_group_answers = data_io.randomise(group_answers)
        training_data = data_io.story_question_answer_idx(
            train_group_answers, vocab_in)
        training_data = (x for x in training_data
                         if x[1].shape[0] <= length_limit)
        batched_stream = data_io.buffered_random(batched_stream, buffer_items=4)
        return batched_stream

    def validate():
        stream = data_io.stream_file('data/train.%02d.pklgz' % 0)
        stream = data_io.buffered_sort(stream, key=lambda x: x[1].shape[0], buffer_items=128)
        batched_stream = reader.batch_and_pad(stream, batch_size=32, mean=mean, std=std)

        total_cost = 0
        total_frames = 0
        for data, lengths in batched_stream:
            batch_avg_cost = test(data,lengths)
            batch_frames = np.sum(lengths)
            total_cost += batch_avg_cost * batch_frames
            total_frames += batch_frames
        return total_cost / total_frames

    import train_loop
    train_loop.run(
            data_iterator=stream,
            train_fun=lambda batch:train(batch[0],batch[1]),
            validation_score=validate,
            save_best_params=lambda:P.save('model.pkl'),
            load_best_params=lambda:P.load('model.pkl'),
            max_epochs=1000,
            patience=5000,
            patience_increase=2,
            improvement_threshold=0.999,
        )
