# Imports assumed by the test snippets below
import numpy
from numpy.testing import assert_equal, assert_raises
from fuel.datasets import IterableDataset
from fuel.schemes import ConstantScheme
from fuel.streams import DataStream
from fuel.transformers import Batch


def test_strictness_2(self):
    stream = DataStream(IterableDataset([1, 2, 3, 4, 5, 6]))
    transformer = Batch(stream, ConstantScheme(2), strictness=2)
    assert_equal(list(transformer.get_epoch_iterator()),
                 [(numpy.array([1, 2]),),
                  (numpy.array([3, 4]),),
                  (numpy.array([5, 6]),)])
def test_batch():
    stream = DataStream(IterableDataset([1, 2, 3, 4, 5]))
    wrapper = Batch(stream, ConstantScheme(2))
    batches = list(wrapper.get_epoch_iterator())
    expected = [(numpy.array([1, 2]),),
                (numpy.array([3, 4]),),
                (numpy.array([5]),)]
    assert len(batches) == len(expected)
    for b, e in zip(batches, expected):
        assert (b[0] == e[0]).all()

    # Check the `strictness` flag
    def try_strict(strictness):
        return list(Batch(stream, ConstantScheme(2), strictness=strictness)
                    .get_epoch_iterator())
    assert_raises(ValueError, try_strict, 2)
    assert len(try_strict(1)) == 2
    stream2 = DataStream(IterableDataset([1, 2, 3, 4, 5, 6]))
    assert len(list(Batch(stream2, ConstantScheme(2), strictness=2)
                    .get_epoch_iterator())) == 3
def test_strictness_2_error(self):
    stream = DataStream(IterableDataset([1, 2, 3, 4, 5]))
    transformer = Batch(stream, ConstantScheme(2), strictness=2)
    assert_raises(ValueError, list, transformer.get_epoch_iterator())
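For reference, a minimal standalone sketch (not taken from the test suite above) of how the three strictness settings of Fuel's Batch transformer behave when the number of examples does not divide the batch size; the toy data and prints are illustrative assumptions.

# Hedged sketch: Batch strictness levels, assuming the same Fuel API used in
# the tests above (DataStream, IterableDataset, ConstantScheme, Batch).
import numpy
from fuel.datasets import IterableDataset
from fuel.schemes import ConstantScheme
from fuel.streams import DataStream
from fuel.transformers import Batch

stream = DataStream(IterableDataset([1, 2, 3, 4, 5]))

# strictness=0 (default): the smaller final batch is kept -> [1,2], [3,4], [5]
print(list(Batch(stream, ConstantScheme(2)).get_epoch_iterator()))

# strictness=1: the incomplete final batch is dropped -> [1,2], [3,4]
print(list(Batch(stream, ConstantScheme(2), strictness=1).get_epoch_iterator()))

# strictness=2: a ValueError is raised because 5 examples do not divide by 2
try:
    list(Batch(stream, ConstantScheme(2), strictness=2).get_epoch_iterator())
except ValueError as e:
    print('strictness=2 raised:', e)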
from addition import AdditionTask
from fuel.transformers import Mapping, Batch
from fuel.schemes import ConstantScheme
from numpy import swapaxes


def _transpose(data):
    return tuple(swapaxes(array, 0, 1)
                 for array in data if len(array.shape) > 2)


dataset = AdditionTask(17)
data_stream = dataset.get_example_stream()
data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(14))
data_stream = Mapping(data_stream, _transpose)

print next(data_stream.get_epoch_iterator())[0].shape
def train_model(batch_size=100, n_h=50, n_epochs=40):
    # Load the datasets with Fuel
    dictionary = pkl.load(open(DICT_FILE, 'r'))
    dictionary['~'] = len(dictionary)
    reverse_mapping = dict((j, i) for i, j in dictionary.items())
    print("Loading the data")
    train = TextFile(files=[TRAIN_FILE],
                     dictionary=dictionary,
                     unk_token='~',
                     level='character',
                     preprocess=str.lower,
                     bos_token=None,
                     eos_token=None)
    train_stream = DataStream.default_stream(train)

    # Organize the data in batches and pad shorter sequences with zeros
    train_stream = Batch(train_stream,
                         iteration_scheme=ConstantScheme(batch_size))
    train_stream = Padding(train_stream)

    # Idem dito for the validation text
    val = TextFile(files=[VAL_FILE],
                   dictionary=dictionary,
                   unk_token='~',
                   level='character',
                   preprocess=str.lower,
                   bos_token=None,
                   eos_token=None)
    val_stream = DataStream.default_stream(val)

    # Organize the data in batches and pad shorter sequences with zeros
    val_stream = Batch(val_stream, iteration_scheme=ConstantScheme(batch_size))
    val_stream = Padding(val_stream)

    print('Building model')

    # Set the random number generator's seed for consistency
    rng = numpy.random.RandomState(12345)

    x = T.lmatrix('x')
    mask = T.matrix('mask')

    # Construct the LSTM layer
    recurrent_layer = LstmLayer(rng=rng, input=x, mask=mask, n_in=111, n_h=n_h)
    logreg_layer = LogisticRegression(input=recurrent_layer.output[:-1],
                                      n_in=n_h, n_out=111)

    cost = sequence_categorical_crossentropy(logreg_layer.p_y_given_x,
                                             x[1:],
                                             mask[1:]) / batch_size

    # Create a list of all model parameters to be fit by gradient descent
    params = logreg_layer.params + recurrent_layer.params

    # Create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # update_model is a function that updates the model parameters by SGD.
    # Since this model has many parameters, it would be tedious to manually
    # create an update rule for each one, so the updates list is built by
    # looping over all (params[i], grads[i]) pairs.
    learning_rate = 0.1
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]
    update_model = theano.function([x, mask], cost, updates=updates)

    evaluate_model = theano.function([x, mask], cost)

    # Define and compile a function for generating a sequence step by step.
    x_t = T.iscalar()
    h_p = T.vector()
    c_p = T.vector()
    h_t, c_t = recurrent_layer._step(T.ones(1), x_t, h_p, c_p)
    energy = T.dot(h_t, logreg_layer.W) + logreg_layer.b
    energy_exp = T.exp(energy - T.max(energy, 1)[:, None])
    output = energy_exp / energy_exp.sum(1)[:, None]
    single_step = theano.function([x_t, h_p, c_p], [output, h_t, c_t])

    start_time = time.clock()
    iteration = 0
    for epoch in range(n_epochs):
        print 'epoch:', epoch
        for x_, mask_ in train_stream.get_epoch_iterator():
            iteration += 1
            cross_entropy = update_model(x_.T, mask_.T)

            # Generate some text every 40 minibatches
            if iteration % 40 == 0:
                try:
                    prediction = numpy.ones(111, dtype=config.floatX) / 111.0
                    h_p = numpy.zeros((n_h,), dtype=config.floatX)
                    c_p = numpy.zeros((n_h,), dtype=config.floatX)
                    initial = 'the meaning of life is '
                    sentence = initial
                    for char in initial:
                        x_t = dictionary[char]
                        prediction, h_p, c_p = single_step(x_t, h_p.flatten(),
                                                           c_p.flatten())
                    sample = numpy.random.multinomial(1, prediction.flatten())
                    for i in range(450):
                        x_t = numpy.argmax(sample)
                        prediction, h_p, c_p = single_step(x_t, h_p.flatten(),
                                                           c_p.flatten())
                        sentence += reverse_mapping[x_t]
                        sample = numpy.random.multinomial(1,
                                                          prediction.flatten())
                    print 'LSTM: "' + sentence + '"'
                except ValueError:
                    print 'Something went wrong during sentence generation.'

            if iteration % 40 == 0:
                print 'epoch:', epoch, ' minibatch:', iteration
                val_scores = []
                for x_val, mask_val in val_stream.get_epoch_iterator():
                    val_scores.append(evaluate_model(x_val.T, mask_val.T))
                print 'Average validation CE per sentence:', numpy.mean(val_scores)

    end_time = time.clock()
    print('Optimization complete.')
    print('The code ran for %.2fm' % ((end_time - start_time) / 60.))
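The training loop above unpacks the padded stream into (data, mask) pairs and transposes them to time-major order before calling the Theano functions. A minimal sketch, assuming Fuel's Batch and Padding transformers and a made-up toy dataset, of the shapes such a stream yields:

# Hedged sketch: shapes produced by Batch + Padding on variable-length
# sequences; the toy data below is illustrative, not from the tutorial.
import numpy
from fuel.datasets import IterableDataset
from fuel.schemes import ConstantScheme
from fuel.streams import DataStream
from fuel.transformers import Batch, Padding

sequences = [numpy.arange(l, dtype='int64') for l in (3, 5, 2, 4)]
stream = DataStream(IterableDataset(sequences))
stream = Batch(stream, iteration_scheme=ConstantScheme(2))
stream = Padding(stream)  # adds a '*_mask' source of 0/1 floats

for data, mask in stream.get_epoch_iterator():
    # data: (batch_size, max_len), zero-padded
    # mask: same shape, 1.0 where data is real, 0.0 where padded
    print(data.shape, mask.shape)
    # the training code above feeds data.T / mask.T, i.e. time-major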
def main(mode, save_path, steps, num_batches, load_params):
    chars = (list(string.ascii_uppercase) + list(range(10)) +
             [' ', '.', ',', '\'', '"', '!', '?', '<UNK>'])
    char_to_ind = {char: i for i, char in enumerate(chars)}
    ind_to_char = {v: k for k, v in char_to_ind.iteritems()}

    train_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_train'],
                             char_to_ind, bos_token=None, eos_token=None,
                             level='character')
    valid_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_valid'],
                             char_to_ind, bos_token=None, eos_token=None,
                             level='character')
    vocab_size = len(char_to_ind)
    logger.info('Dictionary size: {}'.format(vocab_size))

    if mode == 'continue':
        continue_training(save_path)
        return
    elif mode == "sample":
        main_loop = load(open(save_path, "rb"))
        generator = main_loop.model.get_top_bricks()[-1]

        sample = ComputationGraph(generator.generate(
            n_steps=steps, batch_size=1, iterate=True)).get_theano_function()

        states, outputs, costs = [data[:, 0] for data in sample()]
        print("".join([ind_to_char[s] for s in outputs]))

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()

        trans_freqs = numpy.zeros((vocab_size, vocab_size), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        return

    # Experiment configuration
    batch_size = 20
    dim = 650
    feedback_dim = 650

    valid_stream = valid_dataset.get_example_stream()
    valid_stream = Batch(valid_stream,
                         iteration_scheme=ConstantScheme(batch_size))
    valid_stream = Padding(valid_stream)
    valid_stream = Mapping(valid_stream, _transpose)

    # Build the bricks and initialize them
    transition = GatedRecurrent(name="transition", dim=dim, activation=Tanh())
    generator = SequenceGenerator(
        Readout(readout_dim=vocab_size, source_names=transition.apply.states,
                emitter=SoftmaxEmitter(name="emitter"),
                feedback_brick=LookupFeedback(
                    vocab_size, feedback_dim, name='feedback'),
                name="readout"),
        transition,
        weights_init=Uniform(std=0.04), biases_init=Constant(0),
        name="generator")
    generator.push_initialization_config()
    transition.weights_init = Orthogonal()
    transition.push_initialization_config()
    generator.initialize()

    # Build the cost computation graph.
    features = tensor.lmatrix('features')
    features_mask = tensor.matrix('features_mask')
    cost_matrix = generator.cost_matrix(features, mask=features_mask)
    batch_cost = cost_matrix.sum()
    cost = aggregation.mean(batch_cost, features.shape[1])
    cost.name = "sequence_log_likelihood"
    char_cost = aggregation.mean(batch_cost, features_mask.sum())
    char_cost.name = 'character_log_likelihood'
    ppl = 2 ** (cost / numpy.log(2))
    ppl.name = 'ppl'
    bits_per_char = char_cost / tensor.log(2)
    bits_per_char.name = 'bits_per_char'
    length = features.shape[0]
    length.name = 'length'

    model = Model(batch_cost)
    if load_params:
        params = load_parameter_values(save_path)
        model.set_parameter_values(params)

    if mode == "train":
        # Give an idea of what's going on.
        logger.info("Parameters:\n" + pprint.pformat(
            [(key, value.get_value().shape) for key, value
             in Selector(generator).get_parameters().items()],
            width=120))

        train_stream = train_dataset.get_example_stream()
        train_stream = Mapping(train_stream, _truncate)
        train_stream = Batch(train_stream,
                             iteration_scheme=ConstantScheme(batch_size))
        train_stream = Padding(train_stream)
        train_stream = Mapping(train_stream, _transpose)

        parameters = model.get_parameter_dict()
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(parameters.values())
        algorithm = GradientDescent(
            cost=batch_cost, parameters=parameters.values(),
            step_rule=CompositeRule([StepClipping(1000.),
                                     AdaDelta(epsilon=1e-8)
                                     # , Restrict(VariableClipping(1.0, axis=0),
                                     #            maxnorm_subjects)
                                     ]))

        ft = features[:6, 0]
        ft.name = 'feature_example'

        observables = [cost, ppl, char_cost, length, bits_per_char]
        for name, param in parameters.items():
            num_elements = numpy.product(param.get_value().shape)
            norm = param.norm(2) / num_elements ** 0.5
            grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5
            step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5
            stats = tensor.stack(norm, grad_norm, step_norm,
                                 step_norm / grad_norm)
            stats.name = name + '_stats'
            observables.append(stats)

        track_the_best_bpc = TrackTheBest('valid_bits_per_char')
        root_path, extension = os.path.splitext(save_path)

        this_step_monitoring = TrainingDataMonitoring(
            observables + [ft], prefix="this_step", after_batch=True)
        average_monitoring = TrainingDataMonitoring(
            observables + [algorithm.total_step_norm,
                           algorithm.total_gradient_norm],
            prefix="average", every_n_batches=10)
        valid_monitoring = DataStreamMonitoring(
            observables, prefix="valid", every_n_batches=1500,
            before_training=False, data_stream=valid_stream)

        main_loop = MainLoop(
            algorithm=algorithm,
            data_stream=train_stream,
            model=model,
            extensions=[
                this_step_monitoring,
                average_monitoring,
                valid_monitoring,
                track_the_best_bpc,
                Checkpoint(save_path),
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"],
                           use_cpickle=True)
                    .add_condition(
                        ['after_epoch'],
                        OnLogRecord(track_the_best_bpc.notification_name),
                        (root_path + "_best" + extension,)),
                Timing(after_batch=True),
                Printing(every_n_batches=10),
                Plot(root_path,
                     [[average_monitoring.record_name(cost),
                       valid_monitoring.record_name(cost)],
                      [average_monitoring.record_name(algorithm.total_step_norm)],
                      [average_monitoring.record_name(algorithm.total_gradient_norm)],
                      [average_monitoring.record_name(ppl),
                       valid_monitoring.record_name(ppl)],
                      [average_monitoring.record_name(char_cost),
                       valid_monitoring.record_name(char_cost)],
                      [average_monitoring.record_name(bits_per_char),
                       valid_monitoring.record_name(bits_per_char)]],
                     every_n_batches=10)
            ])
        main_loop.run()

    elif mode == 'evaluate':
        with open('/data/lisatmp3/serdyuk/wsj_lms/lms/wsj_trigram_with_initial_eos/lexicon.txt') as f:
            raw_words = [line.split()[1:-1] for line in f.readlines()]
            words = [[char_to_ind[c] if c in char_to_ind else char_to_ind['<UNK>']
                      for c in w] for w in raw_words]
        max_word_length = max([len(w) for w in words])

        initial_states = tensor.matrix('init_states')
        cost_matrix_step = generator.cost_matrix(features, mask=features_mask,
                                                 states=initial_states)
        cg = ComputationGraph(cost_matrix_step)
        states = cg.auxiliary_variables[-2]

        compute_cost = theano.function(
            [features, features_mask, initial_states],
            [cost_matrix_step.sum(axis=0), states])

        cost_matrix = generator.cost_matrix(features, mask=features_mask)
        initial_cg = ComputationGraph(cost_matrix)
        initial_states = initial_cg.auxiliary_variables[-2]

        total_word_cost = 0
        num_words = 0
        examples = numpy.zeros((max_word_length + 1, len(words)),
                               dtype='int64')
        all_masks = numpy.zeros((max_word_length + 1, len(words)),
                                dtype=floatX)
        for i, word in enumerate(words):
            examples[:len(word), i] = word
            all_masks[:len(word), i] = 1.

        single_space = numpy.array([char_to_ind[' ']])[:, None]
        for batch in valid_stream.get_epoch_iterator():
            for example, mask in equizip(batch[0].T, batch[1].T):
                example = example[:(mask.sum())]
                spc_inds = list(numpy.where(example == char_to_ind[" "])[0])

                state = generator.transition.transition.initial_states_.get_value()[None, :]
                for i, j in equizip([-1] + spc_inds, spc_inds + [-1]):
                    word = example[(i + 1):j, None]

                    word_cost, states = compute_cost(
                        word, numpy.ones_like(word, dtype=floatX), state)
                    state = states[-1]

                    costs = numpy.exp(-compute_cost(
                        examples, all_masks,
                        numpy.tile(state, [examples.shape[1], 1]))[0])

                    _, space_states = compute_cost(
                        single_space,
                        numpy.ones_like(single_space, dtype=floatX), state)
                    state = space_states[-1]

                    word_prob = numpy.exp(-word_cost)
                    total_word_cost += word_cost + numpy.log(numpy.sum(costs))
                    num_words += 1
                    print(word_prob)
                    print(numpy.sum(costs))

        print("Average cost", total_word_cost / num_words)
        print("PPL", numpy.exp(total_word_cost / num_words))
        print("Word-level perplexity")
        print(total_word_cost / num_words)
    else:
        assert False
# Imports needed by the snippet below (added; the original only imported
# Flatten, Plot, AdditionTask, and swapaxes)
from fuel.transformers import Batch, Flatten, Mapping
from fuel.schemes import ConstantScheme
from extensions.plot import Plot
from datasets.addition import AdditionTask
from numpy import swapaxes
from theano import tensor
from blocks.bricks import Linear


def _transpose(data):
    return tuple(swapaxes(array, 0, 1) if len(array.shape) > 2 else array
                 for array in data)


dataset = AdditionTask(1000)
train_stream = dataset.get_example_stream()
train_stream = Batch(train_stream, iteration_scheme=ConstantScheme(10))
train_stream = Mapping(train_stream, _transpose)
features_test, targets_test = next(train_stream.get_epoch_iterator())

x = tensor.tensor3('features')
y = tensor.matrix('targets')

n_batchs = 1000
h_dim = 2
x_dim = 2

encode = Linear(name='encode', input_dim=x_dim, output_dim=h_dim)
gates = Linear(name='gates', input_dim=x_dim, output_dim=2 * h_dim)
def DStream(datatype, config):
    if datatype == 'train':
        filename = config['train_file']
        filename_morph = config['train_morph_file']
        filename_rel = config['train_rel_file']
    elif datatype == 'valid':
        filename = config['valid_file']
        filename_morph = config['valid_morph_file']
        filename_rel = config['valid_rel_file']
    elif datatype == 'test':
        filename = config['test_file']
        filename_morph = config['test_morph_file']
        filename_rel = config['test_rel_file']
    else:
        logger.error('wrong datatype, train, valid, or test')

    data = TextFile(files=[filename],
                    dictionary=pickle.load(open(config['train_dic'], 'rb')),
                    unk_token=config['unk_token'],
                    level='word',
                    bos_token=config['bos_token'],
                    eos_token=config['eos_token'])
    data_morph = TextFile(files=[filename_morph],
                          dictionary=pickle.load(open(config['train_morph_dic'], 'rb')),
                          unk_token=config['unk_token'],
                          level='word',
                          bos_token=config['bos_token'],
                          eos_token=config['eos_token'])

    data_stream = DataStream.default_stream(data)
    data_stream.sources = ('sentence',)
    data_morph_stream = DataStream.default_stream(data_morph)
    data_morph_stream.sources = ('sentence',)

    # Organize the data in batches and pad shorter sequences with zeros
    batch_size = config['batch_size']

    rels_stream = []
    with open(filename_rel, "r") as fin:
        lines = fin.readlines()
        i = 0
        while i < len(lines):
            if i + batch_size < len(lines):
                rels_stream.append(padding(lines[i: i + batch_size]))
                i = i + batch_size
            else:
                rels_stream.append(padding(lines[i: len(lines)]))
                i = i + batch_size

    data_stream = Batch(data_stream,
                        iteration_scheme=ConstantScheme(batch_size))
    data_stream = Padding(data_stream)
    data_morph_stream = Batch(data_morph_stream,
                              iteration_scheme=ConstantScheme(batch_size))
    data_morph_stream = Padding(data_morph_stream)

    data_morph_tensor3 = []
    mask_morph_tensor3 = []
    # data_morph_stream  : batch_num * batch * sentence
    # rels_stream        : batch_num * batch * sentence
    # data_morph_tensor3 : batch_num * batch * sentence * morph
    for data_morph_tuple, rel in zip(data_morph_stream.get_epoch_iterator(),
                                     rels_stream):
        data_morph, mask_morph = data_morph_tuple
        # data_morph : batch * sentence
        # rel        : batch * sentence
        tmp = []
        tmp_mask = []
        for m, mask, r in zip(data_morph, mask_morph, rel):
            # m : sentence
            # r : sentence
            start = 0
            tmp2 = []
            tmp_mask2 = []
            for idx in r:
                tmp2.append(m[start:start + idx].tolist())
                tmp_mask2.append(mask[start:start + idx].tolist())
                # print m[start:start+idx]
                start = start + idx
            # print len(tmp)
            # print padding2(tmp2)
            tmp.append(tmp2)
            tmp_mask.append(tmp_mask2)
            # print len(tmp), tmp
            # print m, r
            # print m.shape, r.shape
            # print padding2(tmp)
        data_morph_tensor3.append(np.array(padding2(tmp)))
        mask_morph_tensor3.append(np.array(padding2(tmp_mask), dtype='float32'))

    return data_stream, data_morph_tensor3, mask_morph_tensor3
def DStream(datatype, config):
    if datatype in ['train', 'valid', 'test']:
        filename = config[datatype + '_file']
        filename_morph = config[datatype + '_morph_file']
        filename_rel = config[datatype + '_rel_file']
    else:
        logger.error('wrong datatype, train, valid, or test')

    data_stream = getTextFile(filename, config['train_dic'], config)
    data_morph_stream = getTextFile(filename_morph, config['train_morph_dic'],
                                    config)

    # Organize the data in batches and pad shorter sequences with zeros
    batch_size = config['batch_size']

    rels_stream = []
    cnt = 0
    with open(filename_rel, "r") as fin:
        lines = fin.readlines()
        i = 0
        while i < len(lines):
            if i + batch_size < len(lines):
                rels_stream.append(padding(lines[i: i + batch_size]))
                i = i + batch_size
            else:
                rels_stream.append(padding(lines[i: len(lines)]))
                i = i + batch_size

    data_stream = Batch(data_stream,
                        iteration_scheme=ConstantScheme(batch_size))
    data_stream = Padding(data_stream)
    data_morph_stream = Batch(data_morph_stream,
                              iteration_scheme=ConstantScheme(batch_size))
    data_morph_stream = Padding(data_morph_stream)

    data_morph_tensor3 = []
    mask_morph_tensor3 = []
    # data_morph_stream  : batch_num * batch * sentence
    # rels_stream        : batch_num * batch * sentence
    # data_morph_tensor3 : batch_num * batch * sentence * morph
    cnt = 0
    for data_morph_tuple, rel in zip(data_morph_stream.get_epoch_iterator(),
                                     rels_stream):
        data_morph, mask_morph = data_morph_tuple
        # data_morph : batch * sentence
        # rel        : batch * sentence
        tmp = []
        tmp_mask = []
        for m, mask, r in zip(data_morph, mask_morph, rel):
            start = 0
            tmp2 = []
            tmp_mask2 = []
            for idx in r:
                tmp2.append(m[start:start + idx].tolist())
                tmp_mask2.append(mask[start:start + idx].tolist())
                # print m[start:start+idx]
                start = start + idx
            # print len(tmp)
            # print padding2(tmp2)
            tmp.append(tmp2)
            tmp_mask.append(tmp_mask2)
            # print len(tmp), tmp
            # print m.shape, r.shape
            # print padding2(tmp)
        data_morph_tensor3.append(np.array(padding2(tmp)))
        mask_morph_tensor3.append(np.array(padding2(tmp_mask), dtype='float32'))
        cnt += 1

    '''
    cnt = 0
    for a, b, c in zip(data_stream.get_epoch_iterator(), mask_morph_tensor3,
                       mask_morph_tensor3):
        data, mask = a
        if data.shape[1] != b.shape[1]:
            print data.shape, b.shape, c.shape
            cnt2 = 0
            for i, d in enumerate(data):
                if cnt2 == 42:
                    print i, len(d), d
                    dic2 = load_dic()
                    for key in d:
                        if key in dic2 and key != 0:
                            print dic2[key],
                cnt2 += 1
            print cnt
            # print data.shape, b[99]
            exit(0)
        print "###"
        cnt += 1
    exit(0)
    '''

    return data_stream, data_morph_tensor3, mask_morph_tensor3
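Both versions of DStream call two helpers, padding and padding2, whose definitions are not shown. The following is a hypothetical sketch inferred only from how they are called above (padding turns a batch of whitespace-separated relation lines into equal-length rows of ints; padding2 zero-pads a nested sentence/word/morph list structure so np.array can build a rectangular tensor); the real helpers may differ.

# Hypothetical sketches of the undefined helpers used by DStream above;
# they only match how the helpers are called, not any known implementation.
def padding(lines):
    # lines: relation lines, one sentence per line, whitespace-separated word
    # lengths (in morphemes); zero-pad every row to the same width
    rows = [[int(tok) for tok in line.split()] for line in lines]
    width = max(len(row) for row in rows)
    return [row + [0] * (width - len(row)) for row in rows]


def padding2(batch):
    # batch: list (sentences) of lists (words) of lists (morph ids); zero-pad
    # the inner two levels so np.array() yields a sentence x word x morph tensor
    max_words = max(len(sent) for sent in batch)
    morph_lens = [len(word) for sent in batch for word in sent]
    max_morphs = max(morph_lens) if morph_lens else 1
    padded = []
    for sent in batch:
        sent = [word + [0] * (max_morphs - len(word)) for word in sent]
        sent += [[0] * max_morphs] * (max_words - len(sent))
        padded.append(sent)
    return padded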
from numpy import swapaxes


def _transpose(data):
    return tuple(swapaxes(array, 0, 1) if len(array.shape) > 2 else array
                 for array in data)


dataset = AdditionTask(1000)
train_stream = dataset.get_example_stream()
train_stream = Batch(train_stream, iteration_scheme=ConstantScheme(10))
train_stream = Mapping(train_stream, _transpose)
features_test, targets_test = next(train_stream.get_epoch_iterator())

x = tensor.tensor3('features')
y = tensor.matrix('targets')

n_batchs = 1000
h_dim = 2
x_dim = 2

encode = Linear(name='encode', input_dim=x_dim, output_dim=h_dim)
gates = Linear(name='gates', input_dim=x_dim, output_dim=2 * h_dim)
# lstm = LSTM(activation=Tanh(),
#             dim=h_dim, name="lstm")
def train_model(seed=12345, model='rnn', batch_size=50, n_h=50, n_epochs=40,
                updater='sgd', lr=0.002, recThetaName='', outThetaName='',
                modelPath='', error_mark=40, fit_model=1, wght_sd=0.005,
                out_wght_sd=0.005, useLN=False, drop_p=0., grad_clip=0,
                patience=1, dyn_eval=0, funtype="identity", norm_max=-1.0):
    # Load in the feature dictionary
    dictionary = pkl.load(open(DICT_FILE, 'r'))
    # dictionary['~'] = len(dictionary)
    reverse_mapping = dict((j, i) for i, j in dictionary.items())
    n_in = len(dictionary)  # number of inputs is determined by lexicon size
    print(" > Input.dim = ", n_in)

    # Load the datasets with Fuel
    print(" > Loading train data: ", TRAIN_FILE)
    train = TextFile(files=[TRAIN_FILE],
                     dictionary=dictionary,
                     unk_token=None,
                     level='word',
                     preprocess=None,
                     bos_token=None,
                     eos_token=None)
    train_stream = DataStream.default_stream(train)  # get text-stream

    # Organize the data in batches and pad shorter sequences with zeros
    train_stream = Batch(train_stream,
                         iteration_scheme=ConstantScheme(batch_size))
    train_stream = Padding(train_stream)

    # Idem dito for the validation text
    print(" > Loading valid data: ", VAL_FILE)
    val = TextFile(files=[VAL_FILE],
                   dictionary=dictionary,
                   unk_token=None,
                   level='word',
                   preprocess=None,
                   bos_token=None,
                   eos_token=None)
    val_stream = DataStream.default_stream(val)

    # Organize the data in batches and pad shorter sequences with zeros
    val_stream = Batch(val_stream, iteration_scheme=ConstantScheme(batch_size))
    # Pad text-token sequences & user-id sequences
    val_stream = Padding(val_stream)

    print(' > Building model : ', model)

    # Set the random number generator's seeds for consistency
    numpy.random.seed(seed)
    random.seed(seed)
    rng = numpy.random.RandomState(seed)

    x = T.lmatrix('x')
    mask = T.matrix('mask')
    dmask = T.tensor3('dmask')  # drop-out mask

    # Construct the recurrent layer
    if model == 'drnn':
        recurrent_layer = DeltaRNN.DeltaRNNLayer(
            rng=rng, input=x, mask=mask, dmask=dmask, n_in=n_in, n_h=n_h,
            sd=wght_sd, useLN=useLN, drop_p=drop_p, funtype=funtype,
            useFanInFanOut=useFanInFanOut, useOrtho=useOrtho,
            drop_inner=drop_inner)
    elif model == 'gru':
        recurrent_layer = GRU.GRULayer(
            rng=rng, input=x, mask=mask, dmask=dmask, n_in=n_in, n_h=n_h,
            sd=wght_sd, useLN=useLN, drop_p=drop_p,
            useFanInFanOut=useFanInFanOut, useOrtho=useOrtho)
    else:
        raise Exception(" Model not understood: ", model)

    if len(recThetaName) > 0:
        # Load in any pre-built parameters for the recurrent layer
        print(" >> Loading old params for recurrent-layer: ", recThetaName)
        recurrent_layer.load(filename=recThetaName)

    print(" Using fan-in-fan-out for softmax weights? {0}".format(useFanInFanOut))
    logreg_layer = MaxEnt.LogisticRegression(
        input=recurrent_layer.output[:-1], n_in=n_h, n_out=n_in,
        sd=out_wght_sd, useFanInFanOut=useFanInFanOut, useOrtho=useOrtho)
    if len(outThetaName) > 0:
        # Load in any pre-built parameters for the output layer
        print(" >> Loading old params for output-layer: ", outThetaName)
        logreg_layer.load(filename=outThetaName)

    cost = Util.sequence_categorical_crossentropy(logreg_layer.p_y_given_x,
                                                  x[1:],
                                                  mask[1:]) / batch_size

    # Create a list of all model parameters to be fit by gradient descent
    params = logreg_layer.params + recurrent_layer.params

    # Create a list of gradients for all model parameters
    if grad_clip > 0.0:
        print(" >> Clipping grads of magnitude > ", grad_clip)
        # grads = T.grad(cost, params, disconnected_inputs='ignore')
        # grad_lst = [T.sum((grad / float(batch_size)) ** 2) for grad in grads]
        # grad_norm = T.sqrt(T.sum(grad_lst))
        # all_grads = ifelse(T.gt(grad_norm, max_norm),
        #                    [grads * (max_norm / grad_norm) for grads in all_grads],
        #                    all_grads)
        grads = T.grad(theano.gradient.grad_clip(cost, -1 * grad_clip, grad_clip),
                       params, disconnected_inputs='ignore')
    else:
        grads = T.grad(cost, params, disconnected_inputs='ignore')

    # Set up update rules for model parameters
    print(" Clipping param norms to max of {0}".format(norm_max))
    # learning_rate = lr
    learning_rate = T.scalar('learning_rate', dtype=theano.config.floatX)
    if updater == 'sgd':  # use classical SGD
        updates = [(param_i, param_i - learning_rate * grad_i)
                   for param_i, grad_i in zip(params, grads)]
    elif updater == 'adam':  # use the Adam adaptive learning rate update rule
        grads = OrderedDict(zip(params, grads))
        updates = UpdateRule.Adam(grads, learning_rate, norm_max=norm_max)
    elif updater == 'rmsprop':  # use the RMSprop adaptive learning rate update rule
        grads = OrderedDict(zip(params, grads))
        updates = UpdateRule.RMSprop(grads, learning_rate, norm_max=norm_max)
    else:
        raise Exception("Updater not understood: ", updater)

    update_model = theano.function([x, mask, dmask, learning_rate], cost,
                                   updates=updates)  # , allow_input_downcast=True
    evaluate_model = theano.function([x, mask, dmask], cost)  # , allow_input_downcast=True

    numParams = recurrent_layer.getNumParams() + logreg_layer.getNumParams()
    print(" -> Number Parameters = ", numParams)

    start_time = time.clock()
    iteration = 0
    print(" Random.NLL = ", numpy.log(1. * n_in))
    best_nll = -1.0

    if fit_model == 1:  # FIT MODEL TO THE DATA IF FLAG RAISED
        logfd = open(modelPath + "performance.csv", 'wb')
        writer = csv.writer(logfd)
        writer.writerow(["Epoch", "AVG_NLL", "BPC", "AVG_PPL"])

        # Get initial scores before any training
        val_scores = []
        N = 0
        nll = 0.
        for x_val, x_mask in val_stream.get_epoch_iterator():
            # 3D tensor shape: variable dim x batch-size dim x time-window dim
            # d_mask = Utils.create_ones(n_h, x_mask.shape[0], x_mask.shape[1])
            d_mask = Utils.create_zone_out_mask(rng, n_h, x_mask.shape[0],
                                                x_mask.shape[1], drop_p,
                                                sample=False)
            batch_score = evaluate_model(x_val.T, x_mask.T, d_mask.T)
            # batch_score = 0.0
            nll += batch_score * x_mask.shape[0]
            val_scores.append(batch_score)
            N += numpy.sum(x_mask)
        nll = nll / N
        ce = numpy.mean(val_scores)
        ppl = numpy.exp(nll)
        print(' >> Epoch = {0} NLL = {1} log2(NLL) = {2} exp(NLL) = {3} '.format(
            -1, nll, (nll / math.log(2)), ppl))
        writer.writerow([0, nll, (nll / math.log(2)), ppl])
        logfd.flush()
        best_nll = nll
        best_epoch = -1

        impatience = 0
        l_r = lr  # set the initial learning rate
        for epoch in range(n_epochs):
            print('Epoch:', epoch)
            improve_flag = False
            for x_, x_mask_ in train_stream.get_epoch_iterator():
                iteration += 1
                # print("\r {0} mini-batches seen...".format(iteration), end='')
                # d_mask = Utils.create_ones(n_h, x_mask_.shape[0], x_mask_.shape[1])
                d_mask = Utils.create_zone_out_mask(rng, n_h, x_mask_.shape[0],
                                                    x_mask_.shape[1], drop_p)
                cross_entropy = update_model(x_.T, x_mask_.T, d_mask.T, l_r)
                # print("\r {0} mini-batches seen CE = {1}".format(iteration, cross_entropy), end='')
                # print(" {0} --> {1} mini-batches seen CE = {2}".format(epoch, iteration, cross_entropy))

                if iteration % error_mark == 0:
                    # print('epoch:', epoch, ' minibatch:', iteration)
                    val_scores = []
                    N = 0
                    nll = 0.
                    for x_val, x_mask in val_stream.get_epoch_iterator():
                        if x_val.size > 0:  # as long as the sample sequence is non-empty
                            # d_mask = Utils.create_ones(n_h, x_mask.shape[0], x_mask.shape[1])
                            d_mask = Utils.create_zone_out_mask(
                                rng, n_h, x_mask.shape[0], x_mask.shape[1],
                                drop_p, sample=False)
                            batch_score = evaluate_model(x_val.T, x_mask.T,
                                                         d_mask.T)
                            nll += batch_score * x_mask.shape[0]  # un-normalize mini-batch scores
                            val_scores.append(batch_score)
                            N += numpy.sum(x_mask)
                    nll = nll / N
                    ce = numpy.mean(val_scores)
                    ppl = numpy.exp(nll)
                    writer.writerow([epoch, nll, (nll / math.log(2)), ppl])
                    logfd.flush()
                    if nll < best_nll:
                        best_nll = nll
                        best_epoch = epoch
                        print(" >> Saving best model at epoch {0} with NLL = {1}".format(
                            best_epoch, best_nll))
                        # Check-point save at end of epoch
                        recSave = "{0}rec-params-best-{1}".format(modelPath, epoch)
                        recurrent_layer.save(recSave)
                        outSave = "{0}out-params-best-{1}".format(modelPath, epoch)
                        logreg_layer.save(outSave)
                        # Save best params so far
                        recSave = "{0}rec-params-best".format(modelPath, epoch)
                        recurrent_layer.save(recSave)
                        outSave = "{0}out-params-best".format(modelPath, epoch)
                        logreg_layer.save(outSave)
                        improve_flag = True  # raise flag: improvement was observed
                    print(' >> Epoch = {0} Avg.NLL = {1} (Best = {5}) Avg.BPC = {2} PPL = {3} Iter = {4}'.format(
                        epoch, nll, (nll / math.log(2)), ppl, iteration, best_nll))

                    # Adapt the learning rate based on the patience schedule
                    if improve_flag is False:
                        if patience > 0:
                            # only positive/non-zero patience values are
                            # considered (otherwise this option is turned off)
                            impatience += 1
                            if impatience >= patience:
                                l_r = (numpy.maximum(1e-4, l_r * l_r_decay)).astype(
                                    theano.config.floatX)
                                print(" __Decreasing learning rate to ", l_r)
                                impatience = 0
                    improve_flag = False

            # Evaluate generalization at the end of the epoch
            if iteration % error_mark != 0:
                # this if-stmt avoids redundant evaluation computation
                val_scores = []
                N = 0
                nll = 0.
                for x_val, x_mask in val_stream.get_epoch_iterator():
                    if x_val.size > 0:  # as long as the sample sequence is non-empty
                        # d_mask = Utils.create_ones(n_h, x_mask.shape[0], x_mask.shape[1])
                        d_mask = Utils.create_zone_out_mask(
                            rng, n_h, x_mask.shape[0], x_mask.shape[1],
                            drop_p, sample=False)
                        batch_score = evaluate_model(x_val.T, x_mask.T, d_mask.T)
                        nll += batch_score * x_mask.shape[0]
                        val_scores.append(batch_score)
                        N += numpy.sum(x_mask)
                nll = nll / N
                ce = numpy.mean(val_scores)
                ppl = numpy.exp(nll)
                writer.writerow([(epoch + 1), nll, (nll / math.log(2)), ppl])
                logfd.flush()

                # Check-point
                recSave = "{0}rec-params-end-{1}".format(modelPath, epoch)
                recurrent_layer.save(recSave)
                outSave = "{0}out-params-end-{1}".format(modelPath, epoch)
                logreg_layer.save(outSave)

                if nll < best_nll:
                    best_nll = nll
                    best_epoch = epoch
                    print(" >> Saving best model at epoch {0} with NLL = {1}".format(
                        best_epoch, best_nll))
                    # Check-point save at end of epoch
                    recSave = "{0}rec-params-best-{1}".format(modelPath, epoch)
                    recurrent_layer.save(recSave)
                    outSave = "{0}out-params-best-{1}".format(modelPath, epoch)
                    logreg_layer.save(outSave)
                    # Save best params so far
                    recSave = "{0}rec-params-best".format(modelPath, epoch)
                    recurrent_layer.save(recSave)
                    outSave = "{0}out-params-best".format(modelPath, epoch)
                    logreg_layer.save(outSave)
                    improve_flag = True
                print(' >> Epoch = {0} Avg.NLL = {1} (Best = {4}) Avg.BPC = {2} PPL = {3} '.format(
                    epoch, nll, (nll / math.log(2)), ppl, best_nll))

                # Adapt the learning rate based on the patience schedule
                '''
                if improve_flag is False:
                    if patience > 0:
                        impatience += 1
                        if impatience >= patience:
                            l_r = (numpy.maximum(0.00001, l_r / 2.0)).astype(theano.config.floatX)
                            print(" __Decreasing learning rate to ", l_r)
                            impatience = 0
                '''

        print("")
        print(' > Optimization complete.')
        print(' >>>> Best NLL = {0} at Epoch {1}'.format(best_nll, best_epoch))
        end_time = time.clock()
        print(' > The code ran for %.2fm' % ((end_time - start_time) / 60.))
        print('---------------------------------------')
        logfd.close()
    else:
        print(' > Skipping model fit directly to evaluation...')

    print(' > dynamic eval code := ', dyn_eval)

    # EVALUATION-ONLY
    print(' > FINAL.VALID := ', VAL_FILE)
    val_scores = []
    N = 0
    nll = 0.
    l_r = lr  # set the initial learning rate
    for x_val, x_mask in val_stream.get_epoch_iterator():
        # print(' type = ', type(x_val))
        # x_val = x_val.astype(numpy.int64)
        # print(x_val)
        # print(' = ', (x_val).size)
        if x_val.size > 0:  # as long as the sample sequence is non-empty
            if dyn_eval > 0:
                d_mask = Utils.create_zone_out_mask(rng, n_h, x_mask.shape[0],
                                                    x_mask.shape[1], drop_p)
                batch_score = update_model(x_val.T, x_mask.T, d_mask.T, l_r)
            else:
                # d_mask = Utils.create_ones(n_h, x_mask.shape[0], x_mask.shape[1])
                d_mask = Utils.create_zone_out_mask(rng, n_h, x_mask.shape[0],
                                                    x_mask.shape[1], drop_p,
                                                    sample=False)
                batch_score = evaluate_model(x_val.T, x_mask.T, d_mask.T)
            # print(" B{0} vs X{1} N{2}".format(batch_size, x_mask.shape[0], numpy.sum(x_mask)))
            nll += batch_score * batch_size  # * x_mask.shape[0]  # un-normalize mini-batch scores
            val_scores.append(batch_score)
            N += numpy.sum(x_mask)
            print('\r NLL.tmp = {0} over {1}'.format((nll / N), N), end='')
    print('')
    nll = nll / N
    ce = numpy.mean(val_scores)
    ppl = numpy.exp(nll)
    print(' > FINAL.VALID: Avg.NLL = {0} Avg.BPC = {1} PPL = {2} N.tokens = {3}'.format(
        nll, (nll / math.log(2)), ppl, N))

    if dyn_eval > 1:
        dyn_eval = 0
        # Evaluate the model on the training data as well (as a measure of overfitting)
        print(' > FINAL.TRAIN := ', TRAIN_FILE)
        val_scores = []
        N = 0
        nll = 0.
        for x_val, x_mask in train_stream.get_epoch_iterator():
            if x_val.size > 0:  # as long as the sample sequence is non-empty
                if dyn_eval > 0:
                    d_mask = Utils.create_zone_out_mask(rng, n_h,
                                                        x_mask.shape[0],
                                                        x_mask.shape[1], drop_p)
                    batch_score = update_model(x_val.T, x_mask.T, d_mask.T, l_r)
                else:
                    # d_mask = Utils.create_ones(n_h, x_mask.shape[0], x_mask.shape[1])
                    d_mask = Utils.create_zone_out_mask(rng, n_h,
                                                        x_mask.shape[0],
                                                        x_mask.shape[1], drop_p,
                                                        sample=False)
                    batch_score = evaluate_model(x_val.T, x_mask.T, d_mask.T)
                nll += batch_score * batch_size  # * x_mask.shape[0]  # un-normalize mini-batch scores
                val_scores.append(batch_score)
                N += numpy.sum(x_mask)
                print('\r NLL.tmp = {0} over {1}'.format((nll / N), N), end='')
        print('')
        nll = nll / N
        ce = numpy.mean(val_scores)
        ppl = numpy.exp(nll)
        print(' > FINAL.TRAIN: Avg.NLL = {0} Avg.BPC = {1} PPL = {2} N.tokens = {3}'.format(
            nll, (nll / math.log(2)), ppl, N))

    end_time = time.clock()
    print(' > Final Evaluation complete.')
    print(' > The code ran for %.2fm' % ((end_time - start_time) / 60.))
    print('---------------------------------------')

    recSave = modelPath + "rec-params"
    print(' > Saving model.recurrent params to disk: ', recSave)
    recurrent_layer.save(recSave)
    outSave = modelPath + "out-params"
    print(' > Saving model.output params to disk: ', outSave)
    logreg_layer.save(outSave)
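train_model repeatedly calls Utils.create_zone_out_mask, which is not defined in this snippet. Below is a hypothetical sketch of such a mask generator, inferred only from its call signature and the "variable dim x batch-size dim x time-window dim" comment above; the actual helper may behave differently.

# Hypothetical sketch of Utils.create_zone_out_mask; shape and semantics are
# assumptions, not taken from the real Utils module.
import numpy

def create_zone_out_mask(rng, n_h, batch_size, n_steps, drop_p, sample=True):
    # Mask laid out as (hidden units, batch, time); the caller transposes it.
    shape = (n_h, batch_size, n_steps)
    if drop_p <= 0.:
        return numpy.ones(shape, dtype='float32')
    if sample:
        # Training: sample a binary keep/drop mask per unit and time step
        return rng.binomial(n=1, p=1. - drop_p, size=shape).astype('float32')
    # Evaluation: use the expected value of the mask instead of sampling
    return numpy.full(shape, 1. - drop_p, dtype='float32')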