import itertools
import unittest

import numpy
import theano
from numpy.testing import assert_allclose
from theano import tensor

from blocks.bricks import Tanh
from blocks.bricks.recurrent import (
    SimpleRecurrent, Bidirectional, RecurrentStack, RECURRENTSTACK_SEPARATOR)
from blocks.initialization import Constant, Identity, Orthogonal


class TestBidirectionalStack(unittest.TestCase):
    def setUp(self):
        prototype = SimpleRecurrent(dim=3, activation=Tanh())
        self.layers = [
            Bidirectional(weights_init=Orthogonal(), prototype=prototype)
            for _ in range(3)]
        self.stack = RecurrentStack(self.layers)
        for fork in self.stack.forks:
            fork.weights_init = Identity(1)
            fork.biases_init = Constant(0)
        self.stack.initialize()

        self.x_val = 0.1 * numpy.asarray(
            list(itertools.permutations(range(4))),
            dtype=theano.config.floatX)
        self.x_val = (numpy.ones((24, 4, 3), dtype=theano.config.floatX) *
                      self.x_val[..., None])
        self.mask_val = numpy.ones((24, 4), dtype=theano.config.floatX)
        self.mask_val[12:24, 3] = 0

    def test_steps(self):
        x = tensor.tensor3('x')
        mask = tensor.matrix('mask')

        calc_stack_layers = [
            theano.function([x, mask], self.stack.apply(x, mask=mask)[i])
            for i in range(len(self.layers))]
        stack_layers = [
            f(self.x_val, self.mask_val) for f in calc_stack_layers]

        h_val = self.x_val
        for stack_layer_value, bidir_net in zip(stack_layers, self.layers):
            calc = theano.function([x, mask], bidir_net.apply(x, mask=mask))
            simple_layer_value = calc(h_val, self.mask_val)
            assert_allclose(stack_layer_value, simple_layer_value, rtol=1e-04)
            h_val = simple_layer_value[..., :3]

    def test_dims(self):
        self.assertEqual(self.stack.get_dim("inputs"), 3)
        for i in range(len(self.layers)):
            state_name = self.stack.suffix("states", i)
            self.assertEqual(self.stack.get_dim(state_name), 6)
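
# Added commentary (not part of the original test file): because every fork in
# setUp is initialised with Identity weights and zero biases, level i of the
# RecurrentStack effectively receives the forward half of level i-1's output.
# test_steps therefore checks that each level of the stacked Bidirectional
# agrees numerically with applying the corresponding stand-alone Bidirectional
# brick to that sliced input (hence the `[..., :3]` slice).  test_dims checks
# that each level reports a state dimension of 6, i.e. twice the prototype's
# dim=3, since Bidirectional concatenates forward and backward states.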


def test_suffix():
    # level >= 0 !!
    level1, = numpy.random.randint(1, 150, size=(1,))
    # name1 != "mask" !!
    name1 = "somepart"
    test_cases = [("mask", level1, "mask"),
                  ("{name}", 0, "{name}"),
                  ("{name}", level1, "{name}{sep}{level}")]
    for _name, level, _expected_result in test_cases:
        name = _name.format(name=name1, level=level1,
                            sep=RECURRENTSTACK_SEPARATOR)
        expected_result = _expected_result.format(
            name=name1, level=level1, sep=RECURRENTSTACK_SEPARATOR)
        result = RecurrentStack.suffix(name, level)
        assert result == expected_result, \
            "expected suffix(\"{}\",{}) -> \"{}\" got \"{}\"".format(
                name, level, expected_result, result)
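
# Illustrative summary of the naming convention exercised above (added
# commentary; the separator value is an assumption based on Blocks, where
# RECURRENTSTACK_SEPARATOR is '#'):
#
#   RecurrentStack.suffix("states", 0)  -> "states"    # level 0 keeps the bare name
#   RecurrentStack.suffix("states", 2)  -> "states#2"  # deeper levels get "#<level>"
#   RecurrentStack.suffix("mask", 2)    -> "mask"      # the mask is shared, never suffixed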


def main(name, epochs, batch_size, learning_rate,
         dim, mix_dim, old_model_name, max_length, bokeh, GRU, dropout,
         depth, max_grad, step_method, epsilon, sample, skip, uniform, top):

    #----------------------------------------------------------------------
    datasource = name

    def shnum(x):
        """Convert a positive float into a short tag-usable string.

        E.g.: 0 -> 0, 0.005 -> 53, 100 -> 1-2
        """
        return '0' if x <= 0 else '%s%d' % (("%e"%x)[0],
                                            -np.floor(np.log10(x)))

    jobname = "%s-%dX%dm%dd%dr%sb%de%s" % (datasource, depth, dim, mix_dim,
                                           int(dropout*10),
                                           shnum(learning_rate), batch_size,
                                           shnum(epsilon))
    if max_length != 600:
        jobname += '-L%d'%max_length

    if GRU:
        jobname += 'g'
    if max_grad != 5.:
        jobname += 'G%g'%max_grad
    if step_method != 'adam':
        jobname += step_method
    if skip:
        jobname += 'D'
        assert depth > 1
    if top:
        jobname += 'T'
        assert depth > 1
    if uniform > 0.:
        jobname += 'u%d'%int(uniform*100)

    # `debug` (like `floatX` below) is a module-level flag defined elsewhere
    # in the original script.
    if debug:
        jobname += ".debug"

    if sample:
        print("Sampling")
    else:
        print("\nRunning experiment %s" % jobname)
    if old_model_name:
        print("starting from model %s"%old_model_name)

    #----------------------------------------------------------------------
    transitions = [GatedRecurrent(dim=dim) if GRU else LSTM(dim=dim)
                   for _ in range(depth)]
    if depth > 1:
        transition = RecurrentStack(transitions, name="transition",
                                    skip_connections=skip or top)
        if skip:
            source_names = [RecurrentStack.suffix('states', d)
                            for d in range(depth)]
        else:
            source_names = [RecurrentStack.suffix('states', depth-1)]
    else:
        transition = transitions[0]
        transition.name = "transition"
        source_names = ['states']

    emitter = SketchEmitter(mix_dim=mix_dim,
                            epsilon=epsilon,
                            name="emitter")
    readout = Readout(
        readout_dim=emitter.get_dim('inputs'),
        source_names=source_names,
        emitter=emitter,
        name="readout")
    generator = SequenceGenerator(readout=readout, transition=transition)

    # Initialization settings
    if uniform > 0.:
        generator.weights_init = Uniform(width=uniform*2.)
    else:
        generator.weights_init = OrthogonalGlorot()
    generator.biases_init = Constant(0)

    # Build the cost computation graph [steps, batch_size, 3]
    x = T.tensor3('features', dtype=floatX)
    if debug:
        x.tag.test_value = np.ones((max_length, batch_size, 3)).astype(floatX)
    x = x[:max_length, :, :]  # has to be after setting test_value
    cost = generator.cost(x)
    cost.name = "sequence_log_likelihood"

    # Give an idea of what's going on
    model = Model(cost)
    params = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, value.get_value().shape) for key, value
                     in params.items()],
                    width=120))
    model_size = 0
    for v in params.values():
        s = v.get_value().shape
        model_size += s[0] * (s[1] if len(s) > 1 else 1)
    logger.info("Total number of parameters %d"%model_size)

    #------------------------------------------------------------
    extensions = []
    if old_model_name:
        if old_model_name == 'continue':
            old_model_name = jobname
        with open(old_model_name + '_model', "rb") as f:
            old_model = pickle.load(f)
        model.set_parameter_values(old_model.get_parameter_values())
        del old_model
    else:
        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

    if sample:
        assert old_model_name and old_model_name != 'continue'
        Sample(generator, steps=max_length, path=old_model_name).do(None)
        exit(0)

    #------------------------------------------------------------
    # Define the training algorithm.
    cg = ComputationGraph(cost)
    if dropout > 0.:
        from blocks.roles import INPUT, OUTPUT
        # dropout is applied to the state outputs of every transition layer
        dropout_target = VariableFilter(roles=[OUTPUT],
                                        bricks=transitions,
                                        name_regex='states')(cg.variables)
        print('# dropout %d' % len(dropout_target))
        cg = apply_dropout(cg, dropout_target, dropout)
        opt_cost = cg.outputs[0]
    else:
        opt_cost = cost

    if step_method == 'adam':
        step_rule = Adam(learning_rate)
    elif step_method == 'rmsprop':
        step_rule = RMSProp(learning_rate, decay_rate=0.95)
    elif step_method == 'adagrad':
        step_rule = AdaGrad(learning_rate)
    elif step_method == 'adadelta':
        step_rule = AdaDelta()
    elif step_method == 'scale':
        step_rule = Scale(learning_rate)
    else:
        raise Exception('Unknown step method %s'%step_method)

    step_rule = CompositeRule([StepClipping(max_grad), step_rule])

    algorithm = GradientDescent(
        cost=opt_cost, parameters=cg.parameters,
        step_rule=step_rule)

    #------------------------------------------------------------
    observables = [cost]

    # Fetch variables useful for debugging
    (energies,) = VariableFilter(
        applications=[generator.readout.readout],
        name_regex="output")(cg.variables)
    min_energy = energies.min().copy(name="min_energy")
    max_energy = energies.max().copy(name="max_energy")
    observables += [min_energy, max_energy]

    # (activations,) = VariableFilter(
    #     applications=[generator.transition.apply],
    #     name=generator.transition.apply.states[0])(cg.variables)
    # mean_activation = named_copy(abs(activations).mean(),
    #                              "mean_activation")
    # observables.append(mean_activation)

    observables += [algorithm.total_step_norm, algorithm.total_gradient_norm]
    for name, param in params.items():
        observables.append(param.norm(2).copy(
            name=name + "_norm"))
        observables.append(algorithm.gradients[param].norm(2).copy(
            name=name + "_grad_norm"))

    #------------------------------------------------------------
    datasource_fname = os.path.join(fuel.config.data_path[0], datasource,
                                    datasource + '.hdf5')

    train_ds = H5PYDataset(datasource_fname,  # max_length=max_length,
                           which_sets=['train'], sources=('features',),
                           load_in_memory=True)
    train_stream = DataStream(train_ds,
                              iteration_scheme=ShuffledScheme(
                                  train_ds.num_examples, batch_size))

    test_ds = H5PYDataset(datasource_fname,  # max_length=max_length,
                          which_sets=['test'], sources=('features',),
                          load_in_memory=True)
    test_stream = DataStream(test_ds,
                             iteration_scheme=SequentialScheme(
                                 test_ds.num_examples, batch_size))

    train_stream = Mapping(train_stream, _transpose)
    test_stream = Mapping(test_stream, _transpose)

    def stream_stats(ds, label):
        itr = ds.get_epoch_iterator(as_dict=True)
        batch_count = 0
        examples_count = 0
        for batch in itr:
            batch_count += 1
            examples_count += batch['features'].shape[1]
        print('%s #batch %d #examples %d' % (label, batch_count,
                                             examples_count))
    stream_stats(train_stream, 'train')
    stream_stats(test_stream, 'test')

    extensions += [Timing(every_n_batches=10),
                   TrainingDataMonitoring(
                       observables, prefix="train",
                       every_n_batches=10),
                   DataStreamMonitoring(
                       [cost],  # without dropout
                       test_stream,
                       prefix="test",
                       on_resumption=True,
                       after_epoch=False,  # by default this is True
                       every_n_batches=100),

                   # all monitored data is ready so print it...
                   # (next steps may take more time and we want to see the
                   # results as soon as possible so print as soon as you can)
                   Printing(every_n_batches=10),

                   # perform multiple dumps at different intervals
                   # so if one of them breaks (has nan) we can hopefully
                   # find a model from few batches ago in the other
                   Checkpoint(jobname, before_training=False,
                              after_epoch=True,
                              save_separately=['log', 'model']),

                   Sample(generator, steps=max_length,
                          path=jobname+'.test',
                          every_n_batches=100),

                   ProgressBar(),

                   FinishAfter(after_n_epochs=epochs)
                   # This shows a way to handle NaN emerging during
                   # training: simply finish it.
                   .add_condition(["after_batch"], _is_nan),
                   ]

    if bokeh:
        from blocks.extras.extensions.plot import Plot
        extensions.append(Plot(
            'sketch', channels=[['cost']], every_n_batches=10))

    # Construct the main loop and start training!
    main_loop = MainLoop(
        model=model,
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=extensions
        )

    main_loop.run()
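
# Hypothetical driver for main() above, added as a sketch: the original
# script's command-line parsing is not shown in this excerpt, so the dataset
# name and every hyper-parameter value below is an illustrative assumption,
# not the original interface.  It expects a Fuel-style HDF5 file at
# <fuel data_path>/<name>/<name>.hdf5 with a 'features' source; module-level
# globals used by main() (e.g. `debug`, `floatX`, `logger`, `_transpose`,
# `_is_nan`) are assumed to be defined earlier in the original script.
if __name__ == "__main__":
    main(name="handwriting",
         epochs=100, batch_size=56, learning_rate=1e-4,
         dim=700, mix_dim=20, old_model_name=None, max_length=600,
         bokeh=False, GRU=False, dropout=0., depth=1, max_grad=5.,
         step_method='adam', epsilon=1e-5, sample=False, skip=False,
         uniform=0., top=False)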