import numpy
import theano
from numpy.testing import assert_allclose, assert_raises
from theano import tensor

from blocks.bricks import MLP, Tanh
from blocks.model import Model


def test_model():
    x = tensor.matrix('x')
    mlp1 = MLP([Tanh(), Tanh()], [10, 20, 30], name="mlp1")
    mlp2 = MLP([Tanh()], [30, 40], name="mlp2")
    h1 = mlp1.apply(x)
    h2 = mlp2.apply(h1)

    model = Model(h2)
    assert model.get_top_bricks() == [mlp1, mlp2]
    # The order of parameters returned is deterministic but not sensible.
    assert list(model.get_parameter_dict().items()) == [
        ('/mlp2/linear_0.b', mlp2.linear_transformations[0].b),
        ('/mlp1/linear_1.b', mlp1.linear_transformations[1].b),
        ('/mlp1/linear_0.b', mlp1.linear_transformations[0].b),
        ('/mlp1/linear_0.W', mlp1.linear_transformations[0].W),
        ('/mlp1/linear_1.W', mlp1.linear_transformations[1].W),
        ('/mlp2/linear_0.W', mlp2.linear_transformations[0].W)]

    # Test getting and setting parameter values
    mlp3 = MLP([Tanh()], [10, 10])
    mlp3.allocate()
    model3 = Model(mlp3.apply(x))
    parameter_values = {
        '/mlp/linear_0.W': 2 * numpy.ones((10, 10),
                                          dtype=theano.config.floatX),
        '/mlp/linear_0.b': 3 * numpy.ones(10, dtype=theano.config.floatX)}
    model3.set_parameter_values(parameter_values)
    assert numpy.all(
        mlp3.linear_transformations[0].parameters[0].get_value() == 2)
    assert numpy.all(
        mlp3.linear_transformations[0].parameters[1].get_value() == 3)
    got_parameter_values = model3.get_parameter_values()
    assert len(got_parameter_values) == len(parameter_values)
    for name, value in parameter_values.items():
        assert_allclose(value, got_parameter_values[name])

    # Test that an exception is raised if parameter shapes don't match
    def helper():
        parameter_values = {
            '/mlp/linear_0.W': 2 * numpy.ones((11, 11),
                                              dtype=theano.config.floatX),
            '/mlp/linear_0.b': 3 * numpy.ones(11,
                                              dtype=theano.config.floatX)}
        model3.set_parameter_values(parameter_values)
    assert_raises(ValueError, helper)

    # Test name conflict handling
    mlp4 = MLP([Tanh()], [10, 10])

    def helper():
        Model(mlp4.apply(mlp3.apply(x)))
    assert_raises(ValueError, helper)
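# The '/mlp2/linear_0.b'-style keys asserted above come from Blocks'
# hierarchical parameter naming: /<brick name>/<child brick>.<parameter>.
# A minimal sketch of that convention (the brick name and sizes here are
# illustrative only):
from theano import tensor
from blocks.bricks import MLP, Tanh
from blocks.model import Model

mlp = MLP([Tanh()], [5, 5], name="demo")
model = Model(mlp.apply(tensor.matrix('x')))
print(sorted(model.get_parameter_dict().keys()))
# Expected (up to ordering): ['/demo/linear_0.W', '/demo/linear_0.b']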
def main():
    import configurations
    from stream import DStream

    logger = logging.getLogger(__name__)
    cfig = getattr(configurations, 'get_config_penn')()

    rnnlm = Rnnlm(cfig['vocabsize'], cfig['nemb'], cfig['nhids'])
    rnnlm.weights_init = IsotropicGaussian(0.1)
    rnnlm.biases_init = Constant(0.)
    rnnlm.push_initialization_config()
    rnnlm.generator.transition.weights_init = Orthogonal()

    sentence = tensor.lmatrix('sentence')
    sentence_mask = tensor.matrix('sentence_mask')
    batch_cost = rnnlm.cost(sentence, sentence_mask).sum()
    batch_size = sentence.shape[1].copy(name='batch_size')
    cost = aggregation.mean(batch_cost, batch_size)
    cost.name = "sequence_log_likelihood"
    logger.info("Cost graph is built")

    model = Model(cost)
    parameters = model.get_parameter_dict()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, value.get_value().shape)
                     for key, value in parameters.items()],
                    width=120))

    for brick in model.get_top_bricks():
        brick.initialize()

    cg = ComputationGraph(cost)
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

    gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
    step_norm = aggregation.mean(algorithm.total_step_norm)
    monitored_vars = [cost, gradient_norm, step_norm]
    train_monitor = TrainingDataMonitoring(variables=monitored_vars,
                                           after_batch=True,
                                           before_first_epoch=True,
                                           prefix='tra')

    extensions = [train_monitor, Timing(),
                  Printing(after_batch=True),
                  FinishAfter(after_n_epochs=1000)]

    train_stream = DStream(datatype='train', config=cfig)
    main_loop = MainLoop(model=model, data_stream=train_stream,
                         algorithm=algorithm, extensions=extensions)
    main_loop.run()
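# `configurations.get_config_penn` and `stream.DStream` are project-specific
# and not shown here. A hypothetical stand-in for the configuration, with
# made-up sizes, covering only the keys the script above reads (DStream
# also receives the same dict):
def get_config_penn():
    return {
        'vocabsize': 10000,  # assumed vocabulary size
        'nemb': 256,         # assumed embedding dimension
        'nhids': 512,        # assumed recurrent state dimension
    }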
def main(mode, save_path, num_batches, data_path=None):
    reverser = WordReverser(100, len(char2code), name="reverser")

    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        data_stream = Mapping(data_stream, reverse_words,
                              add_sources=("targets",))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(
            chars, chars_mask, targets, targets_mask).sum()
        batch_size = chars.shape[1].copy(name="batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        parameters = model.get_parameter_dict()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape)
                         for key, value in parameters.items()],
                        width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

        # Fetch variables useful for debugging
        generator = reverser.generator
        (energies,) = VariableFilter(
            applications=[generator.readout.readout],
            name_regex="output")(cg.variables)
        (activations,) = VariableFilter(
            applications=[generator.transition.apply],
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = chars.shape[0].copy(name="max_length")
        cost_per_character = aggregation.mean(
            batch_cost, batch_size * max_length).copy(
                name="character_log_likelihood")
        min_energy = energies.min().copy(name="min_energy")
        max_energy = energies.max().copy(name="max_energy")
        mean_activation = abs(activations).mean().copy(
            name="mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        for name, parameter in parameters.items():
            observables.append(parameter.norm(2).copy(name + "_norm"))
            observables.append(algorithm.gradients[parameter].norm(2).copy(
                name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(
            observables, prefix="average", every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition(["after_batch"], _is_nan),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "sample" or mode == "beam_search":
        chars = tensor.lmatrix("input")
        generated = reverser.generate(chars)
        model = Model(generated)
        logger.info("Loading the model..")
        model.set_parameter_values(load_parameter_values(save_path))

        def generate(input_):
            """Generate output sequences for an input sequence.

            Encapsulates most of the difference between sampling and beam
            search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                samples, = VariableFilter(
                    applications=[reverser.generator.generate],
                    name="outputs")(
                        ComputationGraph(generated[1]))
                # NOTE: this will recompile beam search functions
                # every time the user presses Enter. Do not create
                # a new `BeamSearch` object every time if
                # speed is important for you.
                beam_search = BeamSearch(samples)
                outputs, costs = beam_search.search(
                    {chars: input_}, char2code['</S>'],
                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                outputs = list(outputs.T)
                costs = list(costs.T)
                for i in range(len(outputs)):
                    outputs[i] = list(outputs[i])
                    try:
                        true_length = outputs[i].index(
                            char2code['</S>']) + 1
                    except ValueError:
                        true_length = len(outputs[i])
                    outputs[i] = outputs[i][:true_length]
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs

        while True:
            try:
                line = input("Enter a sentence\n")
                message = ("Enter the number of samples\n" if mode == "sample"
                           else "Enter the beam size\n")
                batch_size = int(input(message))
            except EOFError:
                break
            except Exception:
                traceback.print_exc()
                continue

            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)

            samples, costs = generate(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))
            messages = []
            for sample, cost in equizip(samples, costs):
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
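# The pipeline and extensions above reference helpers defined elsewhere in
# the example (_lower, _filter_long, _transpose, _is_nan, reverse_words,
# char2code). A minimal sketch of plausible definitions for the first four;
# reverse_words (which reverses each word of the encoded sentence to make
# the targets) is omitted, and a hypothetical char2code appears later in
# this section. The length threshold below is an assumption.
import math


def _lower(s):
    # Dataset preprocessing: lowercase the raw text.
    return s.lower()


def _filter_long(data):
    # Drop overly long sentences.
    return len(data[0]) <= 100


def _transpose(data):
    # Fuel batches are (batch, time); Blocks recurrent bricks expect
    # (time, batch), so transpose every array in the batch.
    return tuple(array.T for array in data)


def _is_nan(log):
    # Predicate for FinishAfter.add_condition: stop training as soon as
    # the monitored total gradient norm becomes NaN.
    return math.isnan(log.current_row['total_gradient_norm'])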
def main(name, epochs, batch_size, learning_rate,
         dim, mix_dim, old_model_name, max_length, bokeh, GRU, dropout,
         depth, max_grad, step_method, epsilon, sample, skip, uniform, top):
    #----------------------------------------------------------------------
    datasource = name

    def shnum(x):
        """Convert a positive float into a short tag-usable string.

        E.g.: 0 -> 0, 0.005 -> 53, 100 -> 1-2

        """
        return '0' if x <= 0 else '%s%d' % (
            ("%e" % x)[0], -np.floor(np.log10(x)))

    jobname = "%s-%dX%dm%dd%dr%sb%de%s" % (
        datasource, depth, dim, mix_dim, int(dropout * 10),
        shnum(learning_rate), batch_size, shnum(epsilon))
    if max_length != 600:
        jobname += '-L%d' % max_length

    if GRU:
        jobname += 'g'
    if max_grad != 5.:
        jobname += 'G%g' % max_grad
    if step_method != 'adam':
        jobname += step_method
    if skip:
        jobname += 'D'
        assert depth > 1
    if top:
        jobname += 'T'
        assert depth > 1
    if uniform > 0.:
        jobname += 'u%d' % int(uniform * 100)

    if debug:
        jobname += ".debug"

    if sample:
        print("Sampling")
    else:
        print("\nRunning experiment %s" % jobname)
    if old_model_name:
        print("starting from model %s" % old_model_name)

    #----------------------------------------------------------------------
    transitions = [GatedRecurrent(dim=dim) if GRU else LSTM(dim=dim)
                   for _ in range(depth)]
    if depth > 1:
        transition = RecurrentStack(transitions, name="transition",
                                    fast=True, skip_connections=skip or top)
        if skip:
            source_names = (['states'] +
                            ['states_%d' % d for d in range(1, depth)])
        else:
            source_names = ['states_%d' % (depth - 1)]
    else:
        transition = transitions[0]
        transition.name = "transition"
        source_names = ['states']

    emitter = SketchEmitter(mix_dim=mix_dim, epsilon=epsilon, name="emitter")
    readout = Readout(
        readout_dim=emitter.get_dim('inputs'),
        source_names=source_names,
        emitter=emitter,
        name="readout")
    normal_inputs = [name for name in transition.apply.sequences
                     if 'mask' not in name]
    fork = Fork(normal_inputs, prototype=Linear(use_bias=True))
    generator = SequenceGenerator(readout=readout, transition=transition,
                                  fork=fork)

    # Initialization settings
    if uniform > 0.:
        generator.weights_init = Uniform(width=uniform * 2.)
    else:
        generator.weights_init = OrthogonalGlorot()
    generator.biases_init = Constant(0)

    # Build the cost computation graph [steps, batch_size, 3]
    x = T.tensor3('features', dtype=floatX)
    if debug:
        x.tag.test_value = np.ones((max_length, batch_size, 3)).astype(floatX)
    x = x[:max_length, :, :]  # has to be after setting test_value
    cost = generator.cost(x)
    cost.name = "sequence_log_likelihood"

    # Give an idea of what's going on
    model = Model(cost)
    params = model.get_params()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, value.get_value().shape)
                     for key, value in params.items()],
                    width=120))
    model_size = 0
    for v in params.itervalues():
        s = v.get_value().shape
        model_size += s[0] * (s[1] if len(s) > 1 else 1)
    logger.info("Total number of parameters %d" % model_size)

    #------------------------------------------------------------
    extensions = []
    if old_model_name == 'continue':
        extensions.append(LoadFromDump(jobname))
    elif old_model_name:
        # or you can just load the weights without state using:
        old_params = LoadFromDump(old_model_name).manager.load_parameters()
        model.set_param_values(old_params)
    else:
        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

    if sample:
        assert old_model_name and old_model_name != 'continue'
        Sample(generator, steps=max_length, path=old_model_name).do(None)
        exit(0)

    #------------------------------------------------------------
    # Define the training algorithm.
    cg = ComputationGraph(cost)
    if dropout > 0.:
        from blocks.roles import INPUT, OUTPUT
        dropout_target = VariableFilter(
            roles=[OUTPUT], bricks=transitions,
            name_regex='states')(cg.variables)
        print('# dropout %d' % len(dropout_target))
        cg = apply_dropout(cg, dropout_target, dropout)
        opt_cost = cg.outputs[0]
    else:
        opt_cost = cost

    if step_method == 'adam':
        step_rule = Adam(learning_rate)
    elif step_method == 'rmsprop':
        step_rule = RMSProp(learning_rate, decay_rate=0.95)
    elif step_method == 'adagrad':
        step_rule = AdaGrad(learning_rate)
    elif step_method == 'adadelta':
        step_rule = AdaDelta()
    elif step_method == 'scale':
        step_rule = Scale(learning_rate)
    else:
        raise Exception('Unknown step method %s' % step_method)

    step_rule = CompositeRule([StepClipping(max_grad), step_rule])

    algorithm = GradientDescent(
        cost=opt_cost, params=cg.parameters,
        step_rule=step_rule)

    #------------------------------------------------------------
    observables = [cost]

    # Fetch variables useful for debugging
    (energies,) = VariableFilter(
        applications=[generator.readout.readout],
        name_regex="output")(cg.variables)
    min_energy = named_copy(energies.min(), "min_energy")
    max_energy = named_copy(energies.max(), "max_energy")
    observables += [min_energy, max_energy]

    # (activations,) = VariableFilter(
    #     applications=[generator.transition.apply],
    #     name=generator.transition.apply.states[0])(cg.variables)
    # mean_activation = named_copy(abs(activations).mean(),
    #                              "mean_activation")
    # observables.append(mean_activation)

    observables += [algorithm.total_step_norm, algorithm.total_gradient_norm]
    for name, param in params.items():
        observables.append(named_copy(
            param.norm(2), name + "_norm"))
        observables.append(named_copy(
            algorithm.gradients[param].norm(2), name + "_grad_norm"))

    #------------------------------------------------------------
    datasource_fname = os.path.join(fuel.config.data_path, datasource,
                                    datasource + '.hdf5')

    train_ds = H5PYDataset(datasource_fname,  # max_length=max_length,
                           which_set='train', sources=('features',),
                           load_in_memory=True)
    train_stream = DataStream(train_ds, iteration_scheme=ShuffledScheme(
        train_ds.num_examples, batch_size))

    test_ds = H5PYDataset(datasource_fname,  # max_length=max_length,
                          which_set='test', sources=('features',),
                          load_in_memory=True)
    test_stream = DataStream(test_ds, iteration_scheme=SequentialScheme(
        test_ds.num_examples, batch_size))

    train_stream = Mapping(train_stream, _transpose)
    test_stream = Mapping(test_stream, _transpose)

    def stream_stats(ds, label):
        itr = ds.get_epoch_iterator(as_dict=True)
        batch_count = 0
        examples_count = 0
        for batch in itr:
            batch_count += 1
            examples_count += batch['features'].shape[1]
        print('%s #batch %d #examples %d' %
              (label, batch_count, examples_count))

    stream_stats(train_stream, 'train')
    stream_stats(test_stream, 'test')

    extensions += [
        Timing(every_n_batches=10),
        TrainingDataMonitoring(observables, prefix="train",
                               every_n_batches=10),
        DataStreamMonitoring(
            [cost],  # without dropout
            test_stream,
            prefix="test",
            on_resumption=True,
            after_epoch=False,  # by default this is True
            every_n_batches=100),
        # all monitored data is ready so print it...
        # (next steps may take more time and we want to see the
        # results as soon as possible so print as soon as you can)
        Printing(every_n_batches=10),
        # perform multiple dumps at different intervals
        # so if one of them breaks (has nan) we can hopefully
        # find a model from a few batches ago in the other
        Dump(jobname, every_n_batches=11),
        Dump(jobname + '.test', every_n_batches=100),
        Sample(generator, steps=max_length,
               path=jobname + '.test', every_n_batches=100),
        ProgressBar(),
        FinishAfter(after_n_epochs=epochs)
        # This shows a way to handle NaN emerging during
        # training: simply finish it.
        .add_condition("after_batch", _is_nan),
        ]

    if bokeh:
        from blocks.extensions.plot import Plot
        extensions.append(Plot('sketch', channels=[['cost']]))

    # Construct the main loop and start training!
    main_loop = MainLoop(
        model=model,
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=extensions)
    main_loop.run()
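# OrthogonalGlorot, SketchEmitter, Sample, Dump and LoadFromDump are defined
# in the surrounding project. As one example, OrthogonalGlorot presumably
# combines orthogonal directions with Glorot-style scaling; a hypothetical
# sketch (the project's real definition may differ) that mainly shows how a
# custom initializer plugs into the weights_init slot:
import numpy
import theano
from blocks.initialization import NdarrayInitialization, Orthogonal


class OrthogonalGlorot(NdarrayInitialization):
    def __init__(self):
        self._orth = Orthogonal()

    def generate(self, rng, shape):
        if len(shape) == 1:
            # Vectors (e.g. initial states) get zeros in this sketch.
            return numpy.zeros(shape, dtype=theano.config.floatX)
        W = self._orth.generate(rng, shape)
        return W * numpy.sqrt(2.0 / sum(shape))  # assumed Glorot factor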
# Build the cost computation graph
chars = tensor.lmatrix("features")
chars_mask = tensor.matrix("features_mask")
targets = tensor.lmatrix("targets")
targets_mask = tensor.matrix("targets_mask")
batch_cost = m.cost(chars, chars_mask, targets, targets_mask).sum()
batch_size = chars.shape[1].copy(name="batch_size")
cost = aggregation.mean(batch_cost, batch_size)
cost.name = "sequence_log_likelihood"
print("Cost graph is built", file=sys.stderr)

model = Model(cost)
parameters = model.get_parameter_dict()

for brick in model.get_top_bricks():
    brick.initialize()

cg = ComputationGraph(cost)
algo = GradientDescent(cost=cost, parameters=cg.parameters,
                       step_rule=CompositeRule(
                           [StepClipping(10.0), Scale(0.01)]))
# algo = RMSProp(learning_rate=1.0, decay_rate=0.9)

max_length = chars.shape[0].copy(name="max_length")
observables = [
    batch_size, max_length,
    algo.total_step_norm, algo.total_gradient_norm,
    vocabs = pkl.load(f)
word_vocab, rel_vocab = vocabs['word'], vocabs['rel']

with open('dataset/trainXY.json') as f:
    train = json.load(f)
    train = wrap_stream(train)
with open('dataset/testXY.json') as f:
    test = json.load(f)
    test = wrap_stream(test)

model = LSTMModel(len(vocabs['word']), n_mem, len(vocabs['rel']))
cg = ComputationGraph(model.cost)
bricks_model = Model(model.cost)
for brick in bricks_model.get_top_bricks():
    brick.initialize()
model.lookup.W.set_value(
    vocabs['word'].get_embeddings().astype(theano.config.floatX))

if dropout:
    pass
    # logger.info('Applying dropout of {}'.format(dropout))
    # lstm_dropout = [v for v in cg.intermediary_variables
    #                 if v.name in {'W_cell_to_in', 'W_cell_to_out'}]
    # cg = apply_dropout(cg, lstm_dropout, drop_prob=dropout)

# summary of what's going on
parameters = bricks_model.get_parameter_dict()
logger.info("Parameters:\n" +
            pprint.pformat(
                [(key, value.get_value().shape, value.get_value().mean())
                 for key, value in parameters.items()],
def main(mode, save_path, num_batches, data_path=None):
    reverser = WordReverser(100, len(char2code), name="reverser")

    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        data_stream = Mapping(data_stream, reverse_words,
                              add_sources=("targets",))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(
            chars, chars_mask, targets, targets_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        params = model.get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape)
                         for key, value in params.items()],
                        width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(
            cost=cost, params=cg.parameters,
            step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

        # Fetch variables useful for debugging
        generator = reverser.generator
        (energies,) = VariableFilter(
            application=generator.readout.readout,
            name="output")(cg.variables)
        (activations,) = VariableFilter(
            application=generator.transition.apply,
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        mean_activation = named_copy(abs(activations).mean(),
                                     "mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        for name, param in params.items():
            observables.append(named_copy(
                param.norm(2), name + "_norm"))
            observables.append(named_copy(
                algorithm.gradients[param].norm(2), name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(
            observables, prefix="average", every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition("after_batch", _is_nan),
                Plot(os.path.basename(save_path),
                     [[average_monitoring.record_name(cost)],
                      [average_monitoring.record_name(cost_per_character)]],
                     every_n_batches=10),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "sample" or mode == "beam_search":
        chars = tensor.lmatrix("input")
        generated = reverser.generate(chars)
        model = Model(generated)
        logger.info("Loading the model..")
        model.set_param_values(load_parameter_values(save_path))

        def generate(input_):
            """Generate output sequences for an input sequence.

            Encapsulates most of the difference between sampling and beam
            search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                samples, = VariableFilter(
                    bricks=[reverser.generator], name="outputs")(
                        ComputationGraph(generated[1]))
                # NOTE: this will recompile beam search functions
                # every time the user presses Enter. Do not create
                # a new `BeamSearch` object every time if
                # speed is important for you.
                beam_search = BeamSearch(input_.shape[1], samples)
                outputs, costs = beam_search.search(
                    {chars: input_}, char2code['</S>'],
                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                outputs = list(outputs.T)
                costs = list(costs.T)
                for i in range(len(outputs)):
                    outputs[i] = list(outputs[i])
                    try:
                        true_length = outputs[i].index(
                            char2code['</S>']) + 1
                    except ValueError:
                        true_length = len(outputs[i])
                    outputs[i] = outputs[i][:true_length]
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs

        while True:
            line = input("Enter a sentence\n")
            message = ("Enter the number of samples\n" if mode == "sample"
                       else "Enter the beam size\n")
            batch_size = int(input(message))

            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)

            samples, costs = generate(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))
            messages = []
            for sample, cost in equizip(samples, costs):
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
def main(name, epochs, batch_size, learning_rate,
         dim, mix_dim, old_model_name, max_length, bokeh, GRU, dropout,
         depth, max_grad, step_method, epsilon, sample):
    #----------------------------------------------------------------------
    datasource = name

    def shnum(x):
        """Convert a positive float into a short tag-usable string.

        E.g.: 0 -> 0, 0.005 -> 53, 100 -> 1-2

        """
        return '0' if x <= 0 else '%s%d' % (
            ("%e" % x)[0], -np.floor(np.log10(x)))

    jobname = "%s-%dX%dm%dd%dr%sb%de%s" % (
        datasource, depth, dim, mix_dim, int(dropout * 10),
        shnum(learning_rate), batch_size, shnum(epsilon))
    if max_length != 600:
        jobname += '-L%d' % max_length

    if GRU:
        jobname += 'g'
    if max_grad != 5.:
        jobname += 'G%g' % max_grad
    if step_method != 'adam':
        jobname += step_method

    if sample:
        print("Sampling")
    else:
        print("\nRunning experiment %s" % jobname)

    #----------------------------------------------------------------------
    if depth > 1:
        transition = LSTMstack(dim=dim, depth=depth, name="transition",
                               lstm_name="transition")
        assert not GRU
    elif GRU:
        transition = GatedRecurrent(dim=dim, name="transition")
    else:
        transition = LSTM(dim=dim, name="transition")

    emitter = SketchEmitter(mix_dim=mix_dim, epsilon=epsilon, name="emitter")
    readout = Readout(readout_dim=emitter.get_dim('inputs'),
                      source_names=['states'],
                      emitter=emitter,
                      name="readout")
    normal_inputs = [name for name in transition.apply.sequences
                     if 'mask' not in name]
    fork = Fork(normal_inputs, prototype=Linear(use_bias=True))
    generator = SequenceGenerator(readout=readout, transition=transition,
                                  fork=fork)

    # Initialization settings
    generator.weights_init = OrthogonalGlorot()
    generator.biases_init = Constant(0)

    # Build the cost computation graph [steps, batch_size, 3]
    x = T.tensor3('features', dtype=floatX)[:max_length, :, :]
    x.tag.test_value = np.ones((max_length, batch_size, 3)).astype(np.float32)
    cost = generator.cost(x)
    cost.name = "sequence_log_likelihood"

    # Give an idea of what's going on
    model = Model(cost)
    params = model.get_params()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, value.get_value().shape)
                     for key, value in params.items()],
                    width=120))
    model_size = 0
    for v in params.itervalues():
        s = v.get_value().shape
        model_size += s[0] * (s[1] if len(s) > 1 else 1)
    logger.info("Total number of parameters %d" % model_size)

    #------------------------------------------------------------
    extensions = []
    if old_model_name == 'continue':
        extensions.append(LoadFromDump(jobname))
    elif old_model_name:
        # or you can just load the weights without state using:
        old_params = LoadFromDump(old_model_name).manager.load_parameters()
        model.set_param_values(old_params)
    else:
        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

    if sample:
        assert old_model_name and old_model_name != 'continue'
        Sample(generator, steps=max_length, path='.').do(None)
        exit(0)

    #------------------------------------------------------------
    # Define the training algorithm.
    cg = ComputationGraph(cost)
    if dropout > 0.:
        from blocks.roles import INPUT, OUTPUT
        dropout_target = VariableFilter(
            roles=[OUTPUT], bricks=[transition],
            name_regex='states')(cg.variables)
        cg = apply_dropout(cg, dropout_target, dropout)
        cost = cg.outputs[0]

    if step_method == 'adam':
        step_rule = Adam(learning_rate)
    elif step_method == 'rmsprop':
        step_rule = RMSProp(learning_rate, decay_rate=0.95)
    elif step_method == 'adagrad':
        step_rule = AdaGrad(learning_rate)
    elif step_method == 'adadelta':
        step_rule = AdaDelta()
    elif step_method == 'scale':
        step_rule = Scale(learning_rate=0.1)
    else:
        raise Exception('Unknown step method %s' % step_method)

    step_rule = CompositeRule([StepClipping(max_grad), step_rule])

    algorithm = GradientDescent(cost=cost, params=cg.parameters,
                                step_rule=step_rule)

    #------------------------------------------------------------
    observables = [cost]

    # Fetch variables useful for debugging
    (energies,) = VariableFilter(
        applications=[generator.readout.readout],
        name_regex="output")(cg.variables)
    (activations,) = VariableFilter(
        applications=[generator.transition.apply],
        name=generator.transition.apply.states[0])(cg.variables)
    min_energy = named_copy(energies.min(), "min_energy")
    max_energy = named_copy(energies.max(), "max_energy")
    mean_activation = named_copy(abs(activations).mean(), "mean_activation")
    observables += [min_energy, max_energy, mean_activation]
    observables += [algorithm.total_step_norm, algorithm.total_gradient_norm]
    for name, param in params.items():
        observables.append(named_copy(param.norm(2), name + "_norm"))
        observables.append(named_copy(
            algorithm.gradients[param].norm(2), name + "_grad_norm"))

    #------------------------------------------------------------
    datasource_fname = os.path.join(fuel.config.data_path, datasource,
                                    datasource + '.hdf5')

    train_ds = H5PYDataset(datasource_fname,  # max_length=max_length,
                           which_set='train', sources=('features',),
                           load_in_memory=True)
    train_stream = DataStream(train_ds, iteration_scheme=ShuffledScheme(
        train_ds.num_examples, batch_size))

    test_ds = H5PYDataset(datasource_fname,  # max_length=max_length,
                          which_set='test', sources=('features',),
                          load_in_memory=True)
    test_stream = DataStream(test_ds, iteration_scheme=SequentialScheme(
        test_ds.num_examples, batch_size))

    train_stream = Mapping(train_stream, _transpose)
    test_stream = Mapping(test_stream, _transpose)

    def stream_stats(ds, label):
        itr = ds.get_epoch_iterator(as_dict=True)
        batch_count = 0
        examples_count = 0
        for batch in itr:
            batch_count += 1
            examples_count += batch['features'].shape[1]
        print('%s #batch %d #examples %d' %
              (label, batch_count, examples_count))

    stream_stats(train_stream, 'train')
    stream_stats(test_stream, 'test')

    extensions += [
        Timing(every_n_batches=10),
        TrainingDataMonitoring(observables, prefix="train",
                               every_n_batches=10),
        DataStreamMonitoring(
            [cost],
            test_stream,
            prefix="test",
            on_resumption=True,
            after_epoch=False,  # by default this is True
            every_n_batches=100),
        # all monitored data is ready so print it...
        # (next steps may take more time and we want to see the
        # results as soon as possible so print as soon as you can)
        Printing(every_n_batches=10),
        # perform multiple dumps at different intervals
        # so if one of them breaks (has nan) we can hopefully
        # find a model from a few batches ago in the other
        Dump(jobname, every_n_batches=11),
        Dump(jobname + '.test', every_n_batches=100),
        Sample(generator, steps=max_length,
               path=jobname + '.test', every_n_batches=100),
        ProgressBar(),
        FinishAfter(after_n_epochs=epochs)
        # This shows a way to handle NaN emerging during
        # training: simply finish it.
        .add_condition("after_batch", _is_nan),
        ]

    if bokeh:
        extensions.append(Plot('sketch', channels=[['cost']]))

    # Construct the main loop and start training!
    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()
def main(mode, save_path, num_batches, data_path=None):
    # Experiment configuration
    dimension = 100
    readout_dimension = len(char2code)

    # Build bricks
    encoder = Bidirectional(SimpleRecurrent(dim=dimension, activation=Tanh()),
                            weights_init=Orthogonal())
    fork = Fork([name for name in encoder.prototype.apply.sequences
                 if name != 'mask'],
                weights_init=IsotropicGaussian(0.1),
                biases_init=Constant(0))
    fork.input_dim = dimension
    fork.output_dims = {name: dimension for name in fork.input_names}
    lookup = LookupTable(readout_dimension, dimension,
                         weights_init=IsotropicGaussian(0.1))
    transition = SimpleRecurrent(activation=Tanh(), dim=dimension,
                                 name="transition")
    attention = SequenceContentAttention(state_names=transition.apply.states,
                                         sequence_dim=2 * dimension,
                                         match_dim=dimension,
                                         name="attention")
    readout = LinearReadout(readout_dim=readout_dimension,
                            source_names=["states"],
                            emitter=SoftmaxEmitter(name="emitter"),
                            feedbacker=LookupFeedback(readout_dimension,
                                                      dimension),
                            name="readout")
    generator = SequenceGenerator(readout=readout, transition=transition,
                                  attention=attention,
                                  weights_init=IsotropicGaussian(0.1),
                                  biases_init=Constant(0), name="generator")
    generator.push_initialization_config()
    transition.weights_init = Orthogonal()

    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = DataStreamMapping(
            mapping=_transpose,
            data_stream=PaddingDataStream(
                BatchDataStream(
                    iteration_scheme=ConstantScheme(10),
                    data_stream=DataStreamMapping(
                        mapping=reverse_words,
                        add_sources=("targets",),
                        data_stream=DataStreamFilter(
                            predicate=_filter_long,
                            data_stream=dataset.get_default_stream())))))

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = generator.cost(
            targets, targets_mask,
            attended=encoder.apply(
                **dict_union(fork.apply(lookup.lookup(chars),
                                        return_dict=True),
                             mask=chars_mask)),
            attended_mask=chars_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        params = model.get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape)
                         for key, value in params.items()],
                        width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Fetch variables useful for debugging
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        cg = ComputationGraph(cost)
        (energies,) = VariableFilter(
            application=readout.readout, name="output")(cg.variables)
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        (activations,) = VariableFilter(
            application=generator.transition.apply,
            name="states")(cg.variables)
        mean_activation = named_copy(abs(activations).mean(),
                                     "mean_activation")

        # Define the training algorithm.
        algorithm = GradientDescent(
            cost=cost,
            step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

        # More variables for debugging
        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        for name, param in params.items():
            observables.append(named_copy(param.norm(2), name + "_norm"))
            observables.append(named_copy(
                algorithm.gradients[param].norm(2), name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(observables,
                                                    prefix="average",
                                                    every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_every_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                .add_condition("after_batch", _is_nan),
                Plot(os.path.basename(save_path),
                     [[average_monitoring.record_name(cost)],
                      [average_monitoring.record_name(cost_per_character)]],
                     every_n_batches=10),
                SerializeMainLoop(save_path, every_n_batches=500,
                                  save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "test":
        logger.info("Model is loaded")
        chars = tensor.lmatrix("features")
        generated = generator.generate(
            n_steps=3 * chars.shape[0], batch_size=chars.shape[1],
            attended=encoder.apply(
                **dict_union(fork.apply(lookup.lookup(chars),
                                        return_dict=True))),
            attended_mask=tensor.ones(chars.shape))
        model = Model(generated)
        model.set_param_values(load_parameter_values(save_path))
        sample_function = model.get_theano_function()
        logging.info("Sampling function is compiled")

        while True:
            # Python 2-3 compatibility
            line = input("Enter a sentence\n")
            batch_size = int(input("Enter a number of samples\n"))

            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)
            states, samples, glimpses, weights, costs = sample_function(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))

            messages = []
            for i in range(samples.shape[1]):
                sample = list(samples[:, i])
                try:
                    true_length = sample.index(char2code['</S>']) + 1
                except ValueError:
                    true_length = len(sample)
                sample = sample[:true_length]
                cost = costs[:true_length, i].sum()
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
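# All word-reversing examples above assume a character vocabulary
# char2code/code2char with <S>, </S> and <UNK> markers. A hypothetical
# construction (the exact character set is an assumption):
all_chars = ([chr(ord('a') + i) for i in range(26)] + [' '] +
             ['<S>', '</S>', '<UNK>'])
code2char = dict(enumerate(all_chars))
char2code = {char: code for code, char in code2char.items()}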
from blocks.bricks import Tanh, Softmax, Linear, MLP, Identity, Rectifier
from blocks.bricks.lookup import LookupTable

import data
from paramsaveload import SaveLoadParams

config = importlib.import_module('.deepmind_attentive_reader', 'config')

path = os.path.join(os.getenv("DATAPATH"),
                    "deepmind-qa/cnn/questions/training")
vocab_path = os.path.join(os.getenv("DATAPATH"),
                          "deepmind-qa/cnn/stats/training/vocab.txt")

ds, stream = data.setup_datastream(path, vocab_path, config)

model_path = "~/code/deepmind_qa/deepmind_attentive_reader_epoch2step33900.pkl"

m = config.Model(config, ds.vocab_size)
model = Model(m.sgd_cost)

SaveLoadParams(path=model_path, model=model).do_load()
bricks = model.get_top_bricks()
print "brick load completed..."


def get_prediction_function():
    question = tensor.imatrix('question')
    question_mask = tensor.imatrix('question_mask')
    context = tensor.imatrix('context')
    context_mask = tensor.imatrix('context_mask')
    answer = tensor.ivector('answer')
    candidates = tensor.imatrix('candidates')
    candidates_mask = tensor.imatrix('candidates_mask')

    """
    question = question.dimshuffle(1, 0)
    question_mask = question_mask.dimshuffle(1, 0)
    context = context.dimshuffle(1, 0)