class NoiseExtension(SimpleExtension, RNGMixin):
    def __init__(self, noise_parameters=None, **kwargs):
        kwargs.setdefault("before_training", True)
        kwargs.setdefault("after_training", True)
        self.noise_parameters = noise_parameters
        std = 1.0
        self.noise_init = IsotropicGaussian(std=std)
        theano_seed = self.rng.randint(np.iinfo(np.int32).max)
        self.theano_generator = UnitNoiseGenerator(
            std=std, theano_seed=theano_seed)
        self.noise_updates = OrderedDict(
            [(param, self.theano_generator.apply(param))
             for param in self.noise_parameters])
        super(NoiseExtension, self).__init__(**kwargs)

    def do(self, callback_name, *args):
        self.parse_args(callback_name, args)
        if callback_name == 'before_training':
            # Before training, initialize the noise parameters ...
            for p in self.noise_parameters:
                self.noise_init.initialize(p, self.rng)
            # ... and register an update that resamples the noise on every step.
            self.main_loop.algorithm.add_updates(self.noise_updates)
        if callback_name == 'after_training':
            # After training, zero the noise again.
            for p in self.noise_parameters:
                v = p.get_value()
                p.set_value(np.zeros(v.shape, dtype=v.dtype))
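# A minimal wiring sketch (assumption: `model`, `algorithm`, `data_stream`, and
# the list `noise_params` of shared noise variables already exist, and the
# MainLoop is built as in the training scripts further down). It only shows
# where NoiseExtension would plug in; it is not the author's full setup.
main_loop = MainLoop(
    model=model,
    data_stream=data_stream,
    algorithm=algorithm,
    extensions=[
        NoiseExtension(noise_parameters=noise_params),
        Printing(every_n_batches=100),
    ])
main_loop.run()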
class CustomLSTMWeights(NdarrayInitialization):
    # Identity on the diagonal blocks and IsotropicGaussian everywhere else.
    def __init__(self, std=1, mean=0):
        self.gaussian_init = IsotropicGaussian(std=std, mean=mean)
        self.identity = Identity()

    def generate(self, rng, shape):
        if len(shape) != 2:
            raise ValueError("CustomLSTMWeights expects a 2-D shape")
        assert shape[0] == shape[1]
        size = shape[0] // 4
        assert size * 4 == shape[0]
        rows = []
        for i in range(4):
            row = []
            for j in range(4):
                if i == j:
                    # Identity blocks on the diagonal of the 4x4 block grid.
                    square = self.identity.generate(rng, (size, size))
                else:
                    square = self.gaussian_init.generate(rng, (size, size))
                row.append(square)
            rows.append(numpy.hstack(row))
        result = numpy.vstack(rows)
        return result.astype(theano.config.floatX)
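# A quick sanity check (assumption: purely illustrative, not part of the
# original training code). generate() requires a square matrix whose side is
# a multiple of four, so an 8x8 request yields a 4x4 grid of 2x2 blocks with
# identity blocks on the diagonal and small Gaussian noise elsewhere.
rng = numpy.random.RandomState(0)
W = CustomLSTMWeights(std=0.01).generate(rng, (8, 8))
print(W.shape)      # (8, 8)
print(W[:2, :2])    # ~identity block
print(W[:2, 2:4])   # ~N(0, 0.01) block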
gruFac = PyCheckFactorGruOp()

if __name__ == "__main__":
    theano.config.optimizer = 'None'
    import numpy as np
    from blocks.initialization import IsotropicGaussian, Constant

    x = tensor.tensor3("inp_variable")
    #x = tensor.tensor3("inp_variable")
    n_hid = 512
    n_in = 512
    np.random.seed(1)
    rng = np.random
    init = IsotropicGaussian(0.02)
    #init = Constant(0.00)

    inp_to_state = shared_floatx_zeros((n_in, n_hid))
    init.initialize(inp_to_state, rng)
    inp_to_update = shared_floatx_zeros((n_in, n_hid))
    init.initialize(inp_to_update, rng)
    inp_to_reset = shared_floatx_zeros((n_in, n_hid))
    init.initialize(inp_to_reset, rng)

    inp_to_state_b = shared_floatx_zeros((n_hid,))
    init.initialize(inp_to_state_b, rng)
    inp_to_update_b = shared_floatx_zeros((n_hid,))
    init.initialize(inp_to_update_b, rng)
    inp_to_reset_b = shared_floatx_zeros((n_hid,))
    init.initialize(inp_to_reset_b, rng)
def main(mode, save_path, num_batches, data_path=None): # Experiment configuration dimension = 100 readout_dimension = len(char2code) # Build bricks encoder = Bidirectional(SimpleRecurrent(dim=dimension, activation=Tanh()), weights_init=Orthogonal()) fork = Fork( [name for name in encoder.prototype.apply.sequences if name != 'mask'], weights_init=IsotropicGaussian(0.1), biases_init=Constant(0)) fork.input_dim = dimension fork.output_dims = {name: dimension for name in fork.input_names} lookup = LookupTable(readout_dimension, dimension, weights_init=IsotropicGaussian(0.1)) transition = SimpleRecurrent(activation=Tanh(), dim=dimension, name="transition") attention = SequenceContentAttention(state_names=transition.apply.states, sequence_dim=2 * dimension, match_dim=dimension, name="attention") readout = LinearReadout(readout_dim=readout_dimension, source_names=["states"], emitter=SoftmaxEmitter(name="emitter"), feedbacker=LookupFeedback(readout_dimension, dimension), name="readout") generator = SequenceGenerator(readout=readout, transition=transition, attention=attention, weights_init=IsotropicGaussian(0.1), biases_init=Constant(0), name="generator") generator.push_initialization_config() transition.weights_init = Orthogonal() if mode == "train": # Data processing pipeline dataset_options = dict(dictionary=char2code, level="character", preprocess=_lower) if data_path: dataset = TextFile(data_path, **dataset_options) else: dataset = OneBillionWord("training", [99], **dataset_options) data_stream = DataStreamMapping( mapping=_transpose, data_stream=PaddingDataStream( BatchDataStream( iteration_scheme=ConstantScheme(10), data_stream=DataStreamMapping( mapping=reverse_words, add_sources=("targets", ), data_stream=DataStreamFilter( predicate=_filter_long, data_stream=dataset.get_default_stream()))))) # Build the cost computation graph chars = tensor.lmatrix("features") chars_mask = tensor.matrix("features_mask") targets = tensor.lmatrix("targets") targets_mask = tensor.matrix("targets_mask") batch_cost = generator.cost( targets, targets_mask, attended=encoder.apply(**dict_union(fork.apply( lookup.lookup(chars), return_dict=True), mask=chars_mask)), attended_mask=chars_mask).sum() batch_size = named_copy(chars.shape[1], "batch_size") cost = aggregation.mean(batch_cost, batch_size) cost.name = "sequence_log_likelihood" logger.info("Cost graph is built") # Give an idea of what's going on model = Model(cost) params = model.get_params() logger.info("Parameters:\n" + pprint.pformat([(key, value.get_value().shape) for key, value in params.items()], width=120)) # Initialize parameters for brick in model.get_top_bricks(): brick.initialize() # Fetch variables useful for debugging max_length = named_copy(chars.shape[0], "max_length") cost_per_character = named_copy( aggregation.mean(batch_cost, batch_size * max_length), "character_log_likelihood") cg = ComputationGraph(cost) (energies, ) = VariableFilter(application=readout.readout, name="output")(cg.variables) min_energy = named_copy(energies.min(), "min_energy") max_energy = named_copy(energies.max(), "max_energy") (activations, ) = VariableFilter( application=generator.transition.apply, name="states")(cg.variables) mean_activation = named_copy( abs(activations).mean(), "mean_activation") # Define the training algorithm. 
algorithm = GradientDescent(cost=cost, step_rule=CompositeRule( [StepClipping(10.0), Scale(0.01)])) # More variables for debugging observables = [ cost, min_energy, max_energy, mean_activation, batch_size, max_length, cost_per_character, algorithm.total_step_norm, algorithm.total_gradient_norm ] for name, param in params.items(): observables.append(named_copy(param.norm(2), name + "_norm")) observables.append( named_copy(algorithm.gradients[param].norm(2), name + "_grad_norm")) # Construct the main loop and start training! average_monitoring = TrainingDataMonitoring(observables, prefix="average", every_n_batches=10) main_loop = MainLoop( model=model, data_stream=data_stream, algorithm=algorithm, extensions=[ Timing(), TrainingDataMonitoring(observables, after_every_batch=True), average_monitoring, FinishAfter(after_n_batches=num_batches).add_condition( "after_batch", _is_nan), Plot(os.path.basename(save_path), [[average_monitoring.record_name(cost)], [average_monitoring.record_name(cost_per_character)]], every_n_batches=10), SerializeMainLoop(save_path, every_n_batches=500, save_separately=["model", "log"]), Printing(every_n_batches=1) ]) main_loop.run() elif mode == "test": logger.info("Model is loaded") chars = tensor.lmatrix("features") generated = generator.generate( n_steps=3 * chars.shape[0], batch_size=chars.shape[1], attended=encoder.apply(**dict_union( fork.apply(lookup.lookup(chars), return_dict=True))), attended_mask=tensor.ones(chars.shape)) model = Model(generated) model.set_param_values(load_parameter_values(save_path)) sample_function = model.get_theano_function() logging.info("Sampling function is compiled") while True: # Python 2-3 compatibility line = input("Enter a sentence\n") batch_size = int(input("Enter a number of samples\n")) encoded_input = [ char2code.get(char, char2code["<UNK>"]) for char in line.lower().strip() ] encoded_input = ([char2code['<S>']] + encoded_input + [char2code['</S>']]) print("Encoder input:", encoded_input) target = reverse_words((encoded_input, ))[0] print("Target: ", target) states, samples, glimpses, weights, costs = sample_function( numpy.repeat(numpy.array(encoded_input)[:, None], batch_size, axis=1)) messages = [] for i in range(samples.shape[1]): sample = list(samples[:, i]) try: true_length = sample.index(char2code['</S>']) + 1 except ValueError: true_length = len(sample) sample = sample[:true_length] cost = costs[:true_length, i].sum() message = "({})".format(cost) message += "".join(code2char[code] for code in sample) if sample == target: message += " CORRECT!" messages.append((cost, message)) messages.sort(key=operator.itemgetter(0), reverse=True) for _, message in messages: print(message)
# The step rule (uncomment your favorite choice)
step_rule = CompositeRule([AdaDelta(), RemoveNotFinite()])
#step_rule = AdaDelta()
#step_rule = CompositeRule([Momentum(learning_rate=0.00001, momentum=0.99), RemoveNotFinite()])
#step_rule = CompositeRule([Momentum(learning_rate=0.01, momentum=0.9), RemoveNotFinite()])
#step_rule = CompositeRule([AdaDelta(), Scale(0.01), RemoveNotFinite()])
#step_rule = CompositeRule([RMSProp(learning_rate=0.1, decay_rate=0.95),
#                           RemoveNotFinite()])
#step_rule = CompositeRule([RMSProp(learning_rate=0.001, decay_rate=0.95),
#                           BasicMomentum(momentum=0.9),
#                           RemoveNotFinite()])
#step_rule = Adam()

# How the weights are initialized
weights_init = IsotropicGaussian(0.01)
biases_init = Constant(0.001)

# ==========================================================================
# THE MODEL
# ==========================================================================
print('Building model ...')

bricks = []
dropout_locs = []

# THEANO INPUT VARIABLES
eeg = tensor.tensor3('eeg')      # batch x time x feature
acc = tensor.tensor3('acc')      # batch x time x feature
label = tensor.lvector('label')  # batch
import data
from model.memory_network_mlp import Model, Stream

n_begin_end_pts = 5

dim_embeddings = [
    ('origin_call', data.origin_call_train_size, 10),
    ('origin_stand', data.stands_size, 10),
    ('week_of_year', 52, 10),
    ('day_of_week', 7, 10),
    ('qhour_of_day', 24 * 4, 10),
    ('day_type', 3, 10),
]

embed_weights_init = IsotropicGaussian(0.001)


class MLPConfig(object):
    __slots__ = ('dim_input', 'dim_hidden', 'dim_output', 'weights_init',
                 'biases_init', 'embed_weights_init', 'dim_embeddings')


prefix_encoder = MLPConfig()
prefix_encoder.dim_input = n_begin_end_pts * 2 * 2 + sum(
    x for (_, _, x) in dim_embeddings)
prefix_encoder.dim_hidden = [500]
prefix_encoder.weights_init = IsotropicGaussian(0.01)
prefix_encoder.biases_init = Constant(0.001)
prefix_encoder.embed_weights_init = embed_weights_init
prefix_encoder.dim_embeddings = dim_embeddings
def __init__(self, config, vocab_size): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') context_bag = tensor.eq(context[:, :, None], tensor.arange(vocab_size)).sum(axis=1).clip( 0, 1) bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='question_embed') embed.weights_init = IsotropicGaussian(0.01) #embeddings_initial_value = init_embedding_table(filename='embeddings/vocab_embeddings.txt') #embed.weights_init = Constant(embeddings_initial_value) # Calculate question encoding (concatenate layer1) qembed = embed.apply(question) qlstms, qhidden_list = make_bidir_lstm_stack( qembed, config.embed_size, question_mask.astype(theano.config.floatX), config.question_lstm_size, config.question_skip_connections, 'q') bricks = bricks + qlstms if config.question_skip_connections: qenc_dim = 2 * sum(config.question_lstm_size) qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list], axis=1) else: qenc_dim = 2 * config.question_lstm_size[-1] qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]], axis=1) qenc.name = 'qenc' # Calculate context encoding (concatenate layer1) cembed = embed.apply(context) cqembed = tensor.concatenate([ cembed, tensor.extra_ops.repeat(qenc[None, :, :], cembed.shape[0], axis=0) ], axis=2) clstms, chidden_list = make_bidir_lstm_stack( cqembed, config.embed_size + qenc_dim, context_mask.astype(theano.config.floatX), config.ctx_lstm_size, config.ctx_skip_connections, 'ctx') bricks = bricks + clstms if config.ctx_skip_connections: cenc_dim = 2 * sum(config.ctx_lstm_size) #2 : fw & bw cenc = tensor.concatenate(chidden_list, axis=2) else: cenc_dim = 2 * config.question_lstm_size[-1] cenc = tensor.concatenate(chidden_list[-2:], axis=2) cenc.name = 'cenc' # Build the encoder bricks transition = GatedRecurrent(activation=Tanh(), dim=config.generator_lstm_size, name="transition") attention = SequenceContentAttention( state_names=transition.apply.states, attended_dim=cenc_dim, match_dim=config.generator_lstm_size, name="attention") readout = Readout(readout_dim=vocab_size, source_names=[ transition.apply.states[0], attention.take_glimpses.outputs[0] ], emitter=MaskedSoftmaxEmitter(context_bag=context_bag, name='emitter'), feedback_brick=LookupFeedback( vocab_size, config.feedback_size), name="readout") generator = SequenceGenerator(readout=readout, transition=transition, attention=attention, name="generator") cost = generator.cost(answer, answer_mask.astype(theano.config.floatX), attended=cenc, attended_mask=context_mask.astype( theano.config.floatX), name="cost") self.predictions = generator.generate( n_steps=7, batch_size=config.batch_size, attended=cenc, attended_mask=context_mask.astype(theano.config.floatX), iterate=True)[1] # Apply dropout cg = ComputationGraph([cost]) if config.w_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.w_noise) if config.dropout > 0: cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout) [cost_reg] = cg.outputs # Other stuff cost.name = 'cost' cost_reg.name = 'cost_reg' self.sgd_cost = cost_reg 
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]

        # initialize new stuff manually (change!)
        generator.weights_init = IsotropicGaussian(0.01)
        generator.biases_init = Constant(0)
        generator.push_allocation_config()
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
def main(mode, config, use_bokeh=False): # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) topical_transformer = topicalq_transformer( config['source_topic_vocab_size'], config['topical_embedding_dim'], config['enc_nhids'], config['topical_word_num'], config['batch_size']) decoder = Decoder(vocab_size=config['trg_vocab_size'], topicWord_size=config['trg_topic_vocab_size'], embedding_dim=config['dec_embed'], topical_dim=config['topical_embedding_dim'], state_dim=config['dec_nhids'], representation_dim=config['enc_nhids'] * 2, match_function=config['match_function'], use_doubly_stochastic=config['use_doubly_stochastic'], lambda_ds=config['lambda_ds'], use_local_attention=config['use_local_attention'], window_size=config['window_size'], use_step_decay_cost=config['use_step_decay_cost'], use_concentration_cost=config['use_concentration_cost'], lambda_ct=config['lambda_ct'], use_stablilizer=config['use_stablilizer'], lambda_st=config['lambda_st']) # here attended dim (representation_dim) of decoder is 2*enc_nhinds # because the context given by the encoder is a bidirectional context if mode == "train": # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') target_topic_sentence = tensor.lmatrix('target_topic') target_topic_binary_sentence = tensor.lmatrix('target_binary_topic') #target_topic_sentence_mask=tensor.lmatrix('target_topic_mask'); sampling_input = tensor.lmatrix('input') source_topical_word = tensor.lmatrix('source_topical') source_topical_mask = tensor.matrix('source_topical_mask') topic_embedding = topical_transformer.apply(source_topical_word) # Get training and development set streams tr_stream = get_tr_stream_with_topic_target(**config) #dev_stream = get_dev_tr_stream_with_topic_target(**config) # Get cost of the model representations = encoder.apply(source_sentence, source_sentence_mask) tw_representation = topical_transformer.look_up.apply( source_topical_word.T) content_embedding = representations[0, :, (representations.shape[2] / 2):] cost = decoder.cost(representations, source_sentence_mask, tw_representation, source_topical_mask, target_sentence, target_sentence_mask, target_topic_sentence, target_topic_binary_sentence, topic_embedding, content_embedding) logger.info('Creating computational graph') perplexity = tensor.exp(cost) perplexity.name = 'perplexity' cg = ComputationGraph(cost) costs_computer = function([ target_sentence, target_sentence_mask, source_sentence, source_sentence_mask, source_topical_word, target_topic_sentence, target_topic_binary_sentence ], (perplexity), on_unused_input='ignore') # Initialize model logger.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() topical_transformer.weights_init = IsotropicGaussian( config['weight_scale']) topical_transformer.biases_init = Constant(0) topical_transformer.push_allocation_config() #don't know whether the initialize is for topical_transformer.look_up.weights_init = 
Orthogonal() topical_transformer.transformer.weights_init = Orthogonal() topical_transformer.initialize() word_topical_embedding = cPickle.load( open(config['topical_embeddings'], 'rb')) np_word_topical_embedding = numpy.array(word_topical_embedding, dtype='float32') topical_transformer.look_up.W.set_value(np_word_topical_embedding) topical_transformer.look_up.W.tag.role = [] # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog logger.info('Applying dropout') dropout_inputs = [ x for x in cg.intermediary_variables if x.name == 'maxout_apply_output' ] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # Apply weight noise for regularization if config['weight_noise_ff'] > 0.0: logger.info('Applying weight noise to ff layers') enc_params = Selector(encoder.lookup).get_params().values() enc_params += Selector(encoder.fwd_fork).get_params().values() enc_params += Selector(encoder.back_fork).get_params().values() dec_params = Selector( decoder.sequence_generator.readout).get_params().values() dec_params += Selector( decoder.sequence_generator.fork).get_params().values() dec_params += Selector(decoder.state_init).get_params().values() cg = apply_noise(cg, enc_params + dec_params, config['weight_noise_ff']) # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge( Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.items(): logger.info(' {:15}: {}'.format(value.get_value().shape, name)) logger.info("Total number of parameters: {}".format( len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([perplexity], after_batch=True), CheckpointNMT(config['saveto'], config['model_name'], every_n_batches=config['save_freq']) ] # # Set up beam search and sampling computation graphs if necessary # if config['hook_samples'] >= 1 or config['bleu_script'] is not None: # logger.info("Building sampling model") # sampling_representation = encoder.apply( # sampling_input, tensor.ones(sampling_input.shape)) # generated = decoder.generate( # sampling_input, sampling_representation) # search_model = Model(generated) # _, samples = VariableFilter( # bricks=[decoder.sequence_generator], name="outputs")( # ComputationGraph(generated[1])) # # # Add sampling # if config['hook_samples'] >= 1: # logger.info("Building sampler") # extensions.append( # Sampler(model=search_model, data_stream=tr_stream, # model_name=config['model_name'], # hook_samples=config['hook_samples'], # every_n_batches=config['sampling_freq'], # src_vocab_size=config['src_vocab_size'])) # # # Add early stopping based on bleu # if False: # logger.info("Building bleu validator") # extensions.append( # BleuValidator(sampling_input, samples=samples, config=config, # model=search_model, data_stream=dev_stream, # normalize=config['normalized_bleu'], # every_n_batches=config['bleu_val_freq'], # n_best=3, # track_n_models=6)) # # logger.info("Building perplexity validator") # extensions.append( # pplValidation( config=config, # 
model=costs_computer, data_stream=dev_stream, # model_name=config['model_name'], # every_n_batches=config['sampling_freq'])) # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( Plot('Cs-En', channels=[['decoder_cost_cost']], after_batch=True)) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) initial_learning_rate = config['initial_learning_rate'] log_path = os.path.join(config['saveto'], 'log') if config['reload'] and os.path.exists(log_path): with open(log_path, 'rb') as source: log = cPickle.load(source) last = max(log.keys()) - 1 if 'learning_rate' in log[last]: initial_learning_rate = log[last]['learning_rate'] # Set up training algorithm logger.info("Initializing training algorithm") algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule([ Scale(initial_learning_rate), StepClipping(config['step_clipping']), eval(config['step_rule'])() ]), on_unused_sources='ignore') _learning_rate = algorithm.step_rule.components[0].learning_rate if config['learning_rate_decay']: extensions.append( LearningRateHalver(record_name='validation_cost', comparator=lambda x, y: x > y, learning_rate=_learning_rate, patience_default=3)) else: extensions.append(OldModelRemover(saveto=config['saveto'])) if config['learning_rate_grow']: extensions.append( LearningRateDoubler(record_name='validation_cost', comparator=lambda x, y: x < y, learning_rate=_learning_rate, patience_default=3)) extensions.append( SimplePrinting(config['model_name'], after_batch=True)) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions) # Train! main_loop.run() elif mode == 'translate': logger.info('Creating theano variables') sampling_input = tensor.lmatrix('source') source_topical_word = tensor.lmatrix('source_topical') tw_vocab_overlap = tensor.lmatrix('tw_vocab_overlap') tw_vocab_overlap_matrix = cPickle.load( open(config['tw_vocab_overlap'], 'rb')) tw_vocab_overlap_matrix = numpy.array(tw_vocab_overlap_matrix, dtype='int32') #tw_vocab_overlap=shared(tw_vocab_overlap_matrix); topic_embedding = topical_transformer.apply(source_topical_word) sutils = SamplingBase() unk_idx = config['unk_id'] src_eos_idx = config['src_vocab_size'] - 1 trg_eos_idx = config['trg_vocab_size'] - 1 trg_vocab = _ensure_special_tokens(cPickle.load( open(config['trg_vocab'], 'rb')), bos_idx=0, eos_idx=trg_eos_idx, unk_idx=unk_idx) trg_ivocab = {v: k for k, v in trg_vocab.items()} logger.info("Building sampling model") sampling_representation = encoder.apply( sampling_input, tensor.ones(sampling_input.shape)) topic_embedding = topical_transformer.apply(source_topical_word) tw_representation = topical_transformer.look_up.apply( source_topical_word.T) content_embedding = sampling_representation[0, :, ( sampling_representation.shape[2] / 2):] generated = decoder.generate(sampling_input, sampling_representation, tw_representation, topical_embedding=topic_embedding, content_embedding=content_embedding) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs beam_search = BeamSearch(samples=samples) logger.info("Loading the model..") model = Model(generated) #loader = LoadNMT(config['saveto']) loader = LoadNMT(config['validation_load']) loader.set_model_parameters(model, loader.load_parameters_default()) logger.info("Started translation: ") test_stream 
= get_dev_stream_with_topicalq(**config) ts = test_stream.get_epoch_iterator() rts = open(config['val_set_source']).readlines() ftrans_original = open(config['val_output_orig'], 'w') saved_weights = [] total_cost = 0.0 pbar = ProgressBar(max_value=len(rts)).start() for i, (line, line_raw) in enumerate(zip(ts, rts)): trans_in = line_raw.split() seq = sutils._oov_to_unk(line[0], config['src_vocab_size'], unk_idx) seq1 = line[1] input_topical = numpy.tile(seq1, (config['beam_size'], 1)) input_ = numpy.tile(seq, (config['beam_size'], 1)) # draw sample, checking to ensure we don't get an empty string back trans, costs, attendeds, weights = \ beam_search.search( input_values={sampling_input: input_,source_topical_word:input_topical,tw_vocab_overlap:tw_vocab_overlap_matrix}, tw_vocab_overlap=tw_vocab_overlap_matrix, max_length=3*len(seq), eol_symbol=trg_eos_idx, ignore_first_eol=True) # normalize costs according to the sequence lengths if config['normalized_bleu']: lengths = numpy.array([len(s) for s in trans]) costs = costs / lengths best = numpy.argsort(costs)[0] try: total_cost += costs[best] trans_out = trans[best] weight = weights[best][:, :len(trans_in)] trans_out = sutils._idx_to_word(trans_out, trg_ivocab) except ValueError: logger.info( "Can NOT find a translation for line: {}".format(i + 1)) trans_out = '<UNK>' saved_weights.append(weight) print(' '.join(trans_out), file=ftrans_original) pbar.update(i + 1) pbar.finish() logger.info("Total cost of the test: {}".format(total_cost)) cPickle.dump(saved_weights, open(config['attention_weights'], 'wb')) ftrans_original.close() # ap = afterprocesser(config) # ap.main() elif mode == 'score': logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') target_topic_sentence = tensor.lmatrix('target_topic') target_topic_binary_sentence = tensor.lmatrix('target_binary_topic') source_topical_word = tensor.lmatrix('source_topical') topic_embedding = topical_transformer.apply(source_topical_word) # Get cost of the model representations = encoder.apply(source_sentence, source_sentence_mask) costs = decoder.cost(representations, source_sentence_mask, target_sentence, target_sentence_mask, target_topic_sentence, target_topic_binary_sentence, topic_embedding) config['batch_size'] = 1 config['sort_k_batches'] = 1 # Get test set stream test_stream = get_tr_stream_with_topic_target(**config) logger.info("Building sampling model") logger.info("Loading the model..") model = Model(costs) loader = LoadNMT(config['validation_load']) loader.set_model_parameters(model, loader.load_parameters_default()) costs_computer = function([ target_sentence, target_sentence_mask, source_sentence, source_sentence_mask, source_topical_word, target_topic_sentence, target_topic_binary_sentence ], (costs), on_unused_input='ignore') iterator = test_stream.get_epoch_iterator() scores = [] att_weights = [] for i, (src, src_mask, trg, trg_mask, te, te_mask, tt, tt_mask, tb, tb_mask) in enumerate(iterator): costs = costs_computer(*[trg, trg_mask, src, src_mask, te, tt, tb]) cost = costs.sum() print(i, cost) scores.append(cost) print(sum(scores) / 10007)
    attended_dim=100,  # or is it 68
    match_dim=30,
    name="attention")

readout = Readout(readout_dim=readout_size,
                  source_names=source_names +
                  [attention.take_glimpses.outputs[0]],
                  emitter=emitter,
                  name="readout")

generator = SequenceGenerator(readout=readout,
                              attention=attention,
                              transition=transition,
                              name="generator")
generator.weights_init = IsotropicGaussian(0.01)
generator.biases_init = Constant(0.001)
generator.push_initialization_config()

lookup.weights_init = IsotropicGaussian(0.01)
lookup.biases_init = Constant(0.001)
lookup.initialize()

#generator.transition.weights_init = initialization.Identity(0.98)
#generator.transition.biases_init = IsotropicGaussian(0.01,0.9)
generator.transition.push_initialization_config()
generator.initialize()

cost_matrix = generator.cost_matrix(x, x_mask, attended=embed,
def test_rldraw_classic(step_type='add', use_pol=True): ########################################### # Make a tag for identifying result files # ########################################### pol_tag = "yp" if use_pol else "np" res_tag = "TRLD_SPLIT_E002_{}_{}".format(step_type, pol_tag) ########################## # Get some training data # ########################## rng = np.random.RandomState(1234) Xtr, Xva, Xte = load_binarized_mnist(data_path='./data/') Xtr = np.vstack((Xtr, Xva)) Xva = Xte #del Xte tr_samples = Xtr.shape[0] va_samples = Xva.shape[0] batch_size = 200 ############################################################ # Setup some parameters for the Iterative Refinement Model # ############################################################ x_dim = Xtr.shape[1] write_dim = 500 rnn_dim = 500 z_dim = 100 n_iter = 20 rnninits = { 'weights_init': IsotropicGaussian(0.01), 'biases_init': Constant(0.), } inits = { 'weights_init': IsotropicGaussian(0.01), 'biases_init': Constant(0.), } # setup reader/writer models read_dim = 2*x_dim reader_mlp = Reader(x_dim=x_dim, dec_dim=rnn_dim, **inits) writer_mlp = MLP([None, None], [rnn_dim, write_dim, x_dim], name="writer_mlp", **inits) # setup submodels for processing LSTM inputs pol_mlp_in = MLP([Identity()], [rnn_dim, 4*rnn_dim], name="pol_mlp_in", **inits) var_mlp_in = MLP([Identity()], [(x_dim + rnn_dim), 4*rnn_dim], name="var_mlp_in", **inits) ent_mlp_in = MLP([Identity()], [(x_dim + rnn_dim), 4*rnn_dim], name="ent_mlp_in", **inits) dec_mlp_in = MLP([Identity()], [z_dim, 4*rnn_dim], name="dec_mlp_in", **inits) # setup submodels for turning LSTM states into conditionals over z pol_mlp_out = CondNet([], [rnn_dim, z_dim], name="pol_mlp_out", **inits) var_mlp_out = CondNet([], [rnn_dim, z_dim], name="var_mlp_out", **inits) ent_mlp_out = CondNet([], [rnn_dim, z_dim], name="ent_mlp_out", **inits) dec_mlp_out = CondNet([], [rnn_dim, z_dim], name="dec_mlp_out", **inits) # setup the LSTMs for primary policy, guide policy, and shared dynamics pol_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \ name="pol_rnn", **rnninits) var_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \ name="var_rnn", **rnninits) ent_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \ name="ent_rnn", **rnninits) dec_rnn = BiasedLSTM(dim=rnn_dim, ig_bias=2.0, fg_bias=2.0, \ name="dec_rnn", **rnninits) draw = RLDrawModel( n_iter, step_type=step_type, # step_type can be 'add' or 'jump' use_pol=use_pol, reader_mlp=reader_mlp, writer_mlp=writer_mlp, pol_mlp_in=pol_mlp_in, pol_mlp_out=pol_mlp_out, pol_rnn=pol_rnn, var_mlp_in=var_mlp_in, var_mlp_out=var_mlp_out, var_rnn=var_rnn, dec_mlp_in=dec_mlp_in, dec_mlp_out=dec_mlp_out, dec_rnn=dec_rnn, ent_mlp_in=ent_mlp_in, ent_mlp_out=ent_mlp_out, ent_rnn=ent_rnn) draw.initialize() compile_start_time = time.time() # build the cost gradients, training function, samplers, etc. 
draw.build_sampling_funcs() print("Testing model sampler...") # draw some independent samples from the model samples = draw.sample_model(Xtr[:65,:], sample_source='p') n_iter, N, D = samples.shape samples = samples.reshape( (n_iter, N, 28, 28) ) for j in xrange(n_iter): img = img_grid(samples[j,:,:,:]) img.save("%s_samples_%03d.png" % (res_tag, j)) draw.build_model_funcs() compile_end_time = time.time() compile_minutes = (compile_end_time - compile_start_time) / 60.0 print("THEANO COMPILE TIME (MIN): {}".format(compile_minutes)) ################################################################ # Apply some updates, to check that they aren't totally broken # ################################################################ print("Beginning to train the model...") out_file = open("{}_results.txt".format(res_tag), 'wb') costs = [0. for i in range(10)] learn_rate = 0.00015 momentum = 0.9 batch_idx = np.arange(batch_size) + tr_samples for i in range(300000): scale = min(1.0, ((i+1) / 5000.0)) if (((i + 1) % 10000) == 0): learn_rate = learn_rate * 0.95 # get the indices of training samples for this batch update batch_idx += batch_size if (np.max(batch_idx) >= tr_samples): # we finished an "epoch", so we rejumble the training set Xtr = row_shuffle(Xtr) batch_idx = np.arange(batch_size) # set sgd and objective function hyperparams for this update draw.set_sgd_params(lr=scale*learn_rate, mom_1=scale*momentum, mom_2=0.98) draw.set_lam_kld(lam_kld_q2p=1.0, lam_kld_p2q=0.0, lam_neg_ent=0.02) draw.set_grad_noise(grad_noise=0.02) # perform a minibatch update and record the cost for this batch Xb = to_fX(Xtr.take(batch_idx, axis=0)) result = draw.train_joint(Xb) costs = [(costs[j] + result[j]) for j in range(len(result))] # diagnostics if ((i % 250) == 0): costs = [(v / 250.0) for v in costs] str1 = "-- batch {0:d} --".format(i) str2 = " total_cost: {0:.4f}".format(costs[0]) str3 = " nll_bound : {0:.4f}".format(costs[1]) str4 = " nll_term : {0:.4f}".format(costs[2]) str5 = " kld_q2p : {0:.4f}".format(costs[3]) str6 = " kld_p2q : {0:.4f}".format(costs[4]) str7 = " neg_ent : {0:.4f}".format(costs[5]) str8 = " reg_term : {0:.4f}".format(costs[6]) joint_str = "\n".join([str1, str2, str3, str4, str5, str6, str7, str8]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() costs = [0.0 for v in costs] if ((i % 1000) == 0): draw.save_model_params("{}_params.pkl".format(res_tag)) # compute a small-sample estimate of NLL bound on validation set Xva = row_shuffle(Xva) Xb = to_fX(Xva[:5000]) va_costs = draw.compute_nll_bound(Xb) str1 = " va_nll_bound : {}".format(va_costs[1]) str2 = " va_nll_term : {}".format(va_costs[2]) str3 = " va_kld_q2p : {}".format(va_costs[3]) str4 = " va_neg_ent : {}".format(va_costs[5]) joint_str = "\n".join([str1, str2, str3, str4]) print(joint_str) out_file.write(joint_str+"\n") out_file.flush() # draw some independent samples from the model samples = draw.sample_model(Xb[:256,:], sample_source='p') n_iter, N, D = samples.shape samples = samples.reshape( (n_iter, N, 28, 28) ) for j in xrange(n_iter): img = img_grid(samples[j,:,:,:]) img.save("%s_samples_%03d.png" % (res_tag, j))
def main(name, epochs, batch_size, learning_rate): if name is None: name = "att-rw" print("\nRunning experiment %s" % name) print(" learning rate: %5.3f" % learning_rate) print() #------------------------------------------------------------------------ img_height, img_width = 28, 28 read_N = 12 write_N = 14 inits = { #'weights_init': Orthogonal(), 'weights_init': IsotropicGaussian(0.001), 'biases_init': Constant(0.), } x_dim = img_height * img_width reader = ZoomableAttentionWindow(img_height, img_width, read_N) writer = ZoomableAttentionWindow(img_height, img_width, write_N) # Parameterize the attention reader and writer mlpr = MLP(activations=[Tanh(), Identity()], dims=[x_dim, 50, 5], name="RMLP", **inits) mlpw = MLP(activations=[Tanh(), Identity()], dims=[x_dim, 50, 5], name="WMLP", **inits) # MLP between the reader and writer mlp = MLP(activations=[Tanh(), Identity()], dims=[read_N**2, 300, write_N**2], name="MLP", **inits) for brick in [mlpr, mlpw, mlp]: brick.allocate() brick.initialize() #------------------------------------------------------------------------ x = tensor.matrix('features') hr = mlpr.apply(x) hw = mlpw.apply(x) center_y, center_x, delta, sigma, gamma = reader.nn2att(hr) r = reader.read(x, center_y, center_x, delta, sigma) h = mlp.apply(r) center_y, center_x, delta, sigma, gamma = writer.nn2att(hw) c = writer.write(h, center_y, center_x, delta, sigma) / gamma x_recons = T.nnet.sigmoid(c) cost = BinaryCrossEntropy().apply(x, x_recons) cost.name = "cost" #------------------------------------------------------------ cg = ComputationGraph([cost]) params = VariableFilter(roles=[PARAMETER])(cg.variables) algorithm = GradientDescent( cost=cost, params=params, step_rule=CompositeRule([ RemoveNotFinite(), Adam(learning_rate), StepClipping(3.), ]) #step_rule=RMSProp(learning_rate), #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95) ) #------------------------------------------------------------------------ # Setup monitors monitors = [cost] #for v in [center_y, center_x, log_delta, log_sigma, log_gamma]: # v_mean = v.mean() # v_mean.name = v.name # monitors += [v_mean] # monitors += [aggregation.mean(v)] train_monitors = monitors[:] train_monitors += [aggregation.mean(algorithm.total_gradient_norm)] train_monitors += [aggregation.mean(algorithm.total_step_norm)] # Live plotting... plot_channels = [ ["cost"], ] #------------------------------------------------------------ mnist_train = BinarizedMNIST("train", sources=['features']) mnist_test = BinarizedMNIST("test", sources=['features']) #mnist_train = MNIST("train", binary=True, sources=['features']) #mnist_test = MNIST("test", binary=True, sources=['features']) main_loop = MainLoop( model=Model(cost), data_stream=ForceFloatX( DataStream(mnist_train, iteration_scheme=SequentialScheme( mnist_train.num_examples, batch_size))), algorithm=algorithm, extensions=[ Timing(), FinishAfter(after_n_epochs=epochs), DataStreamMonitoring( monitors, ForceFloatX( DataStream(mnist_test, iteration_scheme=SequentialScheme( mnist_test.num_examples, batch_size))), prefix="test"), TrainingDataMonitoring(train_monitors, prefix="train", after_every_epoch=True), SerializeMainLoop(name + ".pkl"), #Plot(name, channels=plot_channels), ProgressBar(), Printing() ]) main_loop.run()
def build_theano_functions(self): x = T.ftensor3('x') # shape of input : batch X time X value y = T.ftensor3('y') z = T.ftensor3('z') layers_input = [x] dims = np.array([self.input_dim]) for dim in self.lstm_layers_dim: dims = np.append(dims, dim) print "Dimensions =", dims # layer is just an index of the layer for layer in range(len(self.lstm_layers_dim)): # before the cell, input, forget and output gates, x needs to # be transformed linear = Linear( dims[layer], dims[layer + 1] * 4, #weights_init=Uniform(mean=data_mean, std=1), weights_init=IsotropicGaussian(mean=1., std=1), biases_init=Constant(0), name="linear" + str(layer)) linear.initialize() lstm_input = linear.apply(layers_input[layer]) # the lstm wants batch X time X value lstm = LSTM(dim=dims[layer + 1], weights_init=IsotropicGaussian(mean=0., std=0.5), biases_init=Constant(1), name="lstm" + str(layer)) lstm.initialize() # hack to use Orthogonal on lstm w_state lstm.W_state.set_value(Orthogonal().generate( np.random, lstm.W_state.get_value().shape)) h, _dummy = lstm.apply(lstm_input) layers_input.append(h) # the idea is to have one gaussian parametrize every frequency bin print "Last linear transform dim :", dims[1:].sum() output_transform = Linear( dims[1:].sum(), self.output_dim, weights_init=IsotropicGaussian(mean=0., std=1), biases_init=Constant(0), #use_bias=False, name="output_transform") output_transform.initialize() if len(self.lstm_layers_dim) == 1: print "hallo there, only one layer speaking" y_hat = output_transform.apply(layers_input[-1]) else: y_hat = output_transform.apply( T.concatenate(layers_input[1:], axis=2)) sig = T.nnet.relu(y_hat[:, :, :self.output_dim / 2]) + 0.05 mus = y_hat[:, :, self.output_dim / 2:] # sum likelihood with targets # sum inside log accross mixtures, sum outside log accross time inside_expo = -0.5 * ((y - mus)**2) / sig**2 expo = T.exp(inside_expo) coeff = 1. / (T.sqrt(2. * np.pi) * sig) inside_log = T.log(coeff * expo) inside_log_max = T.max(inside_log, axis=2, keepdims=True) LL = -(inside_log_max + T.log( T.sum(T.exp(inside_log - inside_log_max), axis=2, keepdims=True))).sum() #zinside_expo = -0.5*((z-mus)**2)/sig**2 #zexpo = T.exp(zinside_expo) #zcoeff = pis*(1./(T.sqrt(2.*np.pi)*sig)) #zinside_log = (zcoeff*zexpo).sum(axis=2) #zLL = -(T.log(zinside_log)).sum() model = Model(LL) self.model = model parameters = model.parameters grads = T.grad(LL, parameters) updates = [] lr = T.scalar('lr') for i in range(len(grads)): #updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]])) updates.append( tuple([parameters[i], parameters[i] - lr * grads[i]])) #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False)) if self.debug: gradf = theano.function([x, y, lr], [LL, mus, sig], updates=updates) else: #gradf = theano.function([x, y, z],[zLL],updates=updates) gradf = theano.function([x, y, lr], [LL], updates=updates) f = theano.function([x], [sig, mus]) return gradf, f
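# A standalone numpy check (assumption: illustrative only, not part of
# build_theano_functions above). It verifies that the max-shifted
# log-sum-exp used for LL is numerically equivalent to the naive log of the
# summed Gaussian densities over the last axis.
import numpy as np

rng = np.random.RandomState(0)
y = rng.randn(2, 3, 4)
mus = rng.randn(2, 3, 4)
sig = np.abs(rng.randn(2, 3, 4)) + 0.05

inside_log = (np.log(1. / (np.sqrt(2. * np.pi) * sig))
              - 0.5 * ((y - mus) ** 2) / sig ** 2)
m = inside_log.max(axis=2, keepdims=True)
stable = m + np.log(np.exp(inside_log - m).sum(axis=2, keepdims=True))
naive = np.log(np.exp(inside_log).sum(axis=2, keepdims=True))
assert np.allclose(stable, naive)
print(-stable.sum())  # the same quantity the Theano graph computes as LL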
def _pokemon_dcgan(): inits = { 'weights_init': IsotropicGaussian(0.01), 'biases_init': Constant(0.) } batch_size = 20 data_train = PokemonGenYellowNormal(which_sets=['train'], sources=['features']) train_stream = Flatten(DataStream.default_stream( data_train, iteration_scheme=SequentialScheme( data_train.num_examples, batch_size))) features_size = 56 * 56 * 1 inputs = T.matrix('features') inputs = (inputs)/255. * 2. - 1. # rng = MRG_RandomStreams(123) # inputs = inputs * rng.binomial(size=inputs.shape, p=0.1) prior = Z_prior(dim=256) gen = Generator(input_dim=256, dims=[128, 64, 64, features_size], alpha=0.1, **inits) dis = Discriminator(dims=[features_size, 128, 64, 64], alpha=0.1, **inits) gan = GAN(dis=dis, gen=gen, prior=prior) gan.initialize() y_hat1, y_hat0, z = gan.apply(inputs) model = Model([y_hat0, y_hat1]) loss = WGANLoss() dis_obj, gen_obj = loss.apply(y_hat0, y_hat1) dis_obj.name = 'Discriminator loss' gen_obj.name = 'Generator loss' cg = ComputationGraph([gen_obj, dis_obj]) gen_filter = VariableFilter(roles=[PARAMETER], bricks=gen.linear_transformations) dis_filter = VariableFilter(roles=[PARAMETER], bricks=dis.linear_transformations) gen_params = gen_filter(cg.variables) dis_params = dis_filter(cg.variables) # Prepare the dropout _inputs = [] for brick_ in [gen]: _inputs.extend(VariableFilter(roles=[INPUT], bricks=brick_.linear_transformations)( cg.variables)) cg_dropout = apply_dropout(cg, _inputs, 0.02) gen_obj = cg_dropout.outputs[0] dis_obj = cg_dropout.outputs[1] gan.dis_params = dis_params gan.gen_params = gen_params algo = AdverserialTraning(gen_obj=gen_obj, dis_obj=dis_obj, model=gan, dis_iter=5, step_rule=RMSProp(learning_rate=1e-4), gen_consider_constant=z) neg_sample = gan.sampling(size=25) monitor = TrainingDataMonitoring(variables=[gen_obj, dis_obj], prefix="train", after_batch=True) subdir = './exp/' + 'pokemon' + "-" + time.strftime("%Y%m%d-%H%M%S") check_point = Checkpoint("{}/{}".format(subdir, 'pokemon'), every_n_epochs=100, save_separately=['log', 'model']) neg_sampling = GenerateNegtiveSample(neg_sample, img_size=(25, 56, 56), every_n_epochs=100) if not os.path.exists(subdir): os.makedirs(subdir) main_loop = MainLoop(algorithm=algo, model=model, data_stream=train_stream, extensions=[Printing(), ProgressBar(), monitor, check_point, neg_sampling]) main_loop.run()
def _pokemon_wgan_gp(): import os os.environ["FUEL_DATA_PATH"] = os.getcwd() + "/data/" batch_size = 20 data_train = PokemonGenYellowNormal(which_sets=['train'], sources=['features']) train_stream = Flatten(DataStream.default_stream( data_train, iteration_scheme=SequentialScheme( data_train.num_examples, batch_size))) features_size = 56 * 56 * 1 inits = { 'weights_init': IsotropicGaussian(0.01), 'biases_init': Constant(0.) } # print train_stream.get_epoch_iterator(as_dict=True).next() # raise inputs = T.matrix('features') inputs = ((inputs / 255.) * 2. - 1.) rng = MRG_RandomStreams(123) prior = Z_prior(dim=512) gen = Generator(input_dim=512, dims=[512, 512, 512, 512, features_size], alpha=0.1, **inits) dis = Discriminator(dims=[features_size, 512, 512 , 512, 512], alpha=0.1, **inits) gan = GAN(dis=dis, gen=gen, prior=prior) gan.initialize() # gradient penalty fake_samples, _ = gan.sampling(inputs.shape[0]) e = rng.uniform(size=(inputs.shape[0], 1)) mixed_input = (e * fake_samples) + (1 - e) * inputs output_d_mixed = gan._dis.apply(mixed_input) grad_mixed = T.grad(T.sum(output_d_mixed), mixed_input) norm_grad_mixed = T.sqrt(T.sum(T.square(grad_mixed), axis=1)) grad_penalty = T.mean(T.square(norm_grad_mixed -1)) y_hat1, y_hat0, z = gan.apply(inputs) d_loss_real = y_hat1.mean() d_loss_fake = y_hat0.mean() d_loss = - d_loss_real + d_loss_fake + 10 * grad_penalty g_loss = - d_loss_fake dis_obj = d_loss gen_obj = g_loss model = Model([y_hat0, y_hat1]) em_loss = -d_loss_real + d_loss_fake em_loss.name = "Earth Move loss" dis_obj.name = 'Discriminator loss' gen_obj.name = 'Generator loss' cg = ComputationGraph([gen_obj, dis_obj]) gen_filter = VariableFilter(roles=[PARAMETER], bricks=gen.linear_transformations) dis_filter = VariableFilter(roles=[PARAMETER], bricks=dis.linear_transformations) gen_params = gen_filter(cg.variables) dis_params = dis_filter(cg.variables) # Prepare the dropout _inputs = [] for brick_ in [gen]: _inputs.extend(VariableFilter(roles=[INPUT], bricks=brick_.linear_transformations)(cg.variables)) cg_dropout = apply_dropout(cg, _inputs, 0.02) gen_obj = cg_dropout.outputs[0] dis_obj = cg_dropout.outputs[1] gan.dis_params = dis_params gan.gen_params = gen_params # gradient penalty algo = AdverserialTraning(gen_obj=gen_obj, dis_obj=dis_obj, model=gan, dis_iter=5, gradient_clip=None, step_rule=RMSProp(learning_rate=1e-4), gen_consider_constant=z) neg_sample = gan.sampling(size=25) from blocks.monitoring.aggregation import mean monitor = TrainingDataMonitoring(variables=[mean(gen_obj), mean(dis_obj), mean(em_loss)], prefix="train", after_batch=True) subdir = './exp/' + 'pokemon-wgan-gp' + "-" + time.strftime("%Y%m%d-%H%M%S") check_point = Checkpoint("{}/{}".format(subdir, 'CIFAR10'), every_n_epochs=100, save_separately=['log', 'model']) neg_sampling = GenerateNegtiveSample(neg_sample, img_size=(25, 56, 56), every_n_epochs=10) if not os.path.exists(subdir): os.makedirs(subdir) main_loop = MainLoop(algorithm=algo, model=model, data_stream=train_stream, extensions=[Printing(), ProgressBar(), monitor, check_point, neg_sampling]) main_loop.run()
def main(config, ensemble, tr_stream, dev_stream, use_bokeh=False): # Create Theano variables. logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') sampling_input = tensor.lmatrix('input') # Construct model. logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder( # End_embed is dimension of word embedding matrix in encoder; enc_nhids number of hidden units in encoder GRU. config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) decoder = Decoder( config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2) cost = decoder.cost( encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask) testVar = decoder.getTestVar( encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask) logger.info('Creating computational graph') cg = ComputationGraph(cost) # Initialize model. logger.info('Initializing model') my_rng = numpy.random.RandomState(config['rng_value'] * ensemble) if config['identity_init'] or config['all_identity_init']: encoder.weights_init = decoder.weights_init = Identity() else: encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.rng = decoder.rng = my_rng encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() if config['all_identity_init']: decoder.transition.weights_init = Identity() else: decoder.transition.weights_init = Orthogonal() decoder.transition.rng = my_rng encoder.initialize() decoder.initialize() # Apply dropout for regularization. if config['dropout'] < 1.0: # Dropout is applied to the output of maxout in ghog. logger.info('Applying dropout') dropout_inputs = [x for x in cg.intermediary_variables if x.name == 'maxout_apply_output'] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # Apply weight noise for regularization. if config['weight_noise_ff'] > 0.0: logger.info('Applying weight noise to ff layers') enc_params = Selector(encoder.lookup).get_params().values() enc_params += Selector(encoder.fwd_fork).get_params().values() enc_params += Selector(encoder.back_fork).get_params().values() dec_params = Selector( decoder.sequence_generator.readout).get_params().values() dec_params += Selector( decoder.sequence_generator.fork).get_params().values() dec_params += Selector(decoder.state_init).get_params().values() cg = apply_noise(cg, enc_params+dec_params, config['weight_noise_ff'], seed=my_rng) # Print shapes. shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names. enc_dec_param_dict = merge(Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.items(): logger.info(' {:15}: {}'.format(value.get_value().shape, name)) logger.info("Total number of parameters: {}" .format(len(enc_dec_param_dict))) # Set up training model. logger.info("Building model") training_model = Model(cost) # Set extensions. 
logger.info("Initializing extensions") if config['early_stopping']: extensions = [ FinishAfter(after_n_epochs=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] else: extensions = [ FinishAfter(after_n_epochs=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] # Set up beam search and sampling computation graphs if necessary. if config['hook_samples'] >= 1: logger.info("Building sampling model") sampling_representation = encoder.apply( sampling_input, tensor.ones(sampling_input.shape)) generated = decoder.generate(sampling_input, sampling_representation) search_model = Model(generated) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # Add sampling. if config['hook_samples'] >= 1: logger.info("Building sampler") extensions.append( Sampler(model=search_model, data_stream=tr_stream, hook_samples=config['hook_samples'], every_n_batches=config['sampling_freq'], src_vocab_size=8)) if config['val_set'] is not None: logger.info("Building accuracy validator") extensions.append( AccuracyValidator(sampling_input, samples=samples, config=config, model=search_model, data_stream=dev_stream, after_training=True, after_epoch=True)) else: logger.info("No validation set given for this language") # Reload model if necessary. if config['reload']: extensions.append(LoadNMT(config['saveto'])) # Set up training algorithm. logger.info("Initializing training algorithm") algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=CompositeRule([StepClipping(config['step_clipping']), eval(config['step_rule'])()]) ) # Initialize main loop. logger.info("Initializing main loop") main_loop = MainLoop( model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions ) # Train! main_loop.run()
def mainPredict(config, data_to_predict_stream, use_ensemble, lang=None, the_task=None, et_version=False, use_bokeh=False): # Create Theano variables. logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') sampling_input = tensor.lmatrix('input') # Construct model. logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder( config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) decoder = Decoder( config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2) cost = decoder.cost( encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask) logger.info('Creating computational graph') cg = ComputationGraph(cost) # Initialize model. logger.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() # Print shapes. shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names. enc_dec_param_dict = merge(Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.items(): logger.info(' {:15}: {}'.format(value.get_value().shape, name)) logger.info("Total number of parameters: {}" .format(len(enc_dec_param_dict))) # Set extensions. logger.info("Initializing (empty) extensions") extensions = [ ] logger.info("Building sampling model") sampling_representation = encoder.apply( sampling_input, tensor.ones(sampling_input.shape)) generated = decoder.generate(sampling_input, sampling_representation) search_model = Model(generated) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # Reload the model (as this is prediction, it is 100% necessary). if config['reload']: extensions.append(LoadOnlyModel(config['saveto'])) else: raise Exception('No model available for prediction! (Check config[\'reload\'] variable)') # Set up training algorithm. logger.info("Initializing training algorithm") algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=CompositeRule([StepClipping(config['step_clipping']), eval(config['step_rule'])()]) ) # Initialize main loop. logger.info("Initializing main loop") main_loop = MainLoop( model=search_model, algorithm=algorithm, data_stream=data_to_predict_stream, extensions=extensions ) predictByHand(main_loop, decoder, data_to_predict_stream, use_ensemble, lang, the_task, et_version, config)
def main(self): import itertools import numpy from theano import tensor from blocks.algorithms import Adam from blocks.bricks import MLP, Rectifier, Identity, LinearMaxout, Linear from blocks.bricks.bn import BatchNormalization from blocks.bricks.sequences import Sequence from blocks.extensions import FinishAfter, Timing, Printing, ProgressBar from blocks.extensions.monitoring import DataStreamMonitoring from blocks.extensions.saveload import Checkpoint from blocks.graph import ComputationGraph, apply_dropout from blocks.graph.bn import (batch_normalization, get_batch_normalization_updates) from blocks.filter import VariableFilter from blocks.initialization import IsotropicGaussian, Constant from blocks.model import Model from blocks.main_loop import MainLoop from blocks.roles import INPUT from ali.algorithms import ali_algorithm from ali.streams import create_gaussian_mixture_data_streams from ali.bricks import (ALI, COVConditional, DeterministicConditional, XZJointDiscriminator) from ali.utils import as_array from blocks.select import Selector import logging import argparse from pacgan.extensions import ModelLogger, GraphLogger, MetricLogger import fuel seed = random.randint(1, 100000) fuelrc_path = os.path.join(self._work_dir, ".fuelrc") f = open(fuelrc_path, "w") f.write("default_seed: {}\n".format(seed)) f.close() fuel.config.default_seed = seed INPUT_DIM = 2 NLAT = 2 GEN_HIDDEN = 400 DISC_HIDDEN = 200 GEN_ACTIVATION = Rectifier MAXOUT_PIECES = 5 GAUSSIAN_INIT = IsotropicGaussian(std=0.02) ZERO_INIT = Constant(0.0) NUM_EPOCHS = 400 LEARNING_RATE = 1e-4 BETA1 = 0.8 BATCH_SIZE = 100 MONITORING_BATCH_SIZE = 500 MEANS = [ numpy.array([i, j]) for i, j in itertools.product(range(-4, 5, 2), range(-4, 5, 2)) ] VARIANCES = [0.05**2 * numpy.eye(len(mean)) for mean in MEANS] PRIORS = None def create_model_brick(): encoder_mapping = MLP( dims=[2 * INPUT_DIM, GEN_HIDDEN, GEN_HIDDEN, NLAT], activations=[ Sequence([ BatchNormalization(GEN_HIDDEN).apply, GEN_ACTIVATION().apply ], name='encoder_h1'), Sequence([ BatchNormalization(GEN_HIDDEN).apply, GEN_ACTIVATION().apply ], name='encoder_h2'), Identity(name='encoder_out') ], use_bias=False, name='encoder_mapping') encoder = COVConditional(encoder_mapping, (INPUT_DIM, ), name='encoder') decoder_mapping = MLP(dims=[ NLAT, GEN_HIDDEN, GEN_HIDDEN, GEN_HIDDEN, GEN_HIDDEN, INPUT_DIM ], activations=[ Sequence([ BatchNormalization(GEN_HIDDEN).apply, GEN_ACTIVATION().apply ], name='decoder_h1'), Sequence([ BatchNormalization(GEN_HIDDEN).apply, GEN_ACTIVATION().apply ], name='decoder_h2'), Sequence([ BatchNormalization(GEN_HIDDEN).apply, GEN_ACTIVATION().apply ], name='decoder_h3'), Sequence([ BatchNormalization(GEN_HIDDEN).apply, GEN_ACTIVATION().apply ], name='decoder_h4'), Identity(name='decoder_out') ], use_bias=False, name='decoder_mapping') decoder = DeterministicConditional(decoder_mapping, name='decoder') x_discriminator = Identity(name='x_discriminator') z_discriminator = Identity(name='z_discriminator') joint_discriminator = Sequence(application_methods=[ LinearMaxout(input_dim=INPUT_DIM + NLAT, output_dim=DISC_HIDDEN, num_pieces=MAXOUT_PIECES, weights_init=GAUSSIAN_INIT, biases_init=ZERO_INIT, name='discriminator_h1').apply, LinearMaxout(input_dim=DISC_HIDDEN, output_dim=DISC_HIDDEN, num_pieces=MAXOUT_PIECES, weights_init=GAUSSIAN_INIT, biases_init=ZERO_INIT, name='discriminator_h2').apply, LinearMaxout(input_dim=DISC_HIDDEN, output_dim=DISC_HIDDEN, num_pieces=MAXOUT_PIECES, weights_init=GAUSSIAN_INIT, biases_init=ZERO_INIT, 
name='discriminator_h3').apply, Linear(input_dim=DISC_HIDDEN, output_dim=1, weights_init=GAUSSIAN_INIT, biases_init=ZERO_INIT, name='discriminator_out').apply ], name='joint_discriminator') discriminator = XZJointDiscriminator(x_discriminator, z_discriminator, joint_discriminator, name='discriminator') ali = ALI(encoder=encoder, decoder=decoder, discriminator=discriminator, weights_init=GAUSSIAN_INIT, biases_init=ZERO_INIT, name='ali') ali.push_allocation_config() encoder_mapping.linear_transformations[-1].use_bias = True decoder_mapping.linear_transformations[-1].use_bias = True ali.initialize() print("Number of parameters in discriminator: {}".format( numpy.sum([ numpy.prod(v.shape.eval()) for v in Selector( ali.discriminator).get_parameters().values() ]))) print("Number of parameters in encoder: {}".format( numpy.sum([ numpy.prod(v.shape.eval()) for v in Selector(ali.encoder).get_parameters().values() ]))) print("Number of parameters in decoder: {}".format( numpy.sum([ numpy.prod(v.shape.eval()) for v in Selector(ali.decoder).get_parameters().values() ]))) return ali def create_models(): ali = create_model_brick() x = tensor.matrix('features') z = ali.theano_rng.normal(size=(x.shape[0], NLAT)) def _create_model(with_dropout): cg = ComputationGraph(ali.compute_losses(x, z)) if with_dropout: inputs = VariableFilter(bricks=ali.discriminator. joint_discriminator.children[1:], roles=[INPUT])(cg.variables) cg = apply_dropout(cg, inputs, 0.5) inputs = VariableFilter( bricks=[ali.discriminator.joint_discriminator], roles=[INPUT])(cg.variables) cg = apply_dropout(cg, inputs, 0.2) return Model(cg.outputs) model = _create_model(with_dropout=False) with batch_normalization(ali): bn_model = _create_model(with_dropout=False) pop_updates = list( set( get_batch_normalization_updates(bn_model, allow_duplicates=True))) bn_updates = [(p, m * 0.05 + p * 0.95) for p, m in pop_updates] return model, bn_model, bn_updates def create_main_loop(): model, bn_model, bn_updates = create_models() ali, = bn_model.top_bricks discriminator_loss, generator_loss = bn_model.outputs step_rule = Adam(learning_rate=LEARNING_RATE, beta1=BETA1) algorithm = ali_algorithm(discriminator_loss, ali.discriminator_parameters, step_rule, generator_loss, ali.generator_parameters, step_rule) algorithm.add_updates(bn_updates) streams = create_gaussian_mixture_data_streams( batch_size=BATCH_SIZE, monitoring_batch_size=MONITORING_BATCH_SIZE, means=MEANS, variances=VARIANCES, priors=PRIORS) main_loop_stream, train_monitor_stream, valid_monitor_stream = streams bn_monitored_variables = ([ v for v in bn_model.auxiliary_variables if 'norm' not in v.name ] + bn_model.outputs) monitored_variables = ( [v for v in model.auxiliary_variables if 'norm' not in v.name] + model.outputs) extensions = [ Timing(), FinishAfter(after_n_epochs=NUM_EPOCHS), DataStreamMonitoring(bn_monitored_variables, train_monitor_stream, prefix="train", updates=bn_updates), DataStreamMonitoring(monitored_variables, valid_monitor_stream, prefix="valid"), Checkpoint(os.path.join(self._work_dir, "main_loop.tar"), after_epoch=True, after_training=True, use_cpickle=True), ProgressBar(), Printing(), #ModelLogger(folder=self._work_dir, after_epoch=True), GraphLogger(num_modes=1, num_samples=2500, dimension=2, r=0, std=1, folder=self._work_dir, after_epoch=True, after_training=True), MetricLogger(means=MEANS, variances=VARIANCES, folder=self._work_dir, after_epoch=True) ] main_loop = MainLoop(model=bn_model, data_stream=main_loop_stream, algorithm=algorithm, extensions=extensions) 
return main_loop main_loop = create_main_loop() main_loop.run()
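# --- Illustrative sketch (not from the original script): the population-statistics
# update above, `(p, m * 0.05 + p * 0.95)`, is an exponential moving average of the
# minibatch statistic m into the population value p. Plain-numpy illustration with
# made-up values:
p = 0.0                         # population estimate (e.g. a running mean)
minibatch_means = [1.0, 1.2, 0.8, 1.1]
for m in minibatch_means:
    p = 0.05 * m + 0.95 * p     # same decay (0.95) as the bn_updates list above
print(p)                        # slowly tracks the minibatch statistics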
def test_sequence_generator(): """Test a sequence generator with no contexts and continuous outputs. Such sequence generators can be used to model e.g. dynamical systems. """ floatX = theano.config.floatX rng = numpy.random.RandomState(1234) output_dim = 1 dim = 20 batch_size = 30 n_steps = 10 transition = SimpleRecurrent(activation=Tanh(), dim=dim, weights_init=Orthogonal()) generator = SequenceGenerator(Readout(readout_dim=output_dim, source_names=["states"], emitter=TestEmitter()), transition, weights_init=IsotropicGaussian(0.1), biases_init=Constant(0.0), seed=1234) generator.initialize() # Test 'cost_matrix' method y = tensor.tensor3('y') mask = tensor.matrix('mask') costs = generator.cost_matrix(y, mask) assert costs.ndim == 2 y_test = rng.uniform(size=(n_steps, batch_size, output_dim)).astype(floatX) m_test = numpy.ones((n_steps, batch_size), dtype=floatX) costs_val = theano.function([y, mask], [costs])(y_test, m_test)[0] assert costs_val.shape == (n_steps, batch_size) assert_allclose(costs_val.sum(), 115.593, rtol=1e-5) # Test 'cost' method cost = generator.cost(y, mask) assert cost.ndim == 0 cost_val = theano.function([y, mask], [cost])(y_test, m_test) assert_allclose(cost_val, 3.8531, rtol=1e-5) # Test 'AUXILIARY' variable 'per_sequence_element' in 'cost' method cg = ComputationGraph([cost]) var_filter = VariableFilter(roles=[AUXILIARY]) aux_var_name = '_'.join( [generator.name, generator.cost.name, 'per_sequence_element']) cost_per_el = [ el for el in var_filter(cg.variables) if el.name == aux_var_name ][0] assert cost_per_el.ndim == 0 cost_per_el_val = theano.function([y, mask], [cost_per_el])(y_test, m_test) assert_allclose(cost_per_el_val, 0.38531, rtol=1e-5) # Test 'generate' method states, outputs, costs = [ variable.eval() for variable in generator.generate(states=rng.uniform( size=(batch_size, dim)).astype(floatX), iterate=True, batch_size=batch_size, n_steps=n_steps) ] assert states.shape == (n_steps, batch_size, dim) assert outputs.shape == (n_steps, batch_size, output_dim) assert costs.shape == (n_steps, batch_size) assert_allclose(outputs.sum(), -0.33683, rtol=1e-5) assert_allclose(states.sum(), 15.7909, rtol=1e-5) # There is no generation cost in this case, since generation is # deterministic assert_allclose(costs.sum(), 0.0)
def main(mode, save_path, steps, num_batches): num_states = MarkovChainDataset.num_states if mode == "train": # Experiment configuration rng = numpy.random.RandomState(1) batch_size = 50 seq_len = 100 dim = 10 feedback_dim = 8 # Build the bricks and initialize them transition = GatedRecurrent(name="transition", dim=dim, activation=Tanh()) generator = SequenceGenerator(Readout( readout_dim=num_states, source_names=["states"], emitter=SoftmaxEmitter(name="emitter"), feedback_brick=LookupFeedback(num_states, feedback_dim, name='feedback'), name="readout"), transition, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), name="generator") generator.push_initialization_config() transition.weights_init = Orthogonal() generator.initialize() # Give an idea of what's going on. logger.info("Parameters:\n" + pprint.pformat( [(key, value.get_value().shape) for key, value in Selector(generator).get_params().items()], width=120)) logger.info("Markov chain entropy: {}".format( MarkovChainDataset.entropy)) logger.info("Expected min error: {}".format( -MarkovChainDataset.entropy * seq_len)) # Build the cost computation graph. x = tensor.lmatrix('data') cost = aggregation.mean( generator.cost_matrix(x[:, :]).sum(), x.shape[1]) cost.name = "sequence_log_likelihood" algorithm = GradientDescent( cost=cost, params=list(Selector(generator).get_params().values()), step_rule=Scale(0.001)) main_loop = MainLoop(algorithm=algorithm, data_stream=DataStream( MarkovChainDataset(rng, seq_len), iteration_scheme=ConstantScheme(batch_size)), model=Model(cost), extensions=[ FinishAfter(after_n_batches=num_batches), TrainingDataMonitoring([cost], prefix="this_step", after_batch=True), TrainingDataMonitoring([cost], prefix="average", every_n_batches=100), Checkpoint(save_path, every_n_batches=500), Printing(every_n_batches=100) ]) main_loop.run() elif mode == "sample": main_loop = cPickle.load(open(save_path, "rb")) generator = main_loop.model sample = ComputationGraph( generator.generate(n_steps=steps, batch_size=1, iterate=True)).get_theano_function() states, outputs, costs = [data[:, 0] for data in sample()] numpy.set_printoptions(precision=3, suppress=True) print("Generation cost:\n{}".format(costs.sum())) freqs = numpy.bincount(outputs).astype(floatX) freqs /= freqs.sum() print("Frequencies:\n {} vs {}".format(freqs, MarkovChainDataset.equilibrium)) trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX) for a, b in zip(outputs, outputs[1:]): trans_freqs[a, b] += 1 trans_freqs /= trans_freqs.sum(axis=1)[:, None] print("Transition frequencies:\n{}\nvs\n{}".format( trans_freqs, MarkovChainDataset.trans_prob)) else: assert False
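# --- Illustrative sketch (not part of the original script): the "sample" branch
# above estimates a transition matrix by counting consecutive state pairs and
# row-normalising. The same idea on a made-up state sequence:
import numpy

outputs = numpy.array([0, 1, 1, 2, 0, 1, 2, 2, 0])    # toy sampled states
num_states = 3
trans_freqs = numpy.zeros((num_states, num_states))
for a, b in zip(outputs, outputs[1:]):
    trans_freqs[a, b] += 1                             # count a -> b transitions
trans_freqs /= trans_freqs.sum(axis=1)[:, None]        # normalise each row
print(trans_freqs)                                     # empirical transition probabilities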
import numpy
import theano
import unittest

from theano import tensor

from blocks.bricks import MLP, Logistic
from blocks.initialization import IsotropicGaussian, Constant

from helmholtz.prob_layers import *

floatX = theano.config.floatX

inits = {
    'weights_init': IsotropicGaussian(0.1),
    'biases_init': Constant(-1.0),
}

#---------------------------------------------------------------------------

def test_bernoulli_top_layer():
    # Setup layer
    dim_x = 100
    l = BernoulliTopLayer(dim_x, name="layer", **inits)
    l.initialize()

    n_samples = tensor.iscalar('n_samples')
    x_expected = l.sample_expected()
    x, x_log_prob = l.sample(n_samples)

    do = theano.function([n_samples], [x_expected, x, x_log_prob],
from blocks.roles import INPUT from theano import tensor from ali.algorithms import ali_algorithm from ali.bricks import (ALI, GaussianConditional, DeterministicConditional, XZJointDiscriminator) from ali.streams import create_tiny_imagenet_data_streams from ali.utils import get_log_odds, conv_brick, conv_transpose_brick, bn_brick BATCH_SIZE = 128 MONITORING_BATCH_SIZE = 128 NUM_EPOCHS = 1000 IMAGE_SIZE = (64, 64) NUM_CHANNELS = 3 NLAT = 256 GAUSSIAN_INIT = IsotropicGaussian(std=0.01) ZERO_INIT = Constant(0) LEARNING_RATE = 1e-4 BETA1 = 0.5 def create_model_brick(): layers = [ conv_brick(4, 2, 64), bn_brick(), LeakyRectifier(), conv_brick(4, 1, 64), bn_brick(), LeakyRectifier(), conv_brick(4, 2, 128), bn_brick(),
def main(): x = T.tensor3('features') m = T.matrix('features_mask') y = T.imatrix('targets') embedding_size = 300 glove_version = "glove.6B.300d.txt" #embedding_size = 50 #glove_version = "vectors.6B.50d.txt" o = x.sum(axis=1) + m.mean() * 0 score_layer = Linear( input_dim = 300, output_dim = 1, weights_init = IsotropicGaussian(std=0.02), biases_init = Constant(0.), name="linear2") score_layer.initialize() o = score_layer.apply(o) probs = Sigmoid().apply(o) cost = - (y * T.log(probs) + (1-y) * T.log(1 - probs)).mean() cost.name = 'cost' misclassification = (y * (probs < 0.5) + (1-y) * (probs > 0.5)).mean() misclassification.name = 'misclassification' # ================= cg = ComputationGraph([cost]) params = cg.parameters algorithm = GradientDescent( cost = cg.outputs[0], params=params, step_rule = CompositeRule([ StepClipping(threshold=4), AdaM(), ]) ) # ======== print "setting up data" ports = { 'gpu0_train' : 5557, 'gpu0_test' : 5558, 'gpu1_train' : 5559, 'gpu1_test' : 5560, } #batch_size = 16 batch_size = 16 def start_server(port, which_set): fuel.server.logger.setLevel('WARN') dataset = IMDBText(which_set, sorted=True) n_train = dataset.num_examples #scheme = ShuffledScheme(examples=n_train, batch_size=batch_size) scheme = BatchwiseShuffledScheme(examples=n_train, batch_size=batch_size) stream = DataStream( dataset=dataset, iteration_scheme=scheme) print "loading glove" glove = GloveTransformer(glove_version, data_stream=stream) padded = Padding( data_stream=glove, mask_sources=('features',) ) fuel.server.start_server(padded, port=port, hwm=20) train_port = ports[theano.config.device + '_train'] train_p = Process(target=start_server, args=(train_port, 'train')) train_p.start() test_port = ports[theano.config.device + '_test'] test_p = Process(target=start_server, args=(test_port, 'test')) test_p.start() train_stream = ServerDataStream(('features', 'features_mask', 'targets'), port=train_port) test_stream = ServerDataStream(('features', 'features_mask', 'targets'), port=test_port) print "setting up model" n_examples = 25000 #====== model = Model(cost) extensions = [] extensions.append(EpochProgress(batch_per_epoch=n_examples // batch_size + 1)) extensions.append(TrainingDataMonitoring( [ cost, misclassification, ], prefix='train', after_epoch=True )) #extensions.append(DataStreamMonitoring( #[cost, misclassification], #data_stream=test_stream, #prefix='test', #after_epoch=True #)) extensions.append(Timing()) extensions.append(Printing()) extensions.append(Plot( theano.config.device+"_result", channels=[['train_cost']], after_epoch=True )) main_loop = MainLoop( model=model, data_stream=train_stream, algorithm=algorithm, extensions=extensions) main_loop.run()
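# --- Illustrative sketch (not part of the original script): the cost and
# misclassification expressions above, checked on made-up numpy values.
import numpy

y = numpy.array([[1.], [0.], [1.]])            # targets
probs = numpy.array([[0.9], [0.2], [0.4]])     # sigmoid outputs

# Same formulas as the Theano graph above.
cost = -(y * numpy.log(probs) + (1 - y) * numpy.log(1 - probs)).mean()
misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5)).mean()
print(cost, misclassification)    # third example (p=0.4, y=1) is the one misclassified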
def main(config, tr_stream, dev_stream, use_bokeh=False): # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') initial_context = tensor.matrix('initial_context') # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder( config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) decoder = InitialContextDecoder( config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2, config['context_dim']) cost = decoder.cost( encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask, initial_context) cost.name = 'decoder_cost' logger.info('Creating computational graph') cg = ComputationGraph(cost) # Initialize model logger.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog # this is the probability of dropping out, so you probably want to make it <=0.5 logger.info('Applying dropout') dropout_inputs = [x for x in cg.intermediary_variables if x.name == 'maxout_apply_output'] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # Apply weight noise for regularization if config['weight_noise_ff'] > 0.0: logger.info('Applying weight noise to ff layers') enc_params = Selector(encoder.lookup).get_parameters().values() enc_params += Selector(encoder.fwd_fork).get_parameters().values() enc_params += Selector(encoder.back_fork).get_parameters().values() dec_params = Selector( decoder.sequence_generator.readout).get_parameters().values() dec_params += Selector( decoder.sequence_generator.fork).get_parameters().values() dec_params += Selector(decoder.transition.initial_transformer).get_parameters().values() cg = apply_noise(cg, enc_params+dec_params, config['weight_noise_ff']) # TODO: weight noise for recurrent params isn't currently implemented -- see config['weight_noise_rec'] # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge(Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.items(): logger.info(' {:15}: {}'.format(value.get_value().shape, name)) logger.info("Total number of parameters: {}" .format(len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # create the training directory, and copy this config there if directory doesn't exist if not os.path.isdir(config['saveto']): os.makedirs(config['saveto']) shutil.copy(config['config_file'], config['saveto']) # Set extensions # TODO: add checking for existing model and loading logger.info("Initializing extensions") extensions = [ 
FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] # Create the theano variables that we need for the sampling graph sampling_input = tensor.lmatrix('input') sampling_context = tensor.matrix('context_input') # WORKING: change this part to account for the new initial context for decoder # Set up beam search and sampling computation graphs if necessary if config['hook_samples'] >= 1 or config['bleu_script'] is not None: logger.info("Building sampling model") sampling_representation = encoder.apply( sampling_input, tensor.ones(sampling_input.shape)) generated = decoder.generate(sampling_input, sampling_representation, sampling_context) search_model = Model(generated) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs # Add sampling # TODO: currently commented because we need to modify the sampler to use the contexts if config['hook_samples'] >= 1: logger.info("Building sampler") extensions.append( Sampler(model=search_model, data_stream=tr_stream, hook_samples=config['hook_samples'], every_n_batches=config['sampling_freq'], src_vocab=source_vocab, trg_vocab=target_vocab, src_vocab_size=config['src_vocab_size'], )) # TODO: add sampling_context to BleuValidator and Sampler # Add early stopping based on bleu if config['bleu_script'] is not None: logger.info("Building bleu validator") extensions.append( BleuValidator(sampling_input, sampling_context, samples=samples, config=config, model=search_model, data_stream=dev_stream, src_vocab=source_vocab, trg_vocab=target_vocab, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( Plot(config['model_save_directory'], channels=[['decoder_cost', 'validation_set_bleu_score']], every_n_batches=10)) # Set up training algorithm logger.info("Initializing training algorithm") # if there is dropout or random noise, we need to use the output of the modified graph if config['dropout'] < 1.0 or config['weight_noise_ff'] > 0.0: algorithm = GradientDescent( cost=cg.outputs[0], parameters=cg.parameters, step_rule=CompositeRule([StepClipping(config['step_clipping']), eval(config['step_rule'])()]) ) else: algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=CompositeRule([StepClipping(config['step_clipping']), eval(config['step_rule'])()]) ) # enrich the logged information extensions.append( Timing(every_n_batches=100) ) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop( model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions ) # Train! main_loop.run()
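# --- Illustrative sketch (not from the original script): `eval(config['step_rule'])()`
# above simply instantiates a Blocks step rule by its class name. If, for example,
# config['step_rule'] were 'AdaDelta' and config['step_clipping'] were 1.0 (both values
# made up here), the composite rule would be equivalent to the explicit form below.
from blocks.algorithms import CompositeRule, StepClipping, AdaDelta

step_rule = CompositeRule([StepClipping(1.0), AdaDelta()])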
def __init__(self, config, vocab_size): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='question_embed') embed.weights_init = IsotropicGaussian(0.01) # Calculate question encoding (concatenate layer1) qembed = embed.apply(question) qlstms, qhidden_list = make_bidir_lstm_stack( qembed, config.embed_size, question_mask.astype(theano.config.floatX), config.question_lstm_size, config.question_skip_connections, 'q') bricks = bricks + qlstms if config.question_skip_connections: qenc_dim = 2 * sum(config.question_lstm_size) qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list], axis=1) else: qenc_dim = 2 * config.question_lstm_size[-1] qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]], axis=1) qenc.name = 'qenc' # Calculate context encoding (concatenate layer1) cembed = embed.apply(context) clstms, chidden_list = make_bidir_lstm_stack( cembed, config.embed_size, context_mask.astype(theano.config.floatX), config.ctx_lstm_size, config.ctx_skip_connections, 'ctx') bricks = bricks + clstms if config.ctx_skip_connections: cenc_dim = 2 * sum(config.ctx_lstm_size) #2 : fw & bw cenc = tensor.concatenate(chidden_list, axis=2) else: cenc_dim = 2 * config.question_lstm_size[-1] cenc = tensor.concatenate(chidden_list[-2:], axis=2) cenc.name = 'cenc' # Attention mechanism MLP fwd attention_mlp_fwd = MLP( dims=config.attention_mlp_hidden + [1], activations=config.attention_mlp_activations[1:] + [Identity()], name='attention_mlp_fwd') attention_qlinear_fwd = Linear( input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0], name='attq_fwd') attention_clinear_fwd = Linear( input_dim=cenc_dim / 2, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attc_fwd') bricks += [ attention_mlp_fwd, attention_qlinear_fwd, attention_clinear_fwd ] layer1_fwd = Tanh(name='tanh_fwd') layer1_fwd = layer1_fwd.apply( attention_clinear_fwd.apply(cenc[:, :, :cenc_dim / 2].reshape( (cenc.shape[0] * cenc.shape[1], cenc.shape[2] / 2))).reshape((cenc.shape[0], cenc.shape[1], config.attention_mlp_hidden[0])) + attention_qlinear_fwd.apply(qenc)[None, :, :]) att_weights_fwd = attention_mlp_fwd.apply( layer1_fwd.reshape((layer1_fwd.shape[0] * layer1_fwd.shape[1], layer1_fwd.shape[2]))) att_weights_fwd = att_weights_fwd.reshape( (layer1_fwd.shape[0], layer1_fwd.shape[1])) att_weights_fwd = tensor.nnet.softmax(att_weights_fwd.T) att_weights_fwd.name = 'att_weights_fwd' attended_fwd = tensor.sum(cenc[:, :, :cenc_dim / 2] * att_weights_fwd.T[:, :, None], axis=0) attended_fwd.name = 'attended_fwd' # Attention mechanism MLP bwd attention_mlp_bwd = MLP( dims=config.attention_mlp_hidden + [1], activations=config.attention_mlp_activations[1:] + [Identity()], name='attention_mlp_bwd') attention_qlinear_bwd = Linear( input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0], name='attq_bwd') attention_clinear_bwd = Linear( input_dim=cenc_dim / 2, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attc_bwd') bricks += [ attention_mlp_bwd, attention_qlinear_bwd, 
attention_clinear_bwd ] layer1_bwd = Tanh(name='tanh_bwd') layer1_bwd = layer1_bwd.apply( attention_clinear_bwd.apply(cenc[:, :, cenc_dim / 2:].reshape( (cenc.shape[0] * cenc.shape[1], cenc.shape[2] / 2))).reshape((cenc.shape[0], cenc.shape[1], config.attention_mlp_hidden[0])) + attention_qlinear_bwd.apply(qenc)[None, :, :]) att_weights_bwd = attention_mlp_bwd.apply( layer1_bwd.reshape((layer1_bwd.shape[0] * layer1_bwd.shape[1], layer1_bwd.shape[2]))) att_weights_bwd = att_weights_bwd.reshape( (layer1_bwd.shape[0], layer1_bwd.shape[1])) att_weights_bwd = tensor.nnet.softmax(att_weights_bwd.T) att_weights_bwd.name = 'att_weights_bwd' attended_bwd = tensor.sum(cenc[:, :, cenc_dim / 2:] * att_weights_bwd.T[:, :, None], axis=0) attended_bwd.name = 'attended_bwd' ctx_question = tensor.concatenate([attended_fwd, attended_bwd, qenc], axis=1) ctx_question.name = 'ctx_question' answer_bag = to_bag(answer, vocab_size) answer_bag = tensor.set_subtensor(answer_bag[:, 0:3], 0) relevant_items = answer_bag.sum(axis=1, dtype=theano.config.floatX) def createSequences(j, index, c_enc, c_enc_dim, c_context, c_window_size): sequence = tensor.concatenate([ c_context[j:j + index, :], tensor.zeros((c_window_size - index, c_context.shape[1])) ], axis=0) enc = tensor.concatenate([ c_enc[j + index - 1, :, :], c_enc[j, :, :-1], tensor.tile(c_window_size[None, None], (c_enc.shape[1], 1)) ], axis=1) return enc, sequence def createTargetValues(j, index, c_context, c_vocab_size): sequence_bag = to_bag(c_context[j:j + index, :], c_vocab_size) sequence_bag = tensor.set_subtensor(sequence_bag[:, 0:3], 0) selected_items = sequence_bag.sum(axis=1, dtype=theano.config.floatX) tp = (sequence_bag * answer_bag).sum(axis=1, dtype=theano.config.floatX) precision = tp / (selected_items + 0.00001) recall = tp / (relevant_items + 0.00001) #precision = tensor.set_subtensor(precision[tensor.isnan(precision)], 0.0) #recall = tensor.set_subtensor(recall[tensor.isnan(recall)], 1.0) macroF1 = (2 * (precision * recall)) / (precision + recall + 0.00001) #macroF1 = tensor.set_subtensor(macroF1[tensor.isnan(macroF1)], 0.0) return macroF1 window_size = 3 senc = [] sequences = [] pred_targets = [] for i in range(1, window_size + 1): (all_enc, all_sequence), _ = theano.scan( fn=createSequences, sequences=tensor.arange(cenc.shape[0] - i + 1), non_sequences=[i, cenc, cenc_dim, context, window_size]) (all_macroF1), _ = theano.scan( fn=createTargetValues, sequences=tensor.arange(cenc.shape[0] - i + 1), non_sequences=[i, context, vocab_size]) senc.append(all_enc) sequences.append(all_sequence) pred_targets.append(all_macroF1) senc = tensor.concatenate(senc, axis=0) sequences = tensor.concatenate(sequences, axis=0) pred_targets = tensor.concatenate(pred_targets, axis=0) # F1 prediction Bilinear prediction_linear = Linear(input_dim=2 * cenc_dim, output_dim=cenc_dim + qenc_dim, name='pred_linear') bricks += [prediction_linear] pred_weights = ctx_question[None, :, :] * prediction_linear.apply( senc.reshape( (senc.shape[0] * senc.shape[1], senc.shape[2]))).reshape( (senc.shape[0], senc.shape[1], senc.shape[2])) pred_weights = pred_weights.sum(axis=2) pred_weights = tensor.nnet.sigmoid(pred_weights.T).T pred_weights.name = 'pred_weights' pred_targets = pred_targets / (pred_targets.sum(axis=0) + 0.00001) pred_weights = pred_weights / (pred_weights.sum(axis=0) + 0.00001) #numpy.set_printoptions(edgeitems=500) #pred_targets = theano.printing.Print('pred_targets')(pred_targets) #pred_weights = theano.printing.Print('pred_weights')(pred_weights) cost = 
tensor.nnet.binary_crossentropy(pred_weights, pred_targets).mean() self.predictions = sequences[pred_weights.argmax(axis=0), :, tensor.arange(sequences.shape[2])].T # Apply dropout cg = ComputationGraph([cost]) if config.w_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.w_noise) if config.dropout > 0: cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout) [cost_reg] = cg.outputs # Other stuff cost.name = 'cost' cost_reg.name = 'cost_reg' self.sgd_cost = cost_reg self.monitor_vars = [[cost_reg]] self.monitor_vars_valid = [[cost_reg]] # Initialize bricks embed.initialize() for brick in bricks: brick.weights_init = config.weights_init brick.biases_init = config.biases_init brick.initialize()
def __init__(self): inp = tensor.tensor3('input')[:, :, 0] target = tensor.matrix('target') target = target.reshape((target.shape[0], )) product = tensor.lvector('product') missing = tensor.eq(inp, 0) train_input_mean = 1470614.1 train_input_std = 3256577.0 trans_1 = tensor.concatenate( (inp[:, 1:], tensor.zeros((inp.shape[0], 1))), axis=1) trans_2 = tensor.concatenate((tensor.zeros( (inp.shape[0], 1)), inp[:, :-1]), axis=1) inp = tensor.switch(missing, (trans_1 + trans_2) / 2, inp) lookup = LookupTable(length=352, dim=hidden_dim) product_embed = lookup.apply(product) salut = tensor.concatenate((inp, missing), axis=1) linear = Linear(input_dim=108, output_dim=hidden_dim, name="MLP_in") inter = linear.apply(salut) inter = inter + product_embed mlp1 = MLP(activations=[Rectifier(), Rectifier(), Rectifier()], dims=[hidden_dim, hidden_dim, hidden_dim, hidden_dim], name="premier") inter1 = mlp1.apply(inter) linear2 = Linear(input_dim=hidden_dim, output_dim=out_dim, name="ouput_linear") pred = linear2.apply(inter1) * train_input_std + train_input_mean pred = pred.reshape((product.shape[0], )) cost = tensor.mean(abs((pred - target) / target)) # Initialize all bricks for brick in [linear, linear2, mlp1, lookup]: brick.weights_init = IsotropicGaussian(0.1) brick.biases_init = Constant(0.) brick.initialize() # Apply noise and dropout cg = ComputationGraph([cost]) if w_noise_std > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, w_noise_std) if i_dropout > 0: cg = apply_dropout(cg, [inter1], i_dropout) [cost_reg] = cg.outputs cost_reg += 1e-20 if cost_reg is not cost: self.cost = cost self.cost_reg = cost_reg cost_reg.name = 'cost_reg' cost.name = 'cost' self.sgd_cost = cost_reg self.monitor_vars = [[cost, cost_reg]] else: self.cost = cost cost.name = 'cost' self.sgd_cost = cost self.monitor_vars = [[cost]] self.pred = pred pred.name = 'pred'
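# --- Illustrative sketch (not part of the model above): the regularisation pattern
# used here and in several other snippets -- dropout on a chosen intermediate
# variable plus additive noise on all WEIGHT-role parameters of a ComputationGraph.
# The brick, its dimensions and the rates below are made up for illustration.
from theano import tensor
from blocks.bricks import Linear, Rectifier
from blocks.initialization import IsotropicGaussian, Constant
from blocks.graph import ComputationGraph, apply_dropout, apply_noise
from blocks.filter import VariableFilter
from blocks.roles import WEIGHT

x = tensor.matrix('x')
linear = Linear(input_dim=4, output_dim=3, weights_init=IsotropicGaussian(0.1),
                biases_init=Constant(0.), name='toy_linear')
linear.initialize()
hidden = Rectifier().apply(linear.apply(x))
cost = hidden.sum()
cost.name = 'cost'

cg = ComputationGraph([cost])
cg = apply_dropout(cg, [hidden], 0.5)                    # drop the hidden activation
weights = VariableFilter(roles=[WEIGHT])(cg.variables)   # the Linear brick's W
cg = apply_noise(cg, weights, 0.01)                      # additive Gaussian weight noise
[cost_reg] = cg.outputs                                   # regularised cost to optimise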
import os import cPickle from blocks.initialization import IsotropicGaussian, Constant import data from model.rnn_tgtcls import Model, Stream class EmbedderConfig(object): __slots__ = ('dim_embeddings', 'embed_weights_init') pre_embedder = EmbedderConfig() pre_embedder.embed_weights_init = IsotropicGaussian(0.001) pre_embedder.dim_embeddings = [ ('week_of_year', 52, 10), ('day_of_week', 7, 10), ('qhour_of_day', 24 * 4, 10), ('day_type', 3, 10), ('taxi_id', 448, 10), ] post_embedder = EmbedderConfig() post_embedder.embed_weights_init = IsotropicGaussian(0.001) post_embedder.dim_embeddings = [ ('origin_call', data.origin_call_train_size, 10), ('origin_stand', data.stands_size, 10), ] with open(os.path.join(data.path, 'arrival-clusters.pkl')) as f:
def main(): logging.basicConfig( level=logging.DEBUG, format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") parser = argparse.ArgumentParser( "Case study of generating simple 1d sequences with RNN.", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( "mode", choices=["train", "plot"], help="The mode to run. Use `train` to train a new model" " and `plot` to plot a sequence generated by an" " existing one.") parser.add_argument("prefix", default="sine", help="The prefix for model, timing and state files") parser.add_argument("--input-noise", type=float, default=0.0, help="Adds Gaussian noise of given intensity to the " " training sequences.") parser.add_argument( "--function", default="lambda a, x: numpy.sin(a * x)", help="An analytical description of the sequence family to learn." " The arguments before the last one are considered parameters.") parser.add_argument("--steps", type=int, default=100, help="Number of steps to plot") parser.add_argument("--params", help="Parameter values for plotting") args = parser.parse_args() function = eval(args.function) num_params = len(inspect.getargspec(function).args) - 1 class Emitter(TrivialEmitter): @application def cost(self, readouts, outputs): """Compute MSE.""" return ((readouts - outputs)**2).sum(axis=readouts.ndim - 1) transition = GatedRecurrent(name="transition", activation=Tanh(), dim=10, weights_init=Orthogonal()) with_params = AddParameters(transition, num_params, "params", name="with_params") generator = SequenceGenerator(LinearReadout( readout_dim=1, source_names=["states"], emitter=Emitter(name="emitter"), name="readout"), with_params, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), name="generator") generator.allocate() logger.debug("Parameters:\n" + pprint.pformat(Selector(generator).get_params().keys())) if args.mode == "train": seed = 1 rng = numpy.random.RandomState(seed) batch_size = 10 generator.initialize() cost = Cost( generator.cost(tensor.tensor3('x'), params=tensor.matrix("params")).sum()) if args.input_noise: cost.apply_noise(cost.inputs, args.input_noise) gh_model = GroundhogModel(generator, cost) state = GroundhogState(args.prefix, batch_size, learning_rate=0.0001).as_dict() data = SeriesIterator(rng, function, 100, batch_size) trainer = SGD(gh_model, state, data) main_loop = MainLoop(data, None, None, gh_model, trainer, state, None) main_loop.load() main_loop.main() elif args.mode == "plot": load_params(generator, args.prefix + "model.npz") params = tensor.matrix("params") sample = theano.function([params], generator.generate(params=params, n_steps=args.steps, batch_size=1)) param_values = numpy.array(map(float, args.params.split()), dtype=floatX) states, outputs, _ = sample(param_values[None, :]) actual = outputs[:, 0, 0] desired = numpy.array( [function(*(list(param_values) + [T])) for T in range(args.steps)]) print("MSE: {}".format(((actual - desired)**2).sum())) pyplot.plot(numpy.hstack([actual[:, None], desired[:, None]])) pyplot.show() else: assert False
n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory with open(os.path.join(data.path, 'arrival-clusters.pkl')) as f: tgtcls = cPickle.load(f) dim_embeddings = [ ('origin_call', data.origin_call_train_size, 10), ('origin_stand', data.stands_size, 10), ('week_of_year', 52, 10), ('day_of_week', 7, 10), ('qhour_of_day', 24 * 4, 10), ('day_type', 3, 10), ('taxi_id', 448, 10), ] dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings) dim_hidden = [100, 100] dim_output = tgtcls.shape[0] embed_weights_init = IsotropicGaussian(0.01) mlp_weights_init = IsotropicGaussian(0.1) mlp_biases_init = Constant(0.01) step_rule = Momentum(learning_rate=0.01, momentum=0.9) batch_size = 200 valid_set = 'cuts/test_times_0' max_splits = 100
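# --- Worked example (not part of the original config): with the values above,
#   dim_input = n_begin_end_pts * 2 * 2 + sum of embedding output sizes
#             = 5 * 2 * 2 + 7 * 10 = 20 + 70 = 90
# i.e. 5 points at the start and 5 at the end of the known trajectory, each
# contributing two coordinates, plus one 10-dimensional embedding per
# categorical feature listed in dim_embeddings.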
def main(name, epochs, batch_size, learning_rate, attention, n_iter, enc_dim, dec_dim, z_dim): x_dim = 28 * 28 img_height, img_width = (28, 28) rnninits = { #'weights_init': Orthogonal(), 'weights_init': IsotropicGaussian(0.01), 'biases_init': Constant(0.), } inits = { #'weights_init': Orthogonal(), 'weights_init': IsotropicGaussian(0.01), 'biases_init': Constant(0.), } if attention != "": read_N, write_N = attention.split(',') read_N = int(read_N) write_N = int(write_N) read_dim = 2 * read_N**2 reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim, width=img_width, height=img_height, N=read_N, **inits) writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim, width=img_width, height=img_height, N=write_N, **inits) attention_tag = "r%d-w%d" % (read_N, write_N) else: read_dim = 2 * x_dim reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits) writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits) attention_tag = "full" #---------------------------------------------------------------------- # Learning rate def lr_tag(value): """ Convert a float into a short tag-usable string representation. E.g.: 0.1 -> 11 0.01 -> 12 0.001 -> 13 0.005 -> 53 """ exp = np.floor(np.log10(value)) leading = ("%e" % value)[0] return "%s%d" % (leading, -exp) lr_str = lr_tag(learning_rate) name = "%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % (name, attention_tag, n_iter, enc_dim, dec_dim, z_dim, lr_str) print("\nRunning experiment %s" % name) print(" learning rate: %5.3f" % learning_rate) print(" attention: %s" % attention) print(" n_iterations: %d" % n_iter) print(" encoder dimension: %d" % enc_dim) print(" z dimension: %d" % z_dim) print(" decoder dimension: %d" % dec_dim) print() #---------------------------------------------------------------------- encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits) decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits) encoder_mlp = MLP([Identity()], [(read_dim + dec_dim), 4 * enc_dim], name="MLP_enc", **inits) decoder_mlp = MLP([Identity()], [z_dim, 4 * dec_dim], name="MLP_dec", **inits) q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits) draw = DrawModel(n_iter, reader=reader, encoder_mlp=encoder_mlp, encoder_rnn=encoder_rnn, sampler=q_sampler, decoder_mlp=decoder_mlp, decoder_rnn=decoder_rnn, writer=writer) draw.initialize() #------------------------------------------------------------------------ x = tensor.matrix('features') #x_recons = 1. 
+ x x_recons, kl_terms = draw.reconstruct(x) #x_recons, _, _, _, _ = draw.silly(x, n_steps=10, batch_size=100) #x_recons = x_recons[-1,:,:] #samples = draw.sample(100) #x_recons = samples[-1, :, :] #x_recons = samples[-1, :, :] recons_term = BinaryCrossEntropy().apply(x, x_recons) recons_term.name = "recons_term" cost = recons_term + kl_terms.sum(axis=0).mean() cost.name = "nll_bound" #------------------------------------------------------------ cg = ComputationGraph([cost]) params = VariableFilter(roles=[PARAMETER])(cg.variables) algorithm = GradientDescent( cost=cost, params=params, step_rule=CompositeRule([ StepClipping(10.), Adam(learning_rate), ]) #step_rule=RMSProp(learning_rate), #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95) ) #algorithm.add_updates(scan_updates) #------------------------------------------------------------------------ # Setup monitors monitors = [cost] for t in range(n_iter): kl_term_t = kl_terms[t, :].mean() kl_term_t.name = "kl_term_%d" % t #x_recons_t = T.nnet.sigmoid(c[t,:,:]) #recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t) #recons_term_t = recons_term_t.mean() #recons_term_t.name = "recons_term_%d" % t monitors += [kl_term_t] train_monitors = monitors[:] train_monitors += [aggregation.mean(algorithm.total_gradient_norm)] train_monitors += [aggregation.mean(algorithm.total_step_norm)] # Live plotting... plot_channels = [ ["train_nll_bound", "test_nll_bound"], ["train_kl_term_%d" % t for t in range(n_iter)], #["train_recons_term_%d" % t for t in range(n_iter)], ["train_total_gradient_norm", "train_total_step_norm"] ] #------------------------------------------------------------ mnist_train = BinarizedMNIST("train", sources=['features']) mnist_valid = BinarizedMNIST("valid", sources=['features']) mnist_test = BinarizedMNIST("test", sources=['features']) train_stream = DataStream(mnist_train, iteration_scheme=SequentialScheme( mnist_train.num_examples, batch_size)) valid_stream = DataStream(mnist_valid, iteration_scheme=SequentialScheme( mnist_valid.num_examples, batch_size)) test_stream = DataStream(mnist_test, iteration_scheme=SequentialScheme( mnist_test.num_examples, batch_size)) main_loop = MainLoop( model=Model(cost), data_stream=train_stream, algorithm=algorithm, extensions=[ Timing(), FinishAfter(after_n_epochs=epochs), TrainingDataMonitoring(train_monitors, prefix="train", after_every_epoch=True), # DataStreamMonitoring( # monitors, # valid_stream, ## updates=scan_updates, # prefix="valid"), DataStreamMonitoring( monitors, test_stream, # updates=scan_updates, prefix="test"), Checkpoint(name + ".pkl", save_separately=['log', 'model']), #Dump(name), Plot(name, channels=plot_channels), ProgressBar(), Printing() ]) main_loop.run()
def main(config, tr_stream, dev_stream, source_vocab, target_vocab, use_bokeh=False): # add the tags from this function to the IMT datastream # prediction function signature # [target_suffix, source_mask, source, target_prefix_mask, target_prefix, target_suffix_mask] prediction_function = get_prediction_function(exp_config=config) tr_stream = Mapping( tr_stream, CallPredictionFunctionOnStream(prediction_function, [1, 0, 5, 4, 7, 6]), #tr_stream = Mapping(tr_stream, CallFunctionOnStream(prediction_function, [6, 1, 0, 5, 4, 7]), add_sources=('predictions', 'orig_readouts', 'prediction_tags')) # now datastream has 11 things import ipdb ipdb.set_trace() # WORKING: call prediction function twice to get new readouts on predictions instead of reference suffs # the only difference is the index of the suffix tr_stream = Mapping(tr_stream, CallPredictionFunctionOnStream(prediction_function, [1, 0, 5, 4, 7, 8]), add_sources=('dummy_predictions', 'readouts', 'dummy_prediction_tags')) import ipdb ipdb.set_trace() # Create the prediction confidence model # the first draft of this model uses the readout output (before the post-merge step) as the per-timestep state vector # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') # Note that the _names_ are changed from normal NMT # for IMT training, we use only the suffix as the reference target_sentence = tensor.lmatrix('target_suffix') target_sentence_mask = tensor.matrix('target_suffix_mask') target_prefix = tensor.lmatrix('target_prefix') target_prefix_mask = tensor.matrix('target_prefix_mask') # symbolic variable which tags each timestep as GOOD/BAD # Note: later this might be tags for a hypothesis i.e. from TER(p), right now the timesteps are actually determined by the reference # By zipping the confidence model output with the reference, we get the model's confidence that this reference word # will be predicted correctly prediction_tags = tensor.matrix('prediction_tags') readouts = tensor.tensor3('readouts') # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) decoder = NMTPrefixDecoder(config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2, loss_function='cross_entropy') # rename to match baseline NMT systems decoder.name = 'decoder' cost = decoder.confidence_cost( encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask, target_prefix, target_prefix_mask, readouts, prediction_tags) # WORKING: add l2 regularization logger.info('Creating computational graph') # working: implement cost for confidence model cg = ComputationGraph(cost) # INITIALIZATION logger.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() #cost_cg = ComputationGraph(cost) if config['l2_reg']: l2_reg_alpha = config['l2_reg_alpha'] model_weights = VariableFilter(roles=[WEIGHT])(cg.variables) for W in model_weights: cost = cost + (l2_reg_alpha * (W**2).sum()) # do we need to name the cost variable again? 
cost.name = 'cost' cg = ComputationGraph(cost) # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog # this is the probability of dropping out, so you probably want to make it <=0.5 logger.info('Applying dropout') dropout_inputs = [ x for x in cg.intermediary_variables if x.name in set([ 'confidence_model1_apply_output', 'confidence_model2_apply_output', 'confidence_model3_apply_output' ]) ] # if x.name == 'maxout_apply_output'] # if x.name == 'maxout_apply_output'] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # WORKING: implement confidence -- remove all params except output model cost_model = Model(cost) model_params = cost_model.get_parameter_dict() trainable_params = cg.parameters import ipdb ipdb.set_trace() print('trainable params') #params_to_remove = [model_params[k] for k in model_params.keys() if 'confidence' not in k] #for p in params_to_remove: # trainable_params.remove(p) # target_embeddings = model.get_parameter_dict()['/target_recurrent_lm_with_alignments/target_embeddings.W'] # trainable_params.remove(source_embeddings) # trainable_params.remove(target_embeddings) # END WORKING: implement confidence -- remove all params except output model # TODO: fixed dropout mask for recurrent params? # Print shapes # shapes = [param.get_value().shape for param in cg.parameters] # logger.info("Parameter shapes: ") # for shape, count in Counter(shapes).most_common(): # logger.info(' {:15}: {}'.format(shape, count)) # logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names # enc_dec_param_dict = merge(Selector(encoder).get_parameters(), # Selector(decoder).get_parameters()) # logger.info("Parameter names: ") # for name, value in enc_dec_param_dict.items(): # logger.info(' {:15}: {}'.format(value.get_value().shape, name)) # logger.info("Total number of parameters: {}" # .format(len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # create the training directory, and copy this config there if directory doesn't exist if not os.path.isdir(config['saveto']): os.makedirs(config['saveto']) shutil.copy(config['config_file'], config['saveto']) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), # TrainingDataMonitoring(trainable_params, after_batch=True), # Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] # WORKING: confidence prediction #monitor everything that could possibly be relevant # Set up the sampling graph for validation during training # Theano variables for the sampling graph # Note this also loads the model parameters sampling_vars = load_params_and_get_beam_search(config, encoder=encoder, decoder=decoder) beam_search, search_model, samples, sampling_input, sampling_prefix = sampling_vars #if config['hook_samples'] >= 1: # logger.info("Building sampler") # extensions.append( # Sampler(model=search_model, data_stream=tr_stream, # hook_samples=config['hook_samples'], # every_n_batches=config['sampling_freq'], # src_vocab=source_vocab, # trg_vocab=target_vocab, # src_vocab_size=config['src_vocab_size'])) # Add early stopping based on bleu #if config['bleu_script'] is not None: # logger.info("Building bleu validator") # extensions.append( # BleuValidator(sampling_input, sampling_prefix, samples=samples, config=config, # model=search_model, 
data_stream=dev_stream, # src_vocab=source_vocab, # trg_vocab=target_vocab, # normalize=config['normalized_bleu'], # every_n_batches=config['bleu_val_freq'])) # TODO: add first-word accuracy validation # TODO: add IMT meteor early stopping #if config.get('imt_f1_validation', None) is not None: # logger.info("Building imt F1 validator") # extensions.append( # IMT_F1_Validator(sampling_input, sampling_prefix, # samples=samples, # config=config, # model=search_model, data_stream=dev_stream, # src_vocab=source_vocab, # trg_vocab=target_vocab, # normalize=config['normalized_bleu'], # every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # TODO: hacking here: get the predictions of the confidence model using the `readouts` source of the data_stream # Note that the parameters of this model must be pretrained, otherwise this doesn't make sense # confidence_predictions = decoder.get_confidence(readouts) # confidence_prediction_model = Model(confidence_predictions) # # confidence_param_values = LoadNMT.load_parameter_values(config['confidence_saved_parameters'], brick_delimiter=None) # LoadNMT.set_model_parameters(confidence_prediction_model, confidence_param_values) # # confidence_prediction_func = confidence_prediction_model.get_theano_function() # import ipdb; ipdb.set_trace() # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( # Plot(config['model_save_directory'], channels=[['decoder_confidence_cost_cost']], Plot(config['model_save_directory'], channels=[['cost']], every_n_batches=10)) # Set up training algorithm logger.info("Initializing training algorithm") # WORKING: implement confidence model # if there is dropout or random noise, we need to use the output of the modified graph algorithm = GradientDescent( cost=cg.outputs[0], parameters=trainable_params, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ]), # eval(config['step_rule'])(), RemoveNotFinite()]), # step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]), on_unused_sources='warn') #if config['dropout'] < 1.0: # algorithm = GradientDescent( # cost=cg.outputs[0], parameters=trainable_params, # step_rule=CompositeRule([StepClipping(config['step_clipping']), # eval(config['step_rule'])(), RemoveNotFinite()]), # # step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]), # on_unused_sources='warn' # ) #else: # algorithm = GradientDescent( # cost=cost, parameters=cg.parameters, # step_rule=CompositeRule([StepClipping(config['step_clipping']), # eval(config['step_rule'])()]), # on_unused_sources='warn' # ) # END WORKING: implement confidence model import ipdb ipdb.set_trace() # enrich the logged information extensions.append(Timing(every_n_batches=100)) # WORKING: debugging confidence # get theano function from model # WORKING: implement word-level confidence cost # @application(inputs=['representation', 'source_sentence_mask', # 'target_sentence_mask', 'target_sentence', 'target_prefix_mask', 'target_prefix'], # outputs=['cost']) # def confidence_cost(self, representation, source_sentence_mask, # target_sentence, target_sentence_mask, target_prefix, target_prefix_mask): logger.info('Creating theano variables') # WORKING: 26.9.16 -- get confidence outputs directly from (source, prefix, suffix) inputs # This is equivalent to forced alignment --> confidence scores # Note: but this section should probably be in "evaluate" mode, not here in "train" # source_sentence = 
tensor.lmatrix('source') # source_sentence_mask = tensor.matrix('source_mask') # Note that the _names_ are changed from normal NMT # for IMT training, we use only the suffix as the reference #target_sentence = tensor.lmatrix('target_suffix') #target_sentence_mask = tensor.matrix('target_suffix_mask') # TODO: change names back to *_suffix, there is currently a theano function name error # TODO: in the GradientDescent Algorithm #target_prefix = tensor.lmatrix('target_prefix') #target_prefix_mask = tensor.matrix('target_prefix_mask') # confidence_output = decoder.confidence_cost( # encoder.apply(source_sentence, source_sentence_mask), # source_sentence_mask, target_sentence, target_sentence_mask, # target_prefix, target_prefix_mask) # confidence_model = Model(confidence_output) # t_cost_func = confidence_model.get_theano_function() # inputs # [source_mask, source, target_prefix_mask, target_prefix, target_suffix_mask, target_suffix] #import ipdb;ipdb.set_trace() # get the right args from the datastream # TODO: just print source, prefix, suffix, prediction, correct to new files -- this makes sure everything is aligned # OUTPUT_DIR = '/media/1tb_drive/imt_models/word_prediction_accuracy_experiments/en-de/exp_1' # for the_file in os.listdir(OUTPUT_DIR): # file_path = os.path.join(OUTPUT_DIR, the_file) # try: # if os.path.isfile(file_path): # os.unlink(file_path) # except Exception as e: # print(e) # # def write_file_truncate_mask(filename, data, mask, mode='a'): # ''' data is list of list ''' # # assert len(data) == len(mask) # with codecs.open(filename, mode, encoding='utf8') as out: # for l, m in zip(data, mask): # output = u' '.join(l[:int(m.sum())]) + u'\n' # out.write(output) # logger.info('Wrote file: {}'.format(filename)) # # # target_ivocab = {k:v.decode('utf8') for v,k in target_vocab.items()} # source_ivocab = {k:v.decode('utf8') for v,k in source_vocab.items()} # import ipdb; ipdb.set_trace() # tag_ivocab = {1: 'True', 0: 'False'} # # test_iter = tr_stream.get_epoch_iterator() # it = 0 # for t_source, t_source_mask, t_target, t_target_mask, t_target_prefix, t_target_prefix_mask, t_target_suffix, t_target_suffix_mask in test_iter: # if it <= 1000: # it += 1 # t_cost = t_cost_func(t_source_mask, t_source, t_target_prefix_mask, t_target_prefix, t_target_suffix_mask, t_target_suffix) # readouts = t_cost[0] # preds = readouts.argmax(axis=2) # correct = preds.T == t_target_suffix # # # source_output = os.path.join(OUTPUT_DIR,'sources.en') # prefix_output = os.path.join(OUTPUT_DIR,'prefixes.de') # suffix_output = os.path.join(OUTPUT_DIR,'suffixes.de') # prediction_output = os.path.join(OUTPUT_DIR,'predictions.de') # correct_output = os.path.join(OUTPUT_DIR,'prefix_word_prediction_acc.out') # # source_text = [[source_ivocab[w] for w in s] for s in t_source] # prefix_text = [[target_ivocab[w] for w in s] for s in t_target_prefix] # suffix_text = [[target_ivocab[w] for w in s] for s in t_target_suffix] # pred_text = [[target_ivocab[w] for w in s] for s in preds.T] # correct_text = [[tag_ivocab[w] for w in s] for s in correct] # # # for triple in zip([source_output, prefix_output, suffix_output, prediction_output, correct_output], # [source_text, prefix_text, suffix_text, pred_text, correct_text], # [t_source_mask, t_target_prefix_mask, t_target_suffix_mask, t_target_suffix_mask, t_target_suffix_mask]): # write_file_truncate_mask(*triple) # else: # break # # import ipdb; ipdb.set_trace() #t_cost = t_cost_func(t_source, t_target_prefix) #t_cost = t_cost_func(t_target_suffix, t_source_mask, 
t_source, t_target_prefix_mask, t_target_prefix, t_target_suffix_mask) #t_cost = t_cost_func(t_source_mask, t_source, t_target_prefix_mask, t_target_prefix, t_target_suffix_mask, t_target_suffix) # return confidence_cost, flat_y, confidence_logits, readouts #predictions = t_cost[0].argmax(axis=2) # TODO: next step -- print gradients and weights during training find out where nan is coming from # TODO: look at the gradient of this function with respect to parameters? -- see here: http://deeplearning.net/software/theano/tutorial/gradients.html # TODO: function which adds right/wrong tags for model predictions to the datastream. In this case we can learn a simple linear model as a baseline # TODO: print predictions for each batch for each timestep to file -- _dont shuffle_ so that we get the right order # import ipdb;ipdb.set_trace() # from blocks reverse_words example # observables = [ # cost, min_energy, max_energy, mean_activation, # batch_size, max_length, cost_per_character, # algorithm.total_step_norm, algorithm.total_gradient_norm] # for name, parameter in trainable_params.items(): # observables.append(parameter.norm(2).copy(name + "_norm")) # observables.append(algorithm.gradients[parameter].norm(2).copy( # name + "_grad_norm")) for i, (k, v) in enumerate(algorithm.updates): v.name = k.name + '_{}'.format(i) aux_vars = [v for v in cg.auxiliary_variables[-3:]] # import ipdb; ipdb.set_trace() extensions.extend([ TrainingDataMonitoring([cost], after_batch=True), # TrainingDataMonitoring([v for k,v in algorithm.updates[:2]], after_batch=True), # TrainingDataMonitoring(aux_vars, after_batch=True), # TrainingDataMonitoring(trainable_params, after_batch=True), Printing(after_batch=True) ]) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions) import ipdb ipdb.set_trace() # Train! main_loop.run()
def create_model_bricks(image_size, depth): # original celebA64 was depth=3 (went to bach_norm6) layers = [] if(depth > 0): layers = layers + [ Convolutional( filter_size=(4, 4), num_filters=32, name='conv1'), SpatialBatchNormalization(name='batch_norm1'), Rectifier(), Convolutional( filter_size=(3, 3), step=(2, 2), num_filters=32, name='conv2'), SpatialBatchNormalization(name='batch_norm2'), Rectifier(), ] if(depth > 1): layers = layers + [ Convolutional( filter_size=(4, 4), num_filters=64, name='conv3'), SpatialBatchNormalization(name='batch_norm3'), Rectifier(), Convolutional( filter_size=(3, 3), step=(2, 2), num_filters=64, name='conv4'), SpatialBatchNormalization(name='batch_norm4'), Rectifier(), ] if(depth > 2): layers = layers + [ Convolutional( filter_size=(3, 3), num_filters=128, name='conv5'), SpatialBatchNormalization(name='batch_norm5'), Rectifier(), Convolutional( filter_size=(3, 3), step=(2, 2), num_filters=128, name='conv6'), SpatialBatchNormalization(name='batch_norm6'), Rectifier(), ] if(depth > 3): layers = layers + [ Convolutional( filter_size=(3, 3), num_filters=256, name='conv7'), SpatialBatchNormalization(name='batch_norm7'), Rectifier(), Convolutional( filter_size=(3, 3), step=(2, 2), num_filters=256, name='conv8'), SpatialBatchNormalization(name='batch_norm8'), Rectifier(), ] if(depth > 4): layers = layers + [ Convolutional( filter_size=(3, 3), num_filters=512, name='conv9'), SpatialBatchNormalization(name='batch_norm9'), Rectifier(), Convolutional( filter_size=(3, 3), step=(2, 2), num_filters=512, name='conv10'), SpatialBatchNormalization(name='batch_norm10'), Rectifier(), ] if(depth > 5): layers = layers + [ Convolutional( filter_size=(3, 3), num_filters=512, name='conv11'), SpatialBatchNormalization(name='batch_norm11'), Rectifier(), Convolutional( filter_size=(3, 3), step=(2, 2), num_filters=512, name='conv12'), SpatialBatchNormalization(name='batch_norm12'), Rectifier(), ] if(depth > 6): layers = layers + [ Convolutional( filter_size=(3, 3), num_filters=512, name='conv13'), SpatialBatchNormalization(name='batch_norm13'), Rectifier(), Convolutional( filter_size=(3, 3), step=(2, 2), num_filters=512, name='conv14'), SpatialBatchNormalization(name='batch_norm14'), Rectifier(), ] if(depth > 7): layers = layers + [ Convolutional( filter_size=(3, 3), num_filters=512, name='conv15'), SpatialBatchNormalization(name='batch_norm15'), Rectifier(), Convolutional( filter_size=(3, 3), step=(2, 2), num_filters=512, name='conv16'), SpatialBatchNormalization(name='batch_norm16'), Rectifier(), ] print("creating model of depth {} with {} layers".format(depth, len(layers))) convnet = ConvolutionalSequence( layers=layers, num_channels=3, image_size=(image_size, image_size), use_bias=False, weights_init=IsotropicGaussian(0.033), biases_init=Constant(0), name='convnet') convnet.initialize() mlp = BatchNormalizedMLP( activations=[Rectifier(), Logistic()], dims=[numpy.prod(convnet.get_dim('output')), 1000, 64], weights_init=IsotropicGaussian(0.033), biases_init=Constant(0), name='mlp') mlp.initialize() return convnet, mlp, len(layers)
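# --- Worked example (not part of the original function): each `depth > k` block
# appends 6 entries to `layers` (two Convolutional bricks, two
# SpatialBatchNormalization bricks, two Rectifiers), so for instance
#   create_model_bricks(image_size=64, depth=3)
# builds a ConvolutionalSequence with 3 * 6 = 18 layer entries (ending at
# batch_norm6, as the comment above notes) before the final MLP.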
def check_gaussian(rng, mean, std, shape):
    weights = IsotropicGaussian(std, mean).generate(rng, shape)
    assert weights.shape == shape
    assert weights.dtype == theano.config.floatX
    assert_allclose(weights.mean(), mean, atol=1e-2)
    assert_allclose(weights.std(), std, atol=1e-2)
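A possible driver for the helper above; the shapes, means, and standard deviations are illustrative. With samples this large the empirical mean and standard deviation comfortably fall within the 1e-2 absolute tolerance used by the checks.

    def test_gaussian():
        rng = numpy.random.RandomState(1234)
        # large samples keep the sampling error well below the 1e-2 tolerance
        check_gaussian(rng, mean=0.0, std=0.01, shape=(500, 600))
        check_gaussian(rng, mean=1.0, std=0.1, shape=(500, 600))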
def test_integer_sequence_generator():
    """Test a sequence generator with integer outputs.

    Such sequence generators can be used to e.g. model language.

    """
    floatX = theano.config.floatX
    rng = numpy.random.RandomState(1234)

    readout_dim = 5
    feedback_dim = 3
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = GatedRecurrent(dim=dim, activation=Tanh(),
                                weights_init=Orthogonal())
    generator = SequenceGenerator(
        Readout(readout_dim=readout_dim, source_names=["states"],
                emitter=SoftmaxEmitter(theano_seed=1234),
                feedback_brick=LookupFeedback(readout_dim, feedback_dim)),
        transition,
        weights_init=IsotropicGaussian(0.1), biases_init=Constant(0),
        seed=1234)
    generator.initialize()

    # Test 'cost_matrix' method
    y = tensor.lmatrix('y')
    mask = tensor.matrix('mask')
    costs = generator.cost_matrix(y, mask)
    assert costs.ndim == 2
    costs_fun = theano.function([y, mask], [costs])
    y_test = rng.randint(readout_dim, size=(n_steps, batch_size))
    m_test = numpy.ones((n_steps, batch_size), dtype=floatX)
    costs_val = costs_fun(y_test, m_test)[0]
    assert costs_val.shape == (n_steps, batch_size)
    assert_allclose(costs_val.sum(), 482.827, rtol=1e-5)

    # Test 'cost' method
    cost = generator.cost(y, mask)
    assert cost.ndim == 0
    cost_val = theano.function([y, mask], [cost])(y_test, m_test)
    assert_allclose(cost_val, 16.0942, rtol=1e-5)

    # Test 'AUXILIARY' variable 'per_sequence_element' in 'cost' method
    cg = ComputationGraph([cost])
    var_filter = VariableFilter(roles=[AUXILIARY])
    aux_var_name = '_'.join([generator.name, generator.cost.name,
                             'per_sequence_element'])
    cost_per_el = [el for el in var_filter(cg.variables)
                   if el.name == aux_var_name][0]
    assert cost_per_el.ndim == 0
    cost_per_el_val = theano.function([y, mask], [cost_per_el])(y_test, m_test)
    assert_allclose(cost_per_el_val, 1.60942, rtol=1e-5)

    # Test 'generate' method
    states, outputs, costs = generator.generate(
        iterate=True, batch_size=batch_size, n_steps=n_steps)
    cg = ComputationGraph(states + outputs + costs)
    states_val, outputs_val, costs_val = theano.function(
        [], [states, outputs, costs], updates=cg.updates)()
    assert states_val.shape == (n_steps, batch_size, dim)
    assert outputs_val.shape == (n_steps, batch_size)
    assert outputs_val.dtype == 'int64'
    assert costs_val.shape == (n_steps, batch_size)
    assert_allclose(states_val.sum(), -17.854, rtol=1e-5)
    assert_allclose(costs_val.sum(), 482.868, rtol=1e-5)
    assert outputs_val.sum() == 629

    # Test that the cost is agnostic to the mask
    cost1 = costs_fun([[1], [2]], [[1], [1]])[0]
    cost2 = costs_fun([[3, 1], [4, 2], [2, 0]],
                      [[1, 1], [1, 1], [1, 0]])[0]
    assert_allclose(cost1.sum(), cost2[:, 1].sum(), rtol=1e-5)
def __init__(self, std=1, mean=0):
    self.gaussian_init = IsotropicGaussian(std=std, mean=mean)
    self.identity = Identity()
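A hedged sketch of how an NdarrayInitialization subclass with this constructor is typically exercised in blocks; the class name, matrix shape, and seed are assumptions for illustration, and it presumes the class also defines a working generate() that tiles the matrix from its Gaussian and identity sub-initializers.

    init = CustomLSTMWeights(std=0.02, mean=0.0)   # class name assumed
    W = shared_floatx_zeros((4 * 128, 4 * 128))    # square matrix of 4 x 4 sub-blocks
    rng = numpy.random.RandomState(1)
    init.initialize(W, rng)                        # initialize() calls generate() under the hood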
def test_with_attention():
    """Test a sequence generator with continuous outputs and attention."""
    rng = numpy.random.RandomState(1234)

    inp_dim = 2
    inp_len = 10
    attended_dim = 3
    attended_len = 11
    batch_size = 4
    n_steps = 30

    # For values
    def rand(size):
        return rng.uniform(size=size).astype(theano.config.floatX)

    # For masks
    def generate_mask(length, batch_size):
        mask = numpy.ones((length, batch_size), dtype=theano.config.floatX)
        # To make it look like real data
        for i in range(batch_size):
            mask[1 + rng.randint(0, length - 1):, i] = 0.0
        return mask

    output_vals = rand((inp_len, batch_size, inp_dim))
    output_mask_vals = generate_mask(inp_len, batch_size)
    attended_vals = rand((attended_len, batch_size, attended_dim))
    attended_mask_vals = generate_mask(attended_len, batch_size)

    transition = TestTransition(dim=inp_dim, attended_dim=attended_dim,
                                activation=Identity())
    attention = SequenceContentAttention(state_names=transition.apply.states,
                                         match_dim=inp_dim)
    generator = SequenceGenerator(
        Readout(readout_dim=inp_dim,
                source_names=[transition.apply.states[0],
                              attention.take_glimpses.outputs[0]],
                emitter=TestEmitter()),
        transition=transition,
        attention=attention,
        weights_init=IsotropicGaussian(0.1), biases_init=Constant(0),
        add_contexts=False, seed=1234)
    generator.initialize()

    # Test 'cost_matrix' method
    attended = tensor.tensor3("attended")
    attended_mask = tensor.matrix("attended_mask")
    outputs = tensor.tensor3('outputs')
    mask = tensor.matrix('mask')
    costs = generator.cost_matrix(outputs, mask,
                                  attended=attended,
                                  attended_mask=attended_mask)
    costs_vals = costs.eval({outputs: output_vals,
                             mask: output_mask_vals,
                             attended: attended_vals,
                             attended_mask: attended_mask_vals})
    assert costs_vals.shape == (inp_len, batch_size)
    assert_allclose(costs_vals.sum(), 13.5042, rtol=1e-5)

    # Test 'generate' method
    results = generator.generate(n_steps=n_steps,
                                 batch_size=attended.shape[1],
                                 attended=attended,
                                 attended_mask=attended_mask)
    assert len(results) == 5
    states_vals, outputs_vals, glimpses_vals, weights_vals, costs_vals = (
        theano.function([attended, attended_mask], results)(
            attended_vals, attended_mask_vals))
    assert states_vals.shape == (n_steps, batch_size, inp_dim)
    assert states_vals.shape == outputs_vals.shape
    assert glimpses_vals.shape == (n_steps, batch_size, attended_dim)
    assert weights_vals.shape == (n_steps, batch_size, attended_len)
    assert costs_vals.shape == (n_steps, batch_size)
    assert_allclose(states_vals.sum(), 23.4172, rtol=1e-5)
    # There is no generation cost in this case, since generation is
    # deterministic
    assert_allclose(costs_vals.sum(), 0.0, rtol=1e-5)
    assert_allclose(weights_vals.sum(), 120.0, rtol=1e-5)
    assert_allclose(glimpses_vals.sum(), 199.2402, rtol=1e-5)
    assert_allclose(outputs_vals.sum(), -11.6008, rtol=1e-5)
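A hedged follow-up check that could be appended inside the test above: the 120.0 total asserted for weights_vals is just n_steps * batch_size (30 * 4), because the content attention's weights are a (masked and renormalized) softmax over the attended positions and therefore sum to one at every step for every sequence.

    # Sketch only: every step's attention weights should sum to 1 per sequence
    step_totals = weights_vals.sum(axis=-1)   # shape (n_steps, batch_size)
    assert_allclose(step_totals,
                    numpy.ones((n_steps, batch_size), dtype=theano.config.floatX),
                    rtol=1e-5)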