def main(mode, save_path, num_batches, data_path=None): # Experiment configuration dimension = 100 readout_dimension = len(char2code) # Build bricks encoder = Bidirectional(SimpleRecurrent(dim=dimension, activation=Tanh()), weights_init=Orthogonal()) fork = Fork( [name for name in encoder.prototype.apply.sequences if name != 'mask'], weights_init=IsotropicGaussian(0.1), biases_init=Constant(0)) fork.input_dim = dimension fork.output_dims = {name: dimension for name in fork.input_names} lookup = LookupTable(readout_dimension, dimension, weights_init=IsotropicGaussian(0.1)) transition = SimpleRecurrent(activation=Tanh(), dim=dimension, name="transition") attention = SequenceContentAttention(state_names=transition.apply.states, sequence_dim=2 * dimension, match_dim=dimension, name="attention") readout = LinearReadout(readout_dim=readout_dimension, source_names=["states"], emitter=SoftmaxEmitter(name="emitter"), feedbacker=LookupFeedback(readout_dimension, dimension), name="readout") generator = SequenceGenerator(readout=readout, transition=transition, attention=attention, weights_init=IsotropicGaussian(0.1), biases_init=Constant(0), name="generator") generator.push_initialization_config() transition.weights_init = Orthogonal() if mode == "train": # Data processing pipeline dataset_options = dict(dictionary=char2code, level="character", preprocess=_lower) if data_path: dataset = TextFile(data_path, **dataset_options) else: dataset = OneBillionWord("training", [99], **dataset_options) data_stream = DataStreamMapping( mapping=_transpose, data_stream=PaddingDataStream( BatchDataStream( iteration_scheme=ConstantScheme(10), data_stream=DataStreamMapping( mapping=reverse_words, add_sources=("targets", ), data_stream=DataStreamFilter( predicate=_filter_long, data_stream=dataset.get_default_stream()))))) # Build the cost computation graph chars = tensor.lmatrix("features") chars_mask = tensor.matrix("features_mask") targets = tensor.lmatrix("targets") targets_mask = tensor.matrix("targets_mask") batch_cost = generator.cost( targets, targets_mask, attended=encoder.apply(**dict_union(fork.apply( lookup.lookup(chars), return_dict=True), mask=chars_mask)), attended_mask=chars_mask).sum() batch_size = named_copy(chars.shape[1], "batch_size") cost = aggregation.mean(batch_cost, batch_size) cost.name = "sequence_log_likelihood" logger.info("Cost graph is built") # Give an idea of what's going on model = Model(cost) params = model.get_params() logger.info("Parameters:\n" + pprint.pformat([(key, value.get_value().shape) for key, value in params.items()], width=120)) # Initialize parameters for brick in model.get_top_bricks(): brick.initialize() # Fetch variables useful for debugging max_length = named_copy(chars.shape[0], "max_length") cost_per_character = named_copy( aggregation.mean(batch_cost, batch_size * max_length), "character_log_likelihood") cg = ComputationGraph(cost) (energies, ) = VariableFilter(application=readout.readout, name="output")(cg.variables) min_energy = named_copy(energies.min(), "min_energy") max_energy = named_copy(energies.max(), "max_energy") (activations, ) = VariableFilter( application=generator.transition.apply, name="states")(cg.variables) mean_activation = named_copy( abs(activations).mean(), "mean_activation") # Define the training algorithm. algorithm = GradientDescent(cost=cost, step_rule=CompositeRule( [StepClipping(10.0), Scale(0.01)])) # More variables for debugging observables = [ cost, min_energy, max_energy, mean_activation, batch_size, max_length, cost_per_character, algorithm.total_step_norm, algorithm.total_gradient_norm ] for name, param in params.items(): observables.append(named_copy(param.norm(2), name + "_norm")) observables.append( named_copy(algorithm.gradients[param].norm(2), name + "_grad_norm")) # Construct the main loop and start training! average_monitoring = TrainingDataMonitoring(observables, prefix="average", every_n_batches=10) main_loop = MainLoop( model=model, data_stream=data_stream, algorithm=algorithm, extensions=[ Timing(), TrainingDataMonitoring(observables, after_every_batch=True), average_monitoring, FinishAfter(after_n_batches=num_batches).add_condition( "after_batch", _is_nan), Plot(os.path.basename(save_path), [[average_monitoring.record_name(cost)], [average_monitoring.record_name(cost_per_character)]], every_n_batches=10), SerializeMainLoop(save_path, every_n_batches=500, save_separately=["model", "log"]), Printing(every_n_batches=1) ]) main_loop.run() elif mode == "test": logger.info("Model is loaded") chars = tensor.lmatrix("features") generated = generator.generate( n_steps=3 * chars.shape[0], batch_size=chars.shape[1], attended=encoder.apply(**dict_union( fork.apply(lookup.lookup(chars), return_dict=True))), attended_mask=tensor.ones(chars.shape)) model = Model(generated) model.set_param_values(load_parameter_values(save_path)) sample_function = model.get_theano_function() logging.info("Sampling function is compiled") while True: # Python 2-3 compatibility line = input("Enter a sentence\n") batch_size = int(input("Enter a number of samples\n")) encoded_input = [ char2code.get(char, char2code["<UNK>"]) for char in line.lower().strip() ] encoded_input = ([char2code['<S>']] + encoded_input + [char2code['</S>']]) print("Encoder input:", encoded_input) target = reverse_words((encoded_input, ))[0] print("Target: ", target) states, samples, glimpses, weights, costs = sample_function( numpy.repeat(numpy.array(encoded_input)[:, None], batch_size, axis=1)) messages = [] for i in range(samples.shape[1]): sample = list(samples[:, i]) try: true_length = sample.index(char2code['</S>']) + 1 except ValueError: true_length = len(sample) sample = sample[:true_length] cost = costs[:true_length, i].sum() message = "({})".format(cost) message += "".join(code2char[code] for code in sample) if sample == target: message += " CORRECT!" messages.append((cost, message)) messages.sort(key=operator.itemgetter(0), reverse=True) for _, message in messages: print(message)
def main(model, cost, config, tr_stream, dev_stream, use_bokeh=False): # Set the parameters from a trained models (.npz file) logger.info("Loading parameters from model: {}".format( exp_config['saved_parameters'])) # Note the brick delimeter='-' is here for legacy reasons because blocks changed the serialization API param_values = LoadNMT.load_parameter_values( exp_config['saved_parameters'], brick_delimiter=exp_config.get('brick_delimiter', None)) LoadNMT.set_model_parameters(model, param_values) logger.info('Creating computational graph') cg = ComputationGraph(cost) # GRAPH TRANSFORMATIONS FOR BETTER TRAINING if config.get('l2_regularization', False) is True: l2_reg_alpha = config['l2_regularization_alpha'] logger.info( 'Applying l2 regularization with alpha={}'.format(l2_reg_alpha)) model_weights = VariableFilter(roles=[WEIGHT])(cg.variables) for W in model_weights: cost = cost + (l2_reg_alpha * (W**2).sum()) # why do we need to rename the cost variable? Where did the original name come from? cost.name = 'decoder_cost_cost' cg = ComputationGraph(cost) # apply dropout for regularization # Note dropout variables are hard-coded here if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog # this is the probability of dropping out, so you probably want to make it <=0.5 logger.info('Applying dropout') dropout_inputs = [ x for x in cg.intermediary_variables if x.name == 'maxout_apply_output' ] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # create the training directory, and copy this config there if directory doesn't exist if not os.path.isdir(config['saveto']): os.makedirs(config['saveto']) # TODO: mv the actual config file once we switch to .yaml for min-risk # shutil.copy(config['config_file'], config['saveto']) # shutil.copy(config['config_file'], config['saveto']) with codecs.open(os.path.join(config['saveto'], 'config.yaml'), 'w', encoding='utf8') as yaml_out: yaml_out.write(yaml.dump(config)) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] # Set up beam search and sampling computation graphs if necessary # TODO: change the if here if config['hook_samples'] >= 1 or config['bleu_script'] is not None: logger.info("Building sampling model") sampling_representation = train_encoder.apply( theano_sampling_input, tensor.ones(theano_sampling_input.shape)) # TODO: the generated output actually contains several more values, ipdb to see what they are generated = train_decoder.generate(theano_sampling_input, sampling_representation) search_model = Model(generated) _, samples = VariableFilter( bricks=[train_decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs # Add sampling -- TODO: sampling is broken for min-risk #if config['hook_samples'] >= 1: # logger.info("Building sampler") # extensions.append( # Sampler(model=search_model, data_stream=tr_stream, # hook_samples=config['hook_samples'], # every_n_batches=config['sampling_freq'], # src_vocab_size=config['src_vocab_size'])) # Add early stopping based on bleu if config.get('bleu_script', None) is not None: logger.info("Building bleu validator") extensions.append( BleuValidator(theano_sampling_input, samples=samples, config=config, model=search_model, data_stream=dev_stream, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) # Add early stopping based on bleu if config.get('meteor_directory', None) is not None: logger.info("Building meteor validator") extensions.append( MeteorValidator(theano_sampling_input, samples=samples, config=config, model=search_model, data_stream=dev_stream, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( Plot(config['model_save_directory'], channels=[['decoder_cost_cost'], ['validation_set_meteor_score']], every_n_batches=10)) # Set up training algorithm logger.info("Initializing training algorithm") # TODO: reenable dropout and add L2 regularization as in the main config # if there is l2_regularization, dropout or random noise, we need to use the output of the modified graph if config['dropout'] < 1.0: algorithm = GradientDescent(cost=cg.outputs[0], parameters=cg.parameters, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ]), on_unused_sources='warn') else: algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ]), on_unused_sources='warn') algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ], ), on_unused_sources='warn') # enrich the logged information extensions.append(Timing(every_n_batches=100)) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions) # Train! main_loop.run()
def main(save_to, num_epochs, feature_maps=None, mlp_hiddens=None, conv_sizes=None, pool_sizes=None, batch_size=500, num_batches=None): if feature_maps is None: feature_maps = [20, 50] if mlp_hiddens is None: mlp_hiddens = [500] if conv_sizes is None: conv_sizes = [5, 5] if pool_sizes is None: pool_sizes = [2, 2] image_size = (28, 28) output_size = 10 # Use ReLUs everywhere and softmax for the final prediction conv_activations = [Rectifier() for _ in feature_maps] mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()] convnet = LeNet(conv_activations, 1, image_size, filter_sizes=zip(conv_sizes, conv_sizes), feature_maps=feature_maps, pooling_sizes=zip(pool_sizes, pool_sizes), top_mlp_activations=mlp_activations, top_mlp_dims=mlp_hiddens + [output_size], border_mode='full', weights_init=Uniform(width=.2), biases_init=Constant(0)) # We push initialization config to set different initialization schemes # for convolutional layers. convnet.push_initialization_config() convnet.layers[0].weights_init = Uniform(width=.2) convnet.layers[1].weights_init = Uniform(width=.09) convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08) convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11) convnet.initialize() logging.info("Input dim: {} {} {}".format( *convnet.children[0].get_dim('input_'))) for i, layer in enumerate(convnet.layers): if isinstance(layer, Activation): logging.info("Layer {} ({})".format( i, layer.__class__.__name__)) else: logging.info("Layer {} ({}) dim: {} {} {}".format( i, layer.__class__.__name__, *layer.get_dim('output'))) x = tensor.tensor4('features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet probs = convnet.apply(x) cost = (CategoricalCrossEntropy().apply(y.flatten(), probs) .copy(name='cost')) error_rate = (MisclassificationRate().apply(y.flatten(), probs) .copy(name='error_rate')) cg = ComputationGraph([cost, error_rate]) mnist_train = MNIST(("train",)) mnist_train_stream = DataStream.default_stream( mnist_train, iteration_scheme=ShuffledScheme( mnist_train.num_examples, batch_size)) mnist_test = MNIST(("test",)) mnist_test_stream = DataStream.default_stream( mnist_test, iteration_scheme=ShuffledScheme( mnist_test.num_examples, batch_size)) # Train with simple SGD algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=Scale(learning_rate=0.1)) # `Timing` extension reports time for reading data, aggregating a batch # and monitoring; # `ProgressBar` displays a nice progress bar during training. extensions = [Timing(), FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches), DataStreamMonitoring( [cost, error_rate], mnist_test_stream, prefix="test"), TrainingDataMonitoring( [cost, error_rate, aggregation.mean(algorithm.total_gradient_norm)], prefix="train", after_epoch=True), Checkpoint(save_to), ProgressBar(), Printing()] model = Model(cost) main_loop = MainLoop( algorithm, mnist_train_stream, model=model, extensions=extensions) main_loop.run()
def main(mode, config, use_bokeh=False): # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) decoder = Decoder(config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2) if mode == "train": # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') sampling_input = tensor.lmatrix('input') # Get training and development set streams tr_stream = get_tr_stream(**config) dev_stream = get_dev_stream(**config) # Get cost of the model cost = decoder.cost( encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask) logger.info('Creating computational graph') cg = ComputationGraph(cost) # Initialize model logger.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog logger.info('Applying dropout') dropout_inputs = [ x for x in cg.intermediary_variables if x.name == 'maxout_apply_output' ] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # Apply weight noise for regularization if config['weight_noise_ff'] > 0.0: logger.info('Applying weight noise to ff layers') enc_params = Selector(encoder.lookup).get_params().values() enc_params += Selector(encoder.fwd_fork).get_params().values() enc_params += Selector(encoder.back_fork).get_params().values() dec_params = Selector( decoder.sequence_generator.readout).get_params().values() dec_params += Selector( decoder.sequence_generator.fork).get_params().values() dec_params += Selector(decoder.state_init).get_params().values() cg = apply_noise(cg, enc_params + dec_params, config['weight_noise_ff']) # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge( Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.items(): logger.info(' {:15}: {}'.format(value.get_value().shape, name)) logger.info("Total number of parameters: {}".format( len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] # Set up beam search and sampling computation graphs if necessary if config['hook_samples'] >= 1 or config['bleu_script'] is not None: logger.info("Building sampling model") sampling_representation = encoder.apply( sampling_input, tensor.ones(sampling_input.shape)) generated = decoder.generate(sampling_input, sampling_representation) search_model = Model(generated) _, samples = VariableFilter(bricks=[decoder.sequence_generator], name="outputs")(ComputationGraph( generated[1])) # Add sampling if config['hook_samples'] >= 1: logger.info("Building sampler") extensions.append( Sampler(model=search_model, data_stream=tr_stream, hook_samples=config['hook_samples'], every_n_batches=config['sampling_freq'], src_vocab_size=config['src_vocab_size'])) # Add early stopping based on bleu if config['bleu_script'] is not None: logger.info("Building bleu validator") extensions.append( BleuValidator(sampling_input, samples=samples, config=config, model=search_model, data_stream=dev_stream, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( Plot('Cs-En', channels=[['decoder_cost_cost']], after_batch=True)) # Set up training algorithm logger.info("Initializing training algorithm") algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ])) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions) # Train! main_loop.run() elif mode == 'translate': # Create Theano variables logger.info('Creating theano variables') sampling_input = tensor.lmatrix('source') # Get test set stream test_stream = get_dev_stream(config['test_set'], config['src_vocab'], config['src_vocab_size'], config['unk_id']) ftrans = open(config['test_set'] + '.trans.out', 'w') # Helper utilities sutils = SamplingBase() unk_idx = config['unk_id'] src_eos_idx = config['src_vocab_size'] - 1 trg_eos_idx = config['trg_vocab_size'] - 1 # Get beam search logger.info("Building sampling model") sampling_representation = encoder.apply( sampling_input, tensor.ones(sampling_input.shape)) generated = decoder.generate(sampling_input, sampling_representation) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs beam_search = BeamSearch(samples=samples) logger.info("Loading the model..") model = Model(generated) loader = LoadNMT(config['saveto']) loader.set_model_parameters(model, loader.load_parameters()) # Get target vocabulary trg_vocab = _ensure_special_tokens(pickle.load( open(config['trg_vocab'])), bos_idx=0, eos_idx=trg_eos_idx, unk_idx=unk_idx) trg_ivocab = {v: k for k, v in trg_vocab.items()} logger.info("Started translation: ") total_cost = 0.0 for i, line in enumerate(test_stream.get_epoch_iterator()): seq = sutils._oov_to_unk(line[0], config['src_vocab_size'], unk_idx) input_ = numpy.tile(seq, (config['beam_size'], 1)) # draw sample, checking to ensure we don't get an empty string back trans, costs = \ beam_search.search( input_values={sampling_input: input_}, max_length=3*len(seq), eol_symbol=src_eos_idx, ignore_first_eol=True) # normalize costs according to the sequence lengths if config['normalized_bleu']: lengths = numpy.array([len(s) for s in trans]) costs = costs / lengths best = numpy.argsort(costs)[0] try: total_cost += costs[best] trans_out = trans[best] # convert idx to words trans_out = sutils._idx_to_word(trans_out, trg_ivocab) except ValueError: logger.info( "Can NOT find a translation for line: {}".format(i + 1)) trans_out = '<UNK>' print(trans_out, file=ftrans) if i != 0 and i % 100 == 0: logger.info("Translated {} lines of test set...".format(i)) logger.info("Total cost of the test: {}".format(total_cost)) ftrans.close()
cg = ComputationGraph(cost) monitored = set([cost] + VariableFilter(roles=[roles.COST])(cg.variables)) valid_monitored = monitored if hasattr(model, 'valid_cost'): valid_cost = model.valid_cost(**inputs) valid_cg = ComputationGraph(valid_cost) valid_monitored = set([valid_cost] + VariableFilter( roles=[roles.COST])(valid_cg.variables)) if hasattr(config, 'dropout') and config.dropout < 1.0: cg = apply_dropout(cg, config.dropout_inputs(cg), config.dropout) if hasattr(config, 'noise') and config.noise > 0.0: cg = apply_noise(cg, config.noise_inputs(cg), config.noise) cost = cg.outputs[0] cg = Model(cost) logger.info('# Parameter shapes:') parameters_size = 0 for value in cg.parameters: logger.info(' %20s %s' % (value.get_value().shape, value.name)) parameters_size += reduce(operator.mul, value.get_value().shape, 1) logger.info('Total number of parameters: %d in %d matrices' % (parameters_size, len(cg.parameters))) if hasattr(config, 'step_rule'): step_rule = config.step_rule else: step_rule = AdaDelta() logger.info("Fuel seed: %d" % fuel.config.default_seed)
def train_snli_model(new_training_job, config, save_path, params, fast_start, fuel_server, seed, model='simple'): if config['exclude_top_k'] > config['num_input_words'] and config[ 'num_input_words'] > 0: raise Exception("Some words have neither word nor def embedding") c = config logger = configure_logger(name="snli_baseline_training", log_file=os.path.join(save_path, "log.txt")) if not os.path.exists(save_path): logger.info("Start a new job") os.mkdir(save_path) else: logger.info("Continue an existing job") with open(os.path.join(save_path, "cmd.txt"), "w") as f: f.write(" ".join(sys.argv)) # Make data paths nice for path in [ 'dict_path', 'embedding_def_path', 'embedding_path', 'vocab', 'vocab_def', 'vocab_text' ]: if c.get(path, ''): if not os.path.isabs(c[path]): c[path] = os.path.join(fuel.config.data_path[0], c[path]) main_loop_path = os.path.join(save_path, 'main_loop.tar') main_loop_best_val_path = os.path.join(save_path, 'main_loop_best_val.tar') stream_path = os.path.join(save_path, 'stream.pkl') # Save config to save_path json.dump(config, open(os.path.join(save_path, "config.json"), "w")) if model == 'simple': nli_model, data, used_dict, used_retrieval, _ = _initialize_simple_model_and_data( c) elif model == 'esim': nli_model, data, used_dict, used_retrieval, _ = _initialize_esim_model_and_data( c) else: raise NotImplementedError() # Compute cost s1, s2 = T.lmatrix('sentence1'), T.lmatrix('sentence2') if c['dict_path']: assert os.path.exists(c['dict_path']) s1_def_map, s2_def_map = T.lmatrix('sentence1_def_map'), T.lmatrix( 'sentence2_def_map') def_mask = T.fmatrix("def_mask") defs = T.lmatrix("defs") else: s1_def_map, s2_def_map = None, None def_mask = None defs = None s1_mask, s2_mask = T.fmatrix('sentence1_mask'), T.fmatrix('sentence2_mask') y = T.ivector('label') cg = {} for train_phase in [True, False]: # NOTE: Please don't change outputs of cg if train_phase: with batch_normalization(nli_model): pred = nli_model.apply(s1, s1_mask, s2, s2_mask, def_mask=def_mask, defs=defs, s1_def_map=s1_def_map, s2_def_map=s2_def_map, train_phase=train_phase) else: pred = nli_model.apply(s1, s1_mask, s2, s2_mask, def_mask=def_mask, defs=defs, s1_def_map=s1_def_map, s2_def_map=s2_def_map, train_phase=train_phase) cost = CategoricalCrossEntropy().apply(y.flatten(), pred) error_rate = MisclassificationRate().apply(y.flatten(), pred) cg[train_phase] = ComputationGraph([cost, error_rate]) # Weight decay (TODO: Make it less bug prone) if model == 'simple': weights_to_decay = VariableFilter( bricks=[dense for dense, relu, bn in nli_model._mlp], roles=[WEIGHT])(cg[True].variables) weight_decay = np.float32(c['l2']) * sum( (w**2).sum() for w in weights_to_decay) elif model == 'esim': weight_decay = 0.0 else: raise NotImplementedError() final_cost = cg[True].outputs[0] + weight_decay final_cost.name = 'final_cost' # Add updates for population parameters if c.get("bn", True): pop_updates = get_batch_normalization_updates(cg[True]) extra_updates = [(p, m * 0.1 + p * (1 - 0.1)) for p, m in pop_updates] else: pop_updates = [] extra_updates = [] if params: logger.debug("Load parameters from {}".format(params)) with open(params) as src: loaded_params = load_parameters(src) cg[True].set_parameter_values(loaded_params) for param, m in pop_updates: param.set_value(loaded_params[get_brick( param).get_hierarchical_name(param)]) if os.path.exists(os.path.join(save_path, "main_loop.tar")): logger.warning("Manually loading BN stats :(") with open(os.path.join(save_path, "main_loop.tar")) as src: loaded_params = load_parameters(src) for param, m in pop_updates: param.set_value( loaded_params[get_brick(param).get_hierarchical_name(param)]) if theano.config.compute_test_value != 'off': test_value_data = next( data.get_stream('train', batch_size=4).get_epoch_iterator()) s1.tag.test_value = test_value_data[0] s1_mask.tag.test_value = test_value_data[1] s2.tag.test_value = test_value_data[2] s2_mask.tag.test_value = test_value_data[3] y.tag.test_value = test_value_data[4] # Freeze embeddings if not c['train_emb']: frozen_params = [ p for E in nli_model.get_embeddings_lookups() for p in E.parameters ] train_params = [p for p in cg[True].parameters] assert len(set(frozen_params) & set(train_params)) > 0 else: frozen_params = [] if not c.get('train_def_emb', 1): frozen_params_def = [ p for E in nli_model.get_def_embeddings_lookups() for p in E.parameters ] train_params = [p for p in cg[True].parameters] assert len(set(frozen_params_def) & set(train_params)) > 0 frozen_params += frozen_params_def train_params = [p for p in cg[True].parameters if p not in frozen_params] train_params_keys = [ get_brick(p).get_hierarchical_name(p) for p in train_params ] # Optimizer algorithm = GradientDescent(cost=final_cost, on_unused_sources='ignore', parameters=train_params, step_rule=Adam(learning_rate=c['lr'])) algorithm.add_updates(extra_updates) m = Model(final_cost) parameters = m.get_parameter_dict() # Blocks version mismatch logger.info("Trainable parameters" + "\n" + pprint.pformat([(key, parameters[key].get_value().shape) for key in sorted(train_params_keys)], width=120)) logger.info("# of parameters {}".format( sum([ np.prod(parameters[key].get_value().shape) for key in sorted(train_params_keys) ]))) ### Monitored args ### train_monitored_vars = [final_cost] + cg[True].outputs monitored_vars = cg[False].outputs val_acc = monitored_vars[1] to_monitor_names = [ 'def_unk_ratio', 's1_merged_input_rootmean2', 's1_def_mean_rootmean2', 's1_gate_rootmean2', 's1_compose_gate_rootmean2' ] for k in to_monitor_names: train_v, valid_v = VariableFilter(name=k)( cg[True]), VariableFilter(name=k)(cg[False]) if len(train_v): logger.info("Adding {} tracking".format(k)) train_monitored_vars.append(train_v[0]) monitored_vars.append(valid_v[0]) else: logger.warning("Didnt find {} in cg".format(k)) if c['monitor_parameters']: for name in train_params_keys: param = parameters[name] num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements grad_norm = algorithm.gradients[param].norm(2) / num_elements step_norm = algorithm.steps[param].norm(2) / num_elements stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' train_monitored_vars.append(stats) regular_training_stream = data.get_stream('train', batch_size=c['batch_size'], seed=seed) if fuel_server: # the port will be configured by the StartFuelServer extension training_stream = ServerDataStream( sources=regular_training_stream.sources, hwm=100, produces_examples=regular_training_stream.produces_examples) else: training_stream = regular_training_stream ### Build extensions ### extensions = [ # Load(main_loop_path, load_iteration_state=True, load_log=True) # .set_conditions(before_training=not new_training_job), StartFuelServer(regular_training_stream, stream_path, hwm=100, script_path=os.path.join( os.path.dirname(__file__), "../bin/start_fuel_server.py"), before_training=fuel_server), Timing(every_n_batches=c['mon_freq']), ProgressBar(), RetrievalPrintStats(retrieval=used_retrieval, every_n_batches=c['mon_freq_valid'], before_training=not fast_start), Timestamp(), TrainingDataMonitoring(train_monitored_vars, prefix="train", every_n_batches=c['mon_freq']), ] if c['layout'] == 'snli': validation = DataStreamMonitoring(monitored_vars, data.get_stream('valid', batch_size=14, seed=seed), before_training=not fast_start, on_resumption=True, after_training=True, every_n_batches=c['mon_freq_valid'], prefix='valid') extensions.append(validation) elif c['layout'] == 'mnli': validation = DataStreamMonitoring(monitored_vars, data.get_stream('valid_matched', batch_size=14, seed=seed), every_n_batches=c['mon_freq_valid'], on_resumption=True, after_training=True, prefix='valid_matched') validation_mismatched = DataStreamMonitoring( monitored_vars, data.get_stream('valid_mismatched', batch_size=14, seed=seed), every_n_batches=c['mon_freq_valid'], before_training=not fast_start, on_resumption=True, after_training=True, prefix='valid_mismatched') extensions.extend([validation, validation_mismatched]) else: raise NotImplementedError() # Similarity trackers for embeddings if len(c.get('vocab_def', '')): retrieval_vocab = Vocabulary(c['vocab_def']) else: retrieval_vocab = data.vocab retrieval_all = Retrieval(vocab_text=retrieval_vocab, dictionary=used_dict, max_def_length=c['max_def_length'], exclude_top_k=0, max_def_per_word=c['max_def_per_word']) for name in [ 's1_word_embeddings', 's1_dict_word_embeddings', 's1_translated_word_embeddings' ]: variables = VariableFilter(name=name)(cg[False]) if len(variables): s1_emb = variables[0] logger.info("Adding similarity tracking for " + name) # A bit sloppy about downcast if "dict" in name: embedder = construct_dict_embedder(theano.function( [s1, defs, def_mask, s1_def_map], s1_emb, allow_input_downcast=True), vocab=data.vocab, retrieval=retrieval_all) extensions.append( SimilarityWordEmbeddingEval( embedder=embedder, prefix=name, every_n_batches=c['mon_freq_valid'], before_training=not fast_start)) else: embedder = construct_embedder(theano.function( [s1], s1_emb, allow_input_downcast=True), vocab=data.vocab) extensions.append( SimilarityWordEmbeddingEval( embedder=embedder, prefix=name, every_n_batches=c['mon_freq_valid'], before_training=not fast_start)) track_the_best = TrackTheBest(validation.record_name(val_acc), before_training=not fast_start, every_n_epochs=c['save_freq_epochs'], after_training=not fast_start, every_n_batches=c['mon_freq_valid'], choose_best=min) extensions.append(track_the_best) # Special care for serializing embeddings if len(c.get('embedding_path', '')) or len(c.get('embedding_def_path', '')): extensions.insert( 0, LoadNoUnpickling(main_loop_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job)) extensions.append( Checkpoint(main_loop_path, parameters=train_params + [p for p, m in pop_updates], save_main_loop=False, save_separately=['log', 'iteration_state'], before_training=not fast_start, every_n_epochs=c['save_freq_epochs'], after_training=not fast_start).add_condition( ['after_batch', 'after_epoch'], OnLogRecord(track_the_best.notification_name), (main_loop_best_val_path, ))) else: extensions.insert( 0, Load(main_loop_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job)) extensions.append( Checkpoint(main_loop_path, parameters=cg[True].parameters + [p for p, m in pop_updates], before_training=not fast_start, every_n_epochs=c['save_freq_epochs'], after_training=not fast_start).add_condition( ['after_batch', 'after_epoch'], OnLogRecord(track_the_best.notification_name), (main_loop_best_val_path, ))) extensions.extend([ DumpCSVSummaries(save_path, every_n_batches=c['mon_freq_valid'], after_training=True), DumpTensorflowSummaries(save_path, after_epoch=True, every_n_batches=c['mon_freq_valid'], after_training=True), Printing(every_n_batches=c['mon_freq_valid']), PrintMessage(msg="save_path={}".format(save_path), every_n_batches=c['mon_freq']), FinishAfter(after_n_batches=c['n_batches']).add_condition( ['after_batch'], OnLogStatusExceed('iterations_done', c['n_batches'])) ]) logger.info(extensions) ### Run training ### if "VISDOM_SERVER" in os.environ: print("Running visdom server") ret = subprocess.Popen([ os.path.join(os.path.dirname(__file__), "../visdom_plotter.py"), "--visdom-server={}".format(os.environ['VISDOM_SERVER']), "--folder={}".format(save_path) ]) time.sleep(0.1) if ret.returncode is not None: raise Exception() atexit.register(lambda: os.kill(ret.pid, signal.SIGINT)) model = Model(cost) for p, m in pop_updates: model._parameter_dict[get_brick(p).get_hierarchical_name(p)] = p main_loop = MainLoop(algorithm, training_stream, model=model, extensions=extensions) assert os.path.exists(save_path) main_loop.run()
cost_monitor.name = "cost_monitor" elif conf.task == 'framewise': cost_train = categorical_crossentropy_batch().apply(y_hat_softmax, y, x_m) cost_train.name = 'cost' cost_monitor = cost_train else: raise ValueError, conf.task recognizer.initialize() cg = ComputationGraph([cost_train, y_hat, x_m, y, y_m]) weights = VariableFilter(roles=[WEIGHT])(cg.variables) cg = apply_noise(cg, weights, conf.weight_noise) #************* training algorithm ************* model = Model(cost_train) if conf.step_rule == 'AdaDelta': step_rule = AdaDelta() elif conf.step_rule == 'Momentum': step_rule = Momentum(learning_rate=conf.learning_rate, momentum=conf.momentum) else: raise ('step_rule not known: {}'.format(conf.step_rule)) step_rule = CompositeRule([step_rule, StepClipping(conf.step_clipping)]) algorithm = GradientDescent(cost=cost_train, parameters=cg.parameters, step_rule=step_rule) #***************** main loop **************** train_monitor = TrainingDataMonitoring([cost_monitor], prefix="train")
def main(): x = T.tensor3('features') m = T.matrix('features_mask') y = T.imatrix('targets') embedding_size = 300 glove_version = "glove.6B.300d.txt" #embedding_size = 50 #glove_version = "vectors.6B.50d.txt" o = x.sum(axis=1) + m.mean() * 0 score_layer = Linear( input_dim = 300, output_dim = 1, weights_init = IsotropicGaussian(std=0.02), biases_init = Constant(0.), name="linear2") score_layer.initialize() o = score_layer.apply(o) probs = Sigmoid().apply(o) cost = - (y * T.log(probs) + (1-y) * T.log(1 - probs)).mean() cost.name = 'cost' misclassification = (y * (probs < 0.5) + (1-y) * (probs > 0.5)).mean() misclassification.name = 'misclassification' # ================= cg = ComputationGraph([cost]) params = cg.parameters algorithm = GradientDescent( cost = cg.outputs[0], params=params, step_rule = CompositeRule([ StepClipping(threshold=4), AdaM(), ]) ) # ======== print "setting up data" ports = { 'gpu0_train' : 5557, 'gpu0_test' : 5558, 'gpu1_train' : 5559, 'gpu1_test' : 5560, } #batch_size = 16 batch_size = 16 def start_server(port, which_set): fuel.server.logger.setLevel('WARN') dataset = IMDBText(which_set, sorted=True) n_train = dataset.num_examples #scheme = ShuffledScheme(examples=n_train, batch_size=batch_size) scheme = BatchwiseShuffledScheme(examples=n_train, batch_size=batch_size) stream = DataStream( dataset=dataset, iteration_scheme=scheme) print "loading glove" glove = GloveTransformer(glove_version, data_stream=stream) padded = Padding( data_stream=glove, mask_sources=('features',) ) fuel.server.start_server(padded, port=port, hwm=20) train_port = ports[theano.config.device + '_train'] train_p = Process(target=start_server, args=(train_port, 'train')) train_p.start() test_port = ports[theano.config.device + '_test'] test_p = Process(target=start_server, args=(test_port, 'test')) test_p.start() train_stream = ServerDataStream(('features', 'features_mask', 'targets'), port=train_port) test_stream = ServerDataStream(('features', 'features_mask', 'targets'), port=test_port) print "setting up model" n_examples = 25000 #====== model = Model(cost) extensions = [] extensions.append(EpochProgress(batch_per_epoch=n_examples // batch_size + 1)) extensions.append(TrainingDataMonitoring( [ cost, misclassification, ], prefix='train', after_epoch=True )) #extensions.append(DataStreamMonitoring( #[cost, misclassification], #data_stream=test_stream, #prefix='test', #after_epoch=True #)) extensions.append(Timing()) extensions.append(Printing()) extensions.append(Plot( theano.config.device+"_result", channels=[['train_cost']], after_epoch=True )) main_loop = MainLoop( model=model, data_stream=train_stream, algorithm=algorithm, extensions=extensions) main_loop.run()
def train_rnnrbm(train, rnnrbm, epochs=1000, test=None, bokeh=True, load_path=None): cdk = theano.shared(10) lr = theano.shared(float32(0.004)) cost, v_sample = rnnrbm.cost(examples=x, mask=x_mask, k=cdk) error_rate = MismulitclassificationRate().apply(x, v_sample[-1], x_mask) error_rate.name = "error on note as a whole" mistake_rate = MismulitmistakeRate().apply(x, v_sample[-1], x_mask) mistake_rate.name = "single error within note" cost.name = 'rbm_cost' model = Model(cost) cg = ComputationGraph([cost]) step_rule = CompositeRule([ RemoveNotFinite(), StepClipping(30.0), Adam(learning_rate=lr), StepClipping(6.0), RemoveNotFinite() ]) # Scale(0.01) gradients = dict( equizip(cg.parameters, T.grad(cost, cg.parameters, consider_constant=[v_sample]))) algorithm = GradientDescent(step_rule=step_rule, gradients=gradients, cost=cost, params=cg.parameters) algorithm.add_updates(cg.updates) extensions = [ SharedVariableModifier(parameter=cdk, function=lambda n, v: rnnrbm_cdk[n] if rnnrbm_cdk.get(n) else v), SharedVariableModifier(parameter=lr, function=lambda n, v: float32(0.78 * v) if n % (200 * 5) == 0 else v), FinishAfter(after_n_epochs=epochs), TrainingDataMonitoring( [ cost, error_rate, mistake_rate, ], # hidden_states, debug_val, param_nans, # aggregation.mean(algorithm.total_gradient_norm)], #+ params, prefix="train", after_epoch=False, every_n_batches=40), Timing(), Printing(), ProgressBar() ] if test is not None: extensions.append( DataStreamMonitoring([cost, error_rate, mistake_rate], data_stream=test, updates=cg.updates, prefix="test", after_epoch=False, every_n_batches=40)) if bokeh: extensions.append( Plot( 'Training RNN-RBM', channels=[ [ 'train_error on note as a whole', 'train_single error within note', 'test_error on note as a whole', 'test_single error within note' ], ['train_final_cost'], # ['train_total_gradient_norm'], ])) main_loop = MainLoop(algorithm=algorithm, data_stream=train, model=model, extensions=extensions) return main_loop
# Build datastream dump_path = os.path.join("model_params", model_name+".pkl") valid_path = os.path.join(os.getcwd(), "squad_rare/dev-v1.0_tokenized.json") vocab_path = os.path.join(os.getcwd(), "squad_rare/vocab.txt") # ds, valid_stream = data.setup_squad_datastream(valid_path, vocab_path, config) ds, valid_stream = data.setup_squad_ranker_datastream(os.path.join(os.getcwd(),'squad_short/squadnewdev.txt'),os.path.join(os.getcwd(), 'squad/vocab.txt'),config, 221697) snapshot_path = os.path.join("model_params", model_name+".pkl") # Build model m = config.Model(config, ds.vocab_size) # Build the Blocks stuff for training # test_model = Model(m.generations) test_model = Model(m.predictions) model = Model(m.sgd_cost) algorithm = None extensions = [RankerEvaluator(path=snapshot_path, model=test_model, data_stream=valid_stream, vocab_size = ds.vocab_size, vocab = ds.vocab, eval_mode=eval_mode, before_training=True)] main_loop = MainLoop( model=model, data_stream=valid_stream, algorithm=algorithm, extensions=extensions ) for extension in main_loop.extensions: extension.main_loop = main_loop
def main(name, epochs, batch_size, learning_rate, attention, n_iter, enc_dim, dec_dim, z_dim): x_dim = 28 * 28 img_height, img_width = (28, 28) rnninits = { #'weights_init': Orthogonal(), 'weights_init': IsotropicGaussian(0.01), 'biases_init': Constant(0.), } inits = { #'weights_init': Orthogonal(), 'weights_init': IsotropicGaussian(0.01), 'biases_init': Constant(0.), } if attention != "": read_N, write_N = attention.split(',') read_N = int(read_N) write_N = int(write_N) read_dim = 2 * read_N**2 reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim, width=img_width, height=img_height, N=read_N, **inits) writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim, width=img_width, height=img_height, N=write_N, **inits) attention_tag = "r%d-w%d" % (read_N, write_N) else: read_dim = 2 * x_dim reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits) writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits) attention_tag = "full" #---------------------------------------------------------------------- # Learning rate def lr_tag(value): """ Convert a float into a short tag-usable string representation. E.g.: 0.1 -> 11 0.01 -> 12 0.001 -> 13 0.005 -> 53 """ exp = np.floor(np.log10(value)) leading = ("%e" % value)[0] return "%s%d" % (leading, -exp) lr_str = lr_tag(learning_rate) name = "%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % (name, attention_tag, n_iter, enc_dim, dec_dim, z_dim, lr_str) print("\nRunning experiment %s" % name) print(" learning rate: %5.3f" % learning_rate) print(" attention: %s" % attention) print(" n_iterations: %d" % n_iter) print(" encoder dimension: %d" % enc_dim) print(" z dimension: %d" % z_dim) print(" decoder dimension: %d" % dec_dim) print() #---------------------------------------------------------------------- encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits) decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits) encoder_mlp = MLP([Identity()], [(read_dim + dec_dim), 4 * enc_dim], name="MLP_enc", **inits) decoder_mlp = MLP([Identity()], [z_dim, 4 * dec_dim], name="MLP_dec", **inits) q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits) draw = DrawModel(n_iter, reader=reader, encoder_mlp=encoder_mlp, encoder_rnn=encoder_rnn, sampler=q_sampler, decoder_mlp=decoder_mlp, decoder_rnn=decoder_rnn, writer=writer) draw.initialize() #------------------------------------------------------------------------ x = tensor.matrix('features') #x_recons = 1. + x x_recons, kl_terms = draw.reconstruct(x) #x_recons, _, _, _, _ = draw.silly(x, n_steps=10, batch_size=100) #x_recons = x_recons[-1,:,:] #samples = draw.sample(100) #x_recons = samples[-1, :, :] #x_recons = samples[-1, :, :] recons_term = BinaryCrossEntropy().apply(x, x_recons) recons_term.name = "recons_term" cost = recons_term + kl_terms.sum(axis=0).mean() cost.name = "nll_bound" #------------------------------------------------------------ cg = ComputationGraph([cost]) params = VariableFilter(roles=[PARAMETER])(cg.variables) algorithm = GradientDescent( cost=cost, params=params, step_rule=CompositeRule([ StepClipping(10.), Adam(learning_rate), ]) #step_rule=RMSProp(learning_rate), #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95) ) #algorithm.add_updates(scan_updates) #------------------------------------------------------------------------ # Setup monitors monitors = [cost] for t in range(n_iter): kl_term_t = kl_terms[t, :].mean() kl_term_t.name = "kl_term_%d" % t #x_recons_t = T.nnet.sigmoid(c[t,:,:]) #recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t) #recons_term_t = recons_term_t.mean() #recons_term_t.name = "recons_term_%d" % t monitors += [kl_term_t] train_monitors = monitors[:] train_monitors += [aggregation.mean(algorithm.total_gradient_norm)] train_monitors += [aggregation.mean(algorithm.total_step_norm)] # Live plotting... plot_channels = [ ["train_nll_bound", "test_nll_bound"], ["train_kl_term_%d" % t for t in range(n_iter)], #["train_recons_term_%d" % t for t in range(n_iter)], ["train_total_gradient_norm", "train_total_step_norm"] ] #------------------------------------------------------------ mnist_train = BinarizedMNIST("train", sources=['features']) mnist_valid = BinarizedMNIST("valid", sources=['features']) mnist_test = BinarizedMNIST("test", sources=['features']) train_stream = DataStream(mnist_train, iteration_scheme=SequentialScheme( mnist_train.num_examples, batch_size)) valid_stream = DataStream(mnist_valid, iteration_scheme=SequentialScheme( mnist_valid.num_examples, batch_size)) test_stream = DataStream(mnist_test, iteration_scheme=SequentialScheme( mnist_test.num_examples, batch_size)) main_loop = MainLoop( model=Model(cost), data_stream=train_stream, algorithm=algorithm, extensions=[ Timing(), FinishAfter(after_n_epochs=epochs), TrainingDataMonitoring(train_monitors, prefix="train", after_every_epoch=True), # DataStreamMonitoring( # monitors, # valid_stream, ## updates=scan_updates, # prefix="valid"), DataStreamMonitoring( monitors, test_stream, # updates=scan_updates, prefix="test"), Checkpoint(name + ".pkl", save_separately=['log', 'model']), #Dump(name), Plot(name, channels=plot_channels), ProgressBar(), Printing() ]) main_loop.run()
def main(dataset_name='sklearn', num_epochs=100): x = tensor.matrix('features') y = tensor.lmatrix('targets') logistic_regressor = LogisticRegressor(input_dim=2) probs = logistic_regressor.get_probs(features=x) params = logistic_regressor.get_params() cost = logistic_regressor.get_cost(probs=probs, targets=y).mean() cost.name = 'cost' misclassification = logistic_regressor.get_misclassification( probs=probs, targets=y).mean() misclassification.name = 'misclassification' train_dataset, test_dataset = build_2d_datasets(dataset_name=dataset_name) algorithm = GradientDescent(cost=cost, params=params, step_rule=Scale(learning_rate=0.1)) train_data_stream = ForceFloatX( data_stream=DataStream(dataset=train_dataset, iteration_scheme=SequentialScheme( examples=train_dataset.num_examples, batch_size=40, ))) test_data_stream = ForceFloatX( data_stream=DataStream(dataset=test_dataset, iteration_scheme=SequentialScheme( examples=test_dataset.num_examples, batch_size=400, ))) model = Model(cost) extensions = [] extensions.append(Timing()) extensions.append(FinishAfter(after_n_epochs=num_epochs)) extensions.append( DataStreamMonitoring([cost, misclassification], test_data_stream, prefix='test')) extensions.append( TrainingDataMonitoring([cost, misclassification], prefix='train', after_epoch=True)) scoring_function = theano.function([x], probs) plotters = [] plotters.append( Plotter(channels=[[ 'test_cost', 'test_misclassification', 'train_cost', 'train_misclassification' ]], titles=['Costs'])) score_train_stream = Mapping(data_stream=copy.deepcopy(train_data_stream), mapping=TupleMapping(scoring_function), add_sources=('scores', )) score_test_stream = Mapping(data_stream=copy.deepcopy(test_data_stream), mapping=TupleMapping(scoring_function), add_sources=('scores', )) plotters.append( Display2DData(data_streams=[ score_train_stream, copy.deepcopy(train_data_stream), score_test_stream, copy.deepcopy(test_data_stream) ], radius=0.01)) extensions.append( PlotManager('2D examples', plotters=plotters, after_epoch=False, every_n_epochs=50, after_training=True)) extensions.append(Printing()) main_loop = MainLoop(model=model, data_stream=train_data_stream, algorithm=algorithm, extensions=extensions) main_loop.run()
def main(config, tr_stream, dev_stream, use_bokeh=False): # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') initial_context = tensor.matrix('initial_context') # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder( config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) decoder = InitialContextDecoder( config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2, config['context_dim']) cost = decoder.cost( encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask, initial_context) cost.name = 'decoder_cost' logger.info('Creating computational graph') cg = ComputationGraph(cost) # Initialize model logger.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog # this is the probability of dropping out, so you probably want to make it <=0.5 logger.info('Applying dropout') dropout_inputs = [x for x in cg.intermediary_variables if x.name == 'maxout_apply_output'] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # Apply weight noise for regularization if config['weight_noise_ff'] > 0.0: logger.info('Applying weight noise to ff layers') enc_params = Selector(encoder.lookup).get_parameters().values() enc_params += Selector(encoder.fwd_fork).get_parameters().values() enc_params += Selector(encoder.back_fork).get_parameters().values() dec_params = Selector( decoder.sequence_generator.readout).get_parameters().values() dec_params += Selector( decoder.sequence_generator.fork).get_parameters().values() dec_params += Selector(decoder.transition.initial_transformer).get_parameters().values() cg = apply_noise(cg, enc_params+dec_params, config['weight_noise_ff']) # TODO: weight noise for recurrent params isn't currently implemented -- see config['weight_noise_rec'] # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge(Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.items(): logger.info(' {:15}: {}'.format(value.get_value().shape, name)) logger.info("Total number of parameters: {}" .format(len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # create the training directory, and copy this config there if directory doesn't exist if not os.path.isdir(config['saveto']): os.makedirs(config['saveto']) shutil.copy(config['config_file'], config['saveto']) # Set extensions # TODO: add checking for existing model and loading logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] # Create the theano variables that we need for the sampling graph sampling_input = tensor.lmatrix('input') sampling_context = tensor.matrix('context_input') # WORKING: change this part to account for the new initial context for decoder # Set up beam search and sampling computation graphs if necessary if config['hook_samples'] >= 1 or config['bleu_script'] is not None: logger.info("Building sampling model") sampling_representation = encoder.apply( sampling_input, tensor.ones(sampling_input.shape)) generated = decoder.generate(sampling_input, sampling_representation, sampling_context) search_model = Model(generated) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs # Add sampling # TODO: currently commented because we need to modify the sampler to use the contexts if config['hook_samples'] >= 1: logger.info("Building sampler") extensions.append( Sampler(model=search_model, data_stream=tr_stream, hook_samples=config['hook_samples'], every_n_batches=config['sampling_freq'], src_vocab=source_vocab, trg_vocab=target_vocab, src_vocab_size=config['src_vocab_size'], )) # TODO: add sampling_context to BleuValidator and Sampler # Add early stopping based on bleu if config['bleu_script'] is not None: logger.info("Building bleu validator") extensions.append( BleuValidator(sampling_input, sampling_context, samples=samples, config=config, model=search_model, data_stream=dev_stream, src_vocab=source_vocab, trg_vocab=target_vocab, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( Plot(config['model_save_directory'], channels=[['decoder_cost', 'validation_set_bleu_score']], every_n_batches=10)) # Set up training algorithm logger.info("Initializing training algorithm") # if there is dropout or random noise, we need to use the output of the modified graph if config['dropout'] < 1.0 or config['weight_noise_ff'] > 0.0: algorithm = GradientDescent( cost=cg.outputs[0], parameters=cg.parameters, step_rule=CompositeRule([StepClipping(config['step_clipping']), eval(config['step_rule'])()]) ) else: algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=CompositeRule([StepClipping(config['step_clipping']), eval(config['step_rule'])()]) ) # enrich the logged information extensions.append( Timing(every_n_batches=100) ) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop( model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions ) # Train! main_loop.run()
def main(mode, save_path, steps, num_batches): num_states = MarkovChainDataset.num_states if mode == "train": # Experiment configuration rng = numpy.random.RandomState(1) batch_size = 50 seq_len = 100 dim = 10 feedback_dim = 8 # Build the bricks and initialize them transition = GatedRecurrent(name="transition", dim=dim, activation=Tanh()) generator = SequenceGenerator(Readout( readout_dim=num_states, source_names=["states"], emitter=SoftmaxEmitter(name="emitter"), feedback_brick=LookupFeedback(num_states, feedback_dim, name='feedback'), name="readout"), transition, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), name="generator") generator.push_initialization_config() transition.weights_init = Orthogonal() generator.initialize() # Give an idea of what's going on. logger.info("Parameters:\n" + pprint.pformat( [(key, value.get_value().shape) for key, value in Selector(generator).get_params().items()], width=120)) logger.info("Markov chain entropy: {}".format( MarkovChainDataset.entropy)) logger.info("Expected min error: {}".format( -MarkovChainDataset.entropy * seq_len)) # Build the cost computation graph. x = tensor.lmatrix('data') cost = aggregation.mean( generator.cost_matrix(x[:, :]).sum(), x.shape[1]) cost.name = "sequence_log_likelihood" algorithm = GradientDescent( cost=cost, params=list(Selector(generator).get_params().values()), step_rule=Scale(0.001)) main_loop = MainLoop(algorithm=algorithm, data_stream=DataStream( MarkovChainDataset(rng, seq_len), iteration_scheme=ConstantScheme(batch_size)), model=Model(cost), extensions=[ FinishAfter(after_n_batches=num_batches), TrainingDataMonitoring([cost], prefix="this_step", after_batch=True), TrainingDataMonitoring([cost], prefix="average", every_n_batches=100), Checkpoint(save_path, every_n_batches=500), Printing(every_n_batches=100) ]) main_loop.run() elif mode == "sample": main_loop = cPickle.load(open(save_path, "rb")) generator = main_loop.model sample = ComputationGraph( generator.generate(n_steps=steps, batch_size=1, iterate=True)).get_theano_function() states, outputs, costs = [data[:, 0] for data in sample()] numpy.set_printoptions(precision=3, suppress=True) print("Generation cost:\n{}".format(costs.sum())) freqs = numpy.bincount(outputs).astype(floatX) freqs /= freqs.sum() print("Frequencies:\n {} vs {}".format(freqs, MarkovChainDataset.equilibrium)) trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX) for a, b in zip(outputs, outputs[1:]): trans_freqs[a, b] += 1 trans_freqs /= trans_freqs.sum(axis=1)[:, None] print("Transition frequencies:\n{}\nvs\n{}".format( trans_freqs, MarkovChainDataset.trans_prob)) else: assert False
def main(config, tr_stream, dev_stream): # Create Theano variables logger.info('Creating theano variables') source_char_seq = tensor.lmatrix('source_char_seq') source_sample_matrix = tensor.btensor3('source_sample_matrix') source_char_aux = tensor.bmatrix('source_char_aux') source_word_mask = tensor.bmatrix('source_word_mask') target_char_seq = tensor.lmatrix('target_char_seq') target_char_aux = tensor.bmatrix('target_char_aux') target_char_mask = tensor.bmatrix('target_char_mask') target_sample_matrix = tensor.btensor3('target_sample_matrix') target_word_mask = tensor.bmatrix('target_word_mask') target_resample_matrix = tensor.btensor3('target_resample_matrix') target_prev_char_seq = tensor.lmatrix('target_prev_char_seq') target_prev_char_aux = tensor.bmatrix('target_prev_char_aux') target_bos_idx = tr_stream.trg_bos target_space_idx = tr_stream.space_idx['target'] # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['src_dgru_nhids'], config['enc_nhids'], config['src_dgru_depth'], config['bidir_encoder_depth']) decoder = Decoder(config['trg_vocab_size'], config['dec_embed'], config['trg_dgru_nhids'], config['trg_igru_nhids'], config['dec_nhids'], config['enc_nhids'] * 2, config['transition_depth'], config['trg_igru_depth'], config['trg_dgru_depth'], target_space_idx, target_bos_idx) representation = encoder.apply(source_char_seq, source_sample_matrix, source_char_aux, source_word_mask) cost = decoder.cost(representation, source_word_mask, target_char_seq, target_sample_matrix, target_resample_matrix, target_char_aux, target_char_mask, target_word_mask, target_prev_char_seq, target_prev_char_aux) logger.info('Creating computational graph') cg = ComputationGraph(cost) # Initialize model logger.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.decimator.bidir_w.prototype.recurrent.weights_init = Orthogonal() for layer_n in range(config['src_dgru_depth']): encoder.decimator.dgru.transitions[layer_n].weights_init = Orthogonal() for layer_n in range(config['bidir_encoder_depth']): encoder.children[1 + layer_n].prototype.recurrent.weights_init = Orthogonal() if config['trg_igru_depth'] == 1: decoder.interpolator.igru.weights_init = Orthogonal() else: for layer_n in range(config['trg_igru_depth']): decoder.interpolator.igru.transitions[layer_n].weights_init = Orthogonal() for layer_n in range(config['trg_dgru_depth']): decoder.interpolator.feedback_brick.dgru.transitions[layer_n].weights_init = Orthogonal() for layer_n in range(config['transition_depth']): decoder.transition.transitions[layer_n].weights_init = Orthogonal() encoder.initialize() decoder.initialize() # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(str(shape), count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge(Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.items(): logger.info(' {:15}: {}'.format(str(value.get_value().shape), name)) logger.info("Total number of parameters: {}" .format(len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # Set up training algorithm logger.info("Initializing training algorithm") # You could use 1e-4 in Adam, however manually decay will be faster. # We decay it to 5e-4 when trained for about 30K # then decay it to 2e-4 when trained for about 90K # finally set it to 1e-4 when trained for about 180K algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=CompositeRule([StepClipping(config['step_clipping']), Adam(learning_rate=1e-3)])) # Set extensions logger.info("Initializing extensions") # Extensions gradient_norm = aggregation.mean(algorithm.total_gradient_norm) step_norm = aggregation.mean(algorithm.total_step_norm) train_monitor = CostCurve([cost, gradient_norm, step_norm], config=config, after_batch=True, before_first_epoch=True, prefix='tra') extensions = [ train_monitor, Timing(), Printing(every_n_batches=config['print_freq']), FinishAfter(after_n_batches=config['finish_after']), CheckpointNMT(saveto=config['saveto'], dump_freq=config['dump_freq'], every_n_batches=config['save_freq'], )] # Set up beam search and sampling computation graphs if necessary if config['hook_samples'] >= 1: logger.info("Building sampling model") generated = decoder.generate(representation, source_word_mask) search_model = Model(generated) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[config['transition_depth']])) # generated[transition_depth] is next_outputs # Add sampling if config['hook_samples'] >= 1: logger.info("Building sampler") extensions.append( Sampler(model=search_model, data_stream=tr_stream, hook_samples=config['hook_samples'], transition_depth=config['transition_depth'], every_n_batches=config['sampling_freq'], src_vocab_size=config['src_vocab_size'])) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop( model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions ) # Train! main_loop.run()
def pretrain_rnn(train, rnnrbm, test=None, epochs=1000, bokeh=True): lr = theano.shared(float32(0.1)) probs, _, _, _ = rnnrbm.rnn_pretrain_pred(x, x_mask) cost = NegativeLogLikelihood().apply(y, probs, y_mask) error_rate = MismulitclassificationRate().apply(y, probs, y_mask) error_rate.name = "error on note as a whole" mistake_rate = MismulitmistakeRate().apply(y, probs, y_mask) mistake_rate.name = "single error within note" cost.name = 'final_cost' model = Model(cost) cg = ComputationGraph([cost]) step_rule = CompositeRule([ RemoveNotFinite(), StepClipping(30.0), Adam(learning_rate=lr), StepClipping(6.0), RemoveNotFinite() ]) algorithm = GradientDescent(step_rule=step_rule, cost=cost, params=cg.parameters) extensions = [ SharedVariableModifier(parameter=lr, function=lambda n, v: float32(0.7 * v) if n % 700 == 0 else v), FinishAfter(after_n_epochs=epochs), TrainingDataMonitoring( [ cost, error_rate, mistake_rate, ], # hidden_states, debug_val, param_nans, # aggregation.mean(algorithm.total_gradient_norm)], #+ params, prefix="train", after_epoch=False, every_n_batches=40), Timing(), Printing(), ProgressBar() ] if test is not None: extensions.append( DataStreamMonitoring([cost, error_rate, mistake_rate], data_stream=test, updates=cg.updates, prefix="test", after_epoch=False, every_n_batches=40)) if bokeh: extensions.append( Plot( 'Pretrain RNN', channels=[ [ 'train_error on note as a whole', 'train_single error within note', 'test_error on note as a whole', 'test_single error within note' ], ['train_rbm_cost'], # ['train_total_gradient_norm'], ])) main_loop = MainLoop(algorithm=algorithm, data_stream=train, model=model, extensions=extensions) return main_loop
def main(save_to, cost_name, learning_rate, momentum, num_epochs): mlp = MLP([None], [784, 10], weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) mlp.initialize() x = tensor.matrix('features') y = tensor.lmatrix('targets') scores = mlp.apply(x) batch_size = y.shape[0] indices = tensor.arange(y.shape[0]) target_scores = tensor.set_subtensor( tensor.zeros((batch_size, 10))[indices, y.flatten()], 1) score_diff = scores - target_scores # Logistic Regression if cost_name == 'lr': cost = Softmax().categorical_cross_entropy(y.flatten(), scores).mean() # MSE elif cost_name == 'mse': cost = (score_diff**2).mean() # Perceptron elif cost_name == 'perceptron': cost = (scores.max(axis=1) - scores[indices, y.flatten()]).mean() # TLE elif cost_name == 'minmin': cost = abs(score_diff[indices, y.flatten()]).mean() cost += abs(score_diff[indices, scores.argmax(axis=1)]).mean() # TLEcut elif cost_name == 'minmin_cut': # Score of the groundtruth should be greater or equal than its target score cost = tensor.maximum(0, -score_diff[indices, y.flatten()]).mean() # Score of the prediction should be less or equal than its actual score cost += tensor.maximum(0, score_diff[indices, scores.argmax(axis=1)]).mean() # TLE2 elif cost_name == 'minmin2': cost = ((score_diff[tensor.arange(y.shape[0]), y.flatten()])**2).mean() cost += ((score_diff[tensor.arange(y.shape[0]), scores.argmax(axis=1)])**2).mean() # Direct loss minimization elif cost_name == 'direct': epsilon = 0.1 cost = (-scores[indices, (scores + epsilon * target_scores).argmax(axis=1)] + scores[indices, scores.argmax(axis=1)]).mean() cost /= epsilon elif cost_name == 'svm': cost = (scores[indices, (scores - 1 * target_scores).argmax(axis=1)] - scores[indices, y.flatten()]).mean() else: raise ValueError("Unknown cost " + cost) error_rate = MisclassificationRate().apply(y.flatten(), scores) error_rate.name = 'error_rate' cg = ComputationGraph([cost]) cost.name = 'cost' mnist_train = MNIST(("train", )) mnist_test = MNIST(("test", )) if learning_rate == None: learning_rate = 0.0001 if momentum == None: momentum = 0.0 rule = Momentum(learning_rate=learning_rate, momentum=momentum) algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=rule) extensions = [ Timing(), FinishAfter(after_n_epochs=num_epochs), DataStreamMonitoring([cost, error_rate], Flatten(DataStream.default_stream( mnist_test, iteration_scheme=SequentialScheme( mnist_test.num_examples, 500)), which_sources=('features', )), prefix="test"), # CallbackExtension( # lambda: rule.learning_rate.set_value(rule.learning_rate.get_value() * 0.9), # after_epoch=True), TrainingDataMonitoring([ cost, error_rate, aggregation.mean(algorithm.total_gradient_norm), rule.learning_rate ], prefix="train", after_epoch=True), Checkpoint(save_to), Printing() ] if BLOCKS_EXTRAS_AVAILABLE: extensions.append( Plot('MNIST example', channels=[['test_cost', 'test_error_rate'], ['train_total_gradient_norm']])) main_loop = MainLoop(algorithm, Flatten(DataStream.default_stream( mnist_train, iteration_scheme=SequentialScheme( mnist_train.num_examples, 50)), which_sources=('features', )), model=Model(cost), extensions=extensions) main_loop.run() df = pandas.DataFrame.from_dict(main_loop.log, orient='index') res = { 'cost': cost_name, 'learning_rate': learning_rate, 'momentum': momentum, 'train_cost': df.train_cost.iloc[-1], 'test_cost': df.test_cost.iloc[-1], 'best_test_cost': df.test_cost.min(), 'train_error': df.train_error_rate.iloc[-1], 'test_error': df.test_error_rate.iloc[-1], 'best_test_error': df.test_error_rate.min() } res = { k: float(v) if isinstance(v, numpy.ndarray) else v for k, v in res.items() } json.dump(res, sys.stdout) sys.stdout.flush()
def generate_embeddings(config, tar_path, part, dest_path, format_, average=False, encoder_embeddings=None, **kwargs): """ generate embeddings for all the defintions, average them and serialize OR if encoder_embeddings, serialize the models' encoder embeddings config: name of the config of the model tar_path: tar path of the model parameters part: part of the dataset (should be either 'train', 'valid', 'test' or 'all') dest_path: directory where the serialized embeddings will be written format: either 'dict' or 'glove' encoder_embeddings: None, 'only', 'mixed', 'if_missing' - None: don't include encoder embeddings - 'only': don't read any data, just serialize the encoder embeddings - 'mixed': add the encoder embeddings to the list of definition embeddings - 'if_missing': add the encoder embeddings when there is no corresponding def average: if true, multi-prototype embeddings will be averaged """ if not os.path.exists(dest_path): os.makedirs(dest_path) c = config data, model = initialize_data_and_model(c, train_phase=False) words = T.ltensor3('words') words_mask = T.matrix('words_mask') keys = T.lmatrix('keys') n_identical_keys = T.lvector('n_identical_keys') sym_args = [words, words_mask] if format_ not in ['dict', 'glove']: raise ValueError("format should be either: dict, glove") if not c['encoder'] and encoder_embeddings != 'only': raise ValueError('Error: this model does not have an encoder.') if use_keys(c): sym_args.append(keys) if use_n_identical_keys(c): sym_args.append(n_identical_keys) costs = model.apply(*sym_args, train_phase=False) cg = Model(costs) with open(tar_path) as src: cg.set_parameter_values(load_parameters(src)) if encoder_embeddings: if encoder_embeddings == 'only' and not c['encoder']: embeddings_array = model.get_def_embeddings_params('key').eval() else: embeddings_array = model.get_def_embeddings_params('main').eval() entries = model.get_embeddings_entries() enc_embeddings = { e: np.asarray(a) for e, a in zip(entries, embeddings_array) } if encoder_embeddings == 'only': serialize_embeddings(enc_embeddings, format_, dest_path, "encoder_embeddings") return 0 embeddings_var, = VariableFilter(name='embeddings')(cg) compute = dict({"embeddings": embeddings_var}) if c['proximity_coef'] != 0: prox_var, = VariableFilter(name='proximity_term')(cg) compute["proximity_term"] = prox_var print "sym args", sym_args predict_f = theano.function(sym_args, compute) batch_size = 256 # size of test_unseen stream = data.get_stream(part, batch_size=batch_size, max_length=c['max_length'], remove_keys=False, remove_n_identical_keys=False) raw_data = [] # list of dicts containing the inputs and computed outputs i = 0 vocab = model._vocab print "start computing" embeddings = defaultdict(list) for input_data in stream.get_epoch_iterator(as_dict=True): if i % 10 == 0: print "iteration:", i words = input_data['words'] words_mask = input_data['words_mask'] keys = input_data['keys'] n_identical_keys = input_data['n_identical_keys'] args = [words, words_mask] if use_keys(c): args.append(keys) if use_n_identical_keys(c): args.append(n_identical_keys) to_save = predict_f(*args) for k, h in zip(keys, to_save['embeddings']): key = vec2str(k) if encoder_embeddings == 'if_missing': try: del enc_embeddings[key] except KeyError: pass embeddings[key].append(h) i += 1 if encoder_embeddings in ['mixed', 'if_missing']: for k, e in enc_embeddings.iteritems(): embeddings[k].append(e) if encoder_embeddings == 'mixed': prefix_fname = 'mix_e_' elif encoder_embeddings == 'if_missing': prefix_fname = 'if_mis_e_' else: prefix_fname = '' # combine: if average: mean_embeddings = {} for k in embeddings.keys(): mean_embeddings[k] = np.mean(np.asarray(embeddings[k]), axis=0) serialize_embeddings(mean_embeddings, format_, dest_path, prefix_fname + "mean_embeddings") else: serialize_embeddings(embeddings, format_, dest_path, prefix_fname + "embeddings")
def evaluate(c, tar_path, *args, **kwargs): """ Performs rudimentary evaluation of SNLI/MNLI run * Runs on valid and test given network * Saves all predictions * Saves embedding matrix * Saves results.json and predictions.csv """ # Load and configure model = kwargs['model'] assert c.endswith("json") c = json.load(open(c)) # Very ugly absolute path fix ABS_PATHS = [ "data/", "/mnt/users/jastrzebski/local/dict_based_learning/data/", "/data/cf9ffb48-61bd-40dc-a011-b2e7e5acfd72/" ] from six import string_types for abs_path in ABS_PATHS: for k in c: if isinstance(c[k], string_types): if c[k].startswith(abs_path): c[k] = c[k][len(abs_path):] # Make data paths nice for path in [ 'dict_path', 'embedding_def_path', 'embedding_path', 'vocab', 'vocab_def', 'vocab_text' ]: if c.get(path, ''): if not os.path.isabs(c[path]): c[path] = os.path.join(fuel.config.data_path[0], c[path]) logging.info("Updating config with " + str(kwargs)) c.update(**kwargs) # NOTE: This assures we don't miss crucial definition for some def heavy words # usually it is a good idea c['max_def_per_word'] = c['max_def_per_word'] * 2 assert tar_path.endswith("tar") dest_path = os.path.dirname(tar_path) prefix = os.path.splitext(os.path.basename(tar_path))[0] s1_decoded, s2_decoded = T.lmatrix('sentence1'), T.lmatrix('sentence2') if c['dict_path']: s1_def_map, s2_def_map = T.lmatrix('sentence1_def_map'), T.lmatrix( 'sentence2_def_map') def_mask = T.fmatrix("def_mask") defs = T.lmatrix("defs") else: s1_def_map, s2_def_map = None, None def_mask = None defs = None s1_mask, s2_mask = T.fmatrix('sentence1_mask'), T.fmatrix('sentence2_mask') if model == 'simple': model, data, used_dict, used_retrieval, used_vocab = _initialize_simple_model_and_data( c) elif model == 'esim': model, data, used_dict, used_retrieval, used_vocab = _initialize_esim_model_and_data( c) else: raise NotImplementedError() pred = model.apply(s1_decoded, s1_mask, s2_decoded, s2_mask, def_mask=def_mask, defs=defs, s1_def_map=s1_def_map, s2_def_map=s2_def_map, train_phase=False) cg = ComputationGraph([pred]) if c.get("bn", True): bn_params = [ p for p in VariableFilter(bricks=[BatchNormalization])(cg) if hasattr(p, "set_value") ] else: bn_params = [] # Load model model = Model(cg.outputs) parameters = model.get_parameter_dict() # Blocks version mismatch logging.info( "Trainable parameters" + "\n" + pprint.pformat([(key, parameters[key].get_value().shape) for key in sorted([ get_brick(param).get_hierarchical_name(param) for param in cg.parameters ])], width=120)) logging.info("# of parameters {}".format( sum([ np.prod(parameters[key].get_value().shape) for key in sorted([ get_brick(param).get_hierarchical_name(param) for param in cg.parameters ]) ]))) with open(tar_path) as src: params = load_parameters(src) loaded_params_set = set(params.keys()) model_params_set = set([ get_brick(param).get_hierarchical_name(param) for param in cg.parameters ]) logging.info("Loaded extra parameters") logging.info(loaded_params_set - model_params_set) logging.info("Missing parameters") logging.info(model_params_set - loaded_params_set) model.set_parameter_values(params) if c.get("bn", True): logging.info("Loading " + str([ get_brick(param).get_hierarchical_name(param) for param in bn_params ])) for param in bn_params: param.set_value( params[get_brick(param).get_hierarchical_name(param)]) for p in bn_params: model._parameter_dict[get_brick(p).get_hierarchical_name(p)] = p # Read logs logs = pd.read_csv(os.path.join(dest_path, "logs.csv")) best_val_acc = logs['valid_misclassificationrate_apply_error_rate'].min() logging.info("Best measured valid acc: " + str(best_val_acc)) # NOTE(kudkudak): We need this to have comparable mean rank and embedding scores reference_vocab = Vocabulary( os.path.join(fuel.config.data_path[0], c['data_path'], 'vocab.txt')) vocab_all = Vocabulary( os.path.join( fuel.config.data_path[0], c['data_path'], 'vocab_all.txt')) # Can include OOV words, which is interesting retrieval_all = Retrieval(vocab_text=used_vocab, dictionary=used_dict, max_def_length=c['max_def_length'], exclude_top_k=0, max_def_per_word=c['max_def_per_word']) # logging.info("Calculating dict and word embeddings for vocab.txt and vocab_all.txt") # for name in ['s1_word_embeddings', 's1_dict_word_embeddings']: # variables = VariableFilter(name=name)(cg) # if len(variables): # s1_emb = variables[0] # # A bit sloppy about downcast # # if "dict" in name: # embedder = construct_dict_embedder( # theano.function([s1_decoded, defs, def_mask, s1_def_map], s1_emb, allow_input_downcast=True), # vocab=data.vocab, retrieval=retrieval_all) # else: # embedder = construct_embedder(theano.function([s1_decoded], s1_emb, allow_input_downcast=True), # vocab=data.vocab) # # for v_name, v in [("vocab_all", vocab_all), ("vocab", reference_vocab)]: # logging.info("Calculating {} embeddings for {}".format(name, v_name)) # Predict predict_fnc = theano.function(cg.inputs, pred) results = {} batch_size = 14 for subset in ['valid', 'test']: logging.info("Predicting on " + subset) stream = data.get_stream(subset, batch_size=batch_size, seed=778) it = stream.get_epoch_iterator() rows = [] for ex in tqdm.tqdm(it, total=10000 / batch_size): ex = dict(zip(stream.sources, ex)) inp = [ex[v.name] for v in cg.inputs] prob = predict_fnc(*inp) label_pred = np.argmax(prob, axis=1) for id in range(len(prob)): s1_decoded = used_vocab.decode(ex['sentence1'][id]).split() s2_decoded = used_vocab.decode(ex['sentence2'][id]).split() assert used_vocab == data.vocab s1_decoded = [ '*' + w + '*' if used_vocab.word_to_id(w) > c['num_input_words'] else w for w in s1_decoded ] s2_decoded = [ '*' + w + '*' if used_vocab.word_to_id(w) > c['num_input_words'] else w for w in s2_decoded ] # Different difficulty metrics # text_unk_percentage s1_no_pad = [w for w in ex['sentence1'][id] if w != 0] s2_no_pad = [w for w in ex['sentence2'][id] if w != 0] s1_unk_percentage = sum([ 1. for w in s1_no_pad if w == used_vocab.unk ]) / len(s1_no_pad) s2_unk_percentage = sum([ 1. for w in s1_no_pad if w == used_vocab.unk ]) / len(s2_no_pad) # mean freq word s1_mean_freq = np.mean([ 0 if w == data.vocab.unk else used_vocab._id_to_freq[w] for w in s1_no_pad ]) s2_mean_freq = np.mean([ 0 if w == data.vocab.unk else used_vocab._id_to_freq[w] for w in s2_no_pad ]) # mean rank word (UNK is max rank) # NOTE(kudkudak): Will break if we reindex unk between vocabs :P s1_mean_rank = np.mean([ reference_vocab.size() if reference_vocab.word_to_id( used_vocab.id_to_word(w)) == reference_vocab.unk else reference_vocab.word_to_id(used_vocab.id_to_word(w)) for w in s1_no_pad ]) s2_mean_rank = np.mean([ reference_vocab.size() if reference_vocab.word_to_id( used_vocab.id_to_word(w)) == reference_vocab.unk else reference_vocab.word_to_id(used_vocab.id_to_word(w)) for w in s2_no_pad ]) rows.append({ "pred": label_pred[id], "true_label": ex['label'][id], "s1": ' '.join(s1_decoded), "s2": ' '.join(s2_decoded), "s1_unk_percentage": s1_unk_percentage, "s2_unk_percentage": s2_unk_percentage, "s1_mean_freq": s1_mean_freq, "s2_mean_freq": s2_mean_freq, "s1_mean_rank": s1_mean_rank, "s2_mean_rank": s2_mean_rank, "p_0": prob[id, 0], "p_1": prob[id, 1], "p_2": prob[id, 2] }) preds = pd.DataFrame(rows, columns=rows[0].keys()) preds.to_csv( os.path.join(dest_path, prefix + '_predictions_{}.csv'.format(subset))) results[subset] = {} results[subset]['misclassification'] = 1 - np.mean( preds.pred == preds.true_label) if subset == "valid" and np.abs( (1 - np.mean(preds.pred == preds.true_label)) - best_val_acc) > 0.001: logging.error("!!!") logging.error( "Found different best_val_acc. Probably due to changed specification of the model class." ) logging.error("Discrepancy {}".format( (1 - np.mean(preds.pred == preds.true_label)) - best_val_acc)) logging.error("!!!") logging.info(results) json.dump(results, open(os.path.join(dest_path, prefix + '_results.json'), "w"))
def main(mode, config, use_bokeh=False): # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) topical_transformer = topicalq_transformer( config['source_topic_vocab_size'], config['topical_embedding_dim'], config['enc_nhids'], config['topical_word_num'], config['batch_size']) decoder = Decoder(vocab_size=config['trg_vocab_size'], topicWord_size=config['trg_topic_vocab_size'], embedding_dim=config['dec_embed'], topical_dim=config['topical_embedding_dim'], state_dim=config['dec_nhids'], representation_dim=config['enc_nhids'] * 2, match_function=config['match_function'], use_doubly_stochastic=config['use_doubly_stochastic'], lambda_ds=config['lambda_ds'], use_local_attention=config['use_local_attention'], window_size=config['window_size'], use_step_decay_cost=config['use_step_decay_cost'], use_concentration_cost=config['use_concentration_cost'], lambda_ct=config['lambda_ct'], use_stablilizer=config['use_stablilizer'], lambda_st=config['lambda_st']) # here attended dim (representation_dim) of decoder is 2*enc_nhinds # because the context given by the encoder is a bidirectional context if mode == "train": # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') target_topic_sentence = tensor.lmatrix('target_topic') target_topic_binary_sentence = tensor.lmatrix('target_binary_topic') #target_topic_sentence_mask=tensor.lmatrix('target_topic_mask'); sampling_input = tensor.lmatrix('input') source_topical_word = tensor.lmatrix('source_topical') source_topical_mask = tensor.matrix('source_topical_mask') topic_embedding = topical_transformer.apply(source_topical_word) # Get training and development set streams tr_stream = get_tr_stream_with_topic_target(**config) #dev_stream = get_dev_tr_stream_with_topic_target(**config) # Get cost of the model representations = encoder.apply(source_sentence, source_sentence_mask) tw_representation = topical_transformer.look_up.apply( source_topical_word.T) content_embedding = representations[0, :, (representations.shape[2] / 2):] cost = decoder.cost(representations, source_sentence_mask, tw_representation, source_topical_mask, target_sentence, target_sentence_mask, target_topic_sentence, target_topic_binary_sentence, topic_embedding, content_embedding) logger.info('Creating computational graph') perplexity = tensor.exp(cost) perplexity.name = 'perplexity' cg = ComputationGraph(cost) costs_computer = function([ target_sentence, target_sentence_mask, source_sentence, source_sentence_mask, source_topical_word, target_topic_sentence, target_topic_binary_sentence ], (perplexity), on_unused_input='ignore') # Initialize model logger.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() topical_transformer.weights_init = IsotropicGaussian( config['weight_scale']) topical_transformer.biases_init = Constant(0) topical_transformer.push_allocation_config() #don't know whether the initialize is for topical_transformer.look_up.weights_init = Orthogonal() topical_transformer.transformer.weights_init = Orthogonal() topical_transformer.initialize() word_topical_embedding = cPickle.load( open(config['topical_embeddings'], 'rb')) np_word_topical_embedding = numpy.array(word_topical_embedding, dtype='float32') topical_transformer.look_up.W.set_value(np_word_topical_embedding) topical_transformer.look_up.W.tag.role = [] # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog logger.info('Applying dropout') dropout_inputs = [ x for x in cg.intermediary_variables if x.name == 'maxout_apply_output' ] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # Apply weight noise for regularization if config['weight_noise_ff'] > 0.0: logger.info('Applying weight noise to ff layers') enc_params = Selector(encoder.lookup).get_params().values() enc_params += Selector(encoder.fwd_fork).get_params().values() enc_params += Selector(encoder.back_fork).get_params().values() dec_params = Selector( decoder.sequence_generator.readout).get_params().values() dec_params += Selector( decoder.sequence_generator.fork).get_params().values() dec_params += Selector(decoder.state_init).get_params().values() cg = apply_noise(cg, enc_params + dec_params, config['weight_noise_ff']) # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge( Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.items(): logger.info(' {:15}: {}'.format(value.get_value().shape, name)) logger.info("Total number of parameters: {}".format( len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([perplexity], after_batch=True), CheckpointNMT(config['saveto'], config['model_name'], every_n_batches=config['save_freq']) ] # # Set up beam search and sampling computation graphs if necessary # if config['hook_samples'] >= 1 or config['bleu_script'] is not None: # logger.info("Building sampling model") # sampling_representation = encoder.apply( # sampling_input, tensor.ones(sampling_input.shape)) # generated = decoder.generate( # sampling_input, sampling_representation) # search_model = Model(generated) # _, samples = VariableFilter( # bricks=[decoder.sequence_generator], name="outputs")( # ComputationGraph(generated[1])) # # # Add sampling # if config['hook_samples'] >= 1: # logger.info("Building sampler") # extensions.append( # Sampler(model=search_model, data_stream=tr_stream, # model_name=config['model_name'], # hook_samples=config['hook_samples'], # every_n_batches=config['sampling_freq'], # src_vocab_size=config['src_vocab_size'])) # # # Add early stopping based on bleu # if False: # logger.info("Building bleu validator") # extensions.append( # BleuValidator(sampling_input, samples=samples, config=config, # model=search_model, data_stream=dev_stream, # normalize=config['normalized_bleu'], # every_n_batches=config['bleu_val_freq'], # n_best=3, # track_n_models=6)) # # logger.info("Building perplexity validator") # extensions.append( # pplValidation( config=config, # model=costs_computer, data_stream=dev_stream, # model_name=config['model_name'], # every_n_batches=config['sampling_freq'])) # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( Plot('Cs-En', channels=[['decoder_cost_cost']], after_batch=True)) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) initial_learning_rate = config['initial_learning_rate'] log_path = os.path.join(config['saveto'], 'log') if config['reload'] and os.path.exists(log_path): with open(log_path, 'rb') as source: log = cPickle.load(source) last = max(log.keys()) - 1 if 'learning_rate' in log[last]: initial_learning_rate = log[last]['learning_rate'] # Set up training algorithm logger.info("Initializing training algorithm") algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule([ Scale(initial_learning_rate), StepClipping(config['step_clipping']), eval(config['step_rule'])() ]), on_unused_sources='ignore') _learning_rate = algorithm.step_rule.components[0].learning_rate if config['learning_rate_decay']: extensions.append( LearningRateHalver(record_name='validation_cost', comparator=lambda x, y: x > y, learning_rate=_learning_rate, patience_default=3)) else: extensions.append(OldModelRemover(saveto=config['saveto'])) if config['learning_rate_grow']: extensions.append( LearningRateDoubler(record_name='validation_cost', comparator=lambda x, y: x < y, learning_rate=_learning_rate, patience_default=3)) extensions.append( SimplePrinting(config['model_name'], after_batch=True)) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions) # Train! main_loop.run() elif mode == 'translate': logger.info('Creating theano variables') sampling_input = tensor.lmatrix('source') source_topical_word = tensor.lmatrix('source_topical') tw_vocab_overlap = tensor.lmatrix('tw_vocab_overlap') tw_vocab_overlap_matrix = cPickle.load( open(config['tw_vocab_overlap'], 'rb')) tw_vocab_overlap_matrix = numpy.array(tw_vocab_overlap_matrix, dtype='int32') #tw_vocab_overlap=shared(tw_vocab_overlap_matrix); topic_embedding = topical_transformer.apply(source_topical_word) sutils = SamplingBase() unk_idx = config['unk_id'] src_eos_idx = config['src_vocab_size'] - 1 trg_eos_idx = config['trg_vocab_size'] - 1 trg_vocab = _ensure_special_tokens(cPickle.load( open(config['trg_vocab'], 'rb')), bos_idx=0, eos_idx=trg_eos_idx, unk_idx=unk_idx) trg_ivocab = {v: k for k, v in trg_vocab.items()} logger.info("Building sampling model") sampling_representation = encoder.apply( sampling_input, tensor.ones(sampling_input.shape)) topic_embedding = topical_transformer.apply(source_topical_word) tw_representation = topical_transformer.look_up.apply( source_topical_word.T) content_embedding = sampling_representation[0, :, ( sampling_representation.shape[2] / 2):] generated = decoder.generate(sampling_input, sampling_representation, tw_representation, topical_embedding=topic_embedding, content_embedding=content_embedding) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs beam_search = BeamSearch(samples=samples) logger.info("Loading the model..") model = Model(generated) #loader = LoadNMT(config['saveto']) loader = LoadNMT(config['validation_load']) loader.set_model_parameters(model, loader.load_parameters_default()) logger.info("Started translation: ") test_stream = get_dev_stream_with_topicalq(**config) ts = test_stream.get_epoch_iterator() rts = open(config['val_set_source']).readlines() ftrans_original = open(config['val_output_orig'], 'w') saved_weights = [] total_cost = 0.0 pbar = ProgressBar(max_value=len(rts)).start() for i, (line, line_raw) in enumerate(zip(ts, rts)): trans_in = line_raw.split() seq = sutils._oov_to_unk(line[0], config['src_vocab_size'], unk_idx) seq1 = line[1] input_topical = numpy.tile(seq1, (config['beam_size'], 1)) input_ = numpy.tile(seq, (config['beam_size'], 1)) # draw sample, checking to ensure we don't get an empty string back trans, costs, attendeds, weights = \ beam_search.search( input_values={sampling_input: input_,source_topical_word:input_topical,tw_vocab_overlap:tw_vocab_overlap_matrix}, tw_vocab_overlap=tw_vocab_overlap_matrix, max_length=3*len(seq), eol_symbol=trg_eos_idx, ignore_first_eol=True) # normalize costs according to the sequence lengths if config['normalized_bleu']: lengths = numpy.array([len(s) for s in trans]) costs = costs / lengths best = numpy.argsort(costs)[0] try: total_cost += costs[best] trans_out = trans[best] weight = weights[best][:, :len(trans_in)] trans_out = sutils._idx_to_word(trans_out, trg_ivocab) except ValueError: logger.info( "Can NOT find a translation for line: {}".format(i + 1)) trans_out = '<UNK>' saved_weights.append(weight) print(' '.join(trans_out), file=ftrans_original) pbar.update(i + 1) pbar.finish() logger.info("Total cost of the test: {}".format(total_cost)) cPickle.dump(saved_weights, open(config['attention_weights'], 'wb')) ftrans_original.close() # ap = afterprocesser(config) # ap.main() elif mode == 'score': logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') target_topic_sentence = tensor.lmatrix('target_topic') target_topic_binary_sentence = tensor.lmatrix('target_binary_topic') source_topical_word = tensor.lmatrix('source_topical') topic_embedding = topical_transformer.apply(source_topical_word) # Get cost of the model representations = encoder.apply(source_sentence, source_sentence_mask) costs = decoder.cost(representations, source_sentence_mask, target_sentence, target_sentence_mask, target_topic_sentence, target_topic_binary_sentence, topic_embedding) config['batch_size'] = 1 config['sort_k_batches'] = 1 # Get test set stream test_stream = get_tr_stream_with_topic_target(**config) logger.info("Building sampling model") logger.info("Loading the model..") model = Model(costs) loader = LoadNMT(config['validation_load']) loader.set_model_parameters(model, loader.load_parameters_default()) costs_computer = function([ target_sentence, target_sentence_mask, source_sentence, source_sentence_mask, source_topical_word, target_topic_sentence, target_topic_binary_sentence ], (costs), on_unused_input='ignore') iterator = test_stream.get_epoch_iterator() scores = [] att_weights = [] for i, (src, src_mask, trg, trg_mask, te, te_mask, tt, tt_mask, tb, tb_mask) in enumerate(iterator): costs = costs_computer(*[trg, trg_mask, src, src_mask, te, tt, tb]) cost = costs.sum() print(i, cost) scores.append(cost) print(sum(scores) / 10007)
def main(n_hiddens=1500, n_out=200, noise_std=0.5, learning_rate=2e-1, momentum=0.9, gamma=2.0): ###################### # Model ###################### import theano from theano.sandbox.rng_mrg import MRG_RandomStreams import numpy import theano.tensor as tensor from lvq.lvq import AaronLVQ, SupervisedNG, initialize_prototypes from blocks.bricks import Linear, LinearMaxout from blocks.bricks import Rectifier, Softmax, Logistic from blocks.bricks.cost import MisclassificationRate from blocks.config import config from blocks.initialization import Uniform, Constant from blocks.model import Model from lvq.batch_norm import MLP #from blocks.bricks import MLP seed = config.default_seed rng = MRG_RandomStreams(seed) x = tensor.tensor4('features') flat_x = tensor.flatten(x, outdim=2) flat_x_noise = flat_x + rng.normal(size=flat_x.shape, std=noise_std) y = tensor.imatrix('targets') flat_y = tensor.flatten(y, outdim=1) act = Rectifier() mlp = MLP(dims=[784, n_hiddens, n_hiddens, n_out], activations=[act, act, None]) mlp.weights_init = Uniform(0.0, 0.001) mlp.biases_init = Constant(0.0) mlp.initialize() train_out = mlp.apply(flat_x_noise) test_out = mlp.inference(flat_x) lamb = theano.shared(numpy.float32(10.0)) lvq = SupervisedNG(10, n_out, nonlin=True, gamma=gamma, lamb=lamb, name='lvq') test_loss, test_misclass = lvq.apply(test_out, flat_y, 'test') loss, misclass = lvq.apply(train_out, flat_y, 'train') model = Model(loss) ###################### # Data ###################### #from mnist import MNIST from fuel.datasets import MNIST from fuel.streams import DataStream from fuel.schemes import ShuffledScheme, SequentialScheme from fuel.transformers import ScaleAndShift, ForceFloatX mnist_train = MNIST([ 'train' ]) #MNIST('train', drop_input=False, sources=('features', 'targets')) mnist_test = MNIST([ 'test' ]) # MNIST('test', drop_input=False, sources=('features', 'targets')) batch_size = 100 #Batch size for training batch_size_mon = 2000 # Batch size for monitoring and batch normalization n_batches = int( numpy.ceil(float(mnist_train.num_examples) / batch_size_mon)) num_protos = 10 ind = range(mnist_train.num_examples) train_ind = ind[:50000] val_ind = ind[50000:] def preprocessing(data_stream): return ForceFloatX(ScaleAndShift(data_stream, 1 / 255.0, 0.0, which_sources=('features', )), which_sources=('features', )) train_stream_mon = preprocessing( DataStream(mnist_train, iteration_scheme=ShuffledScheme(train_ind, batch_size_mon))) train_stream_bn = preprocessing( DataStream(mnist_train, iteration_scheme=ShuffledScheme(train_ind, batch_size_mon))) train_stream = preprocessing( DataStream(mnist_train, iteration_scheme=ShuffledScheme(train_ind, batch_size))) valid_stream = preprocessing( DataStream(mnist_train, iteration_scheme=ShuffledScheme(val_ind, batch_size))) test_stream = preprocessing( DataStream(mnist_test, iteration_scheme=ShuffledScheme(mnist_test.num_examples, batch_size_mon))) initialize_prototypes(lvq, x, train_out, train_stream_mon) ###################### # Training ###################### from blocks.main_loop import MainLoop from blocks.extensions import FinishAfter, Timing, Printing from blocks.extensions.monitoring import DataStreamMonitoring, TrainingDataMonitoring from blocks.algorithms import GradientDescent, Momentum, RMSProp, Adam from blocks.extensions.saveload import Checkpoint from lvq.extensions import EarlyStopping, LRDecay, MomentumSwitchOff, NCScheduler from lvq.batch_norm import BatchNormExtension lr = theano.shared(numpy.float32(1e-3)) step_rule = Momentum( learning_rate, 0.9 ) #Adam(learning_rate=lr) #RMSProp(learning_rate=1e-5, max_scaling=1e4) num_epochs = 100 earlystop = EarlyStopping('valid_lvq_apply_misclass', 100, './exp/alvq.pkl') main_loop = MainLoop( model=model, data_stream=train_stream, algorithm=GradientDescent(cost=model.outputs[0], parameters=model.parameters, step_rule=step_rule), extensions=[ FinishAfter(after_n_epochs=num_epochs), BatchNormExtension(model, train_stream_bn, n_batches), LRDecay(step_rule.learning_rate, [20, 40, 60, 80]), MomentumSwitchOff(step_rule.momentum, num_epochs - 20), NCScheduler(lamb, 30., 0, num_epochs), DataStreamMonitoring(variables=[test_loss, test_misclass], data_stream=train_stream_mon, prefix='train'), DataStreamMonitoring(variables=[test_loss, test_misclass], data_stream=valid_stream, prefix='valid'), DataStreamMonitoring(variables=[test_loss, test_misclass], data_stream=test_stream, prefix='test'), Timing(), earlystop, Printing(after_epoch=True) ]) main_loop.run() return main_loop.status.get('best_test_lvq_apply_misclass', None)
def train(step_rule, label_dim, state_dim, epochs, seed, dropout, test_cost, experiment_path, features, weight_noise, to_watch, patience, batch_size, batch_norm, **kwargs): print '.. TIMIT experiment' print '.. arguments:', ' '.join(sys.argv) t0 = time.time() # ------------------------------------------------------------------------ # Streams rng = np.random.RandomState(seed) stream_args = dict(rng=rng, batch_size=batch_size) print '.. initializing iterators' train_dataset = Timit('train', features=features) train_stream = construct_stream(train_dataset, **stream_args) dev_dataset = Timit('dev', features=features) dev_stream = construct_stream(dev_dataset, **stream_args) test_dataset = Timit('test', features=features) test_stream = construct_stream(test_dataset, **stream_args) update_stream = construct_stream(train_dataset, n_batches=100, **stream_args) phone_dict = train_dataset.get_phoneme_dict() phoneme_dict = { k: phone_to_phoneme_dict[v] if v in phone_to_phoneme_dict else v for k, v in phone_dict.iteritems() } ind_to_phoneme = {v: k for k, v in phoneme_dict.iteritems()} eol_symbol = ind_to_phoneme['<STOP>'] # ------------------------------------------------------------------------ # Graph print '.. building model' x = T.tensor3('features') y = T.matrix('phonemes') input_mask = T.matrix('features_mask') output_mask = T.matrix('phonemes_mask') theano.config.compute_test_value = 'off' x.tag.test_value = np.random.randn(100, 24, 123).astype(floatX) y.tag.test_value = np.ones((30, 24), dtype=floatX) input_mask.tag.test_value = np.ones((100, 24), dtype=floatX) output_mask.tag.test_value = np.ones((30, 24), dtype=floatX) seq_len = 100 input_dim = 123 activation = Tanh() recurrent_init = IdentityInit(0.99) rec1 = TimLSTM(not batch_norm, input_dim, state_dim, activation, name='LSTM') rec1.initialize() l1 = Linear(state_dim, label_dim + 1, name='out_linear', weights_init=Orthogonal(), biases_init=Constant(0.0)) l1.initialize() o1 = rec1.apply(x) y_hat_o = l1.apply(o1) shape = y_hat_o.shape y_hat = Softmax().apply(y_hat_o.reshape((-1, shape[-1]))).reshape(shape) y_mask = output_mask y_hat_mask = input_mask # ------------------------------------------------------------------------ # Costs and Algorithm ctc_cost = T.sum( ctc.cpu_ctc_th(y_hat_o, T.sum(y_hat_mask, axis=0), y + T.ones_like(y), T.sum(y_mask, axis=0))) batch_cost = ctc_cost.copy(name='batch_cost') bs = y.shape[1] cost_train = aggregation.mean(batch_cost, bs).copy("sequence_cost") cost_per_character = aggregation.mean( batch_cost, output_mask.sum()).copy("character_cost") cg_train = ComputationGraph(cost_train) model = Model(cost_train) train_cost_per_character = aggregation.mean( cost_train, output_mask.sum()).copy("train_character_cost") algorithm = GradientDescent(step_rule=step_rule, cost=cost_train, parameters=cg_train.parameters, on_unused_sources='warn') # ------------------------------------------------------------------------ # Monitoring and extensions parameters = model.get_parameter_dict() observed_vars = [ cost_train, train_cost_per_character, aggregation.mean(algorithm.total_gradient_norm) ] for name, param in parameters.iteritems(): observed_vars.append(param.norm(2).copy(name + "_norm")) observed_vars.append( algorithm.gradients[param].norm(2).copy(name + "_grad_norm")) train_monitor = TrainingDataMonitoring(variables=observed_vars, prefix="train", after_epoch=True) dev_monitor = DataStreamMonitoring( variables=[cost_train, cost_per_character], data_stream=dev_stream, prefix="dev") train_ctc_monitor = CTCMonitoring(x, input_mask, y_hat, eol_symbol, train_stream, prefix='train', every_n_epochs=1, before_training=True, phoneme_dict=phoneme_dict, black_list=black_list, train=True) dev_ctc_monitor = CTCMonitoring(x, input_mask, y_hat, eol_symbol, dev_stream, prefix='dev', every_n_epochs=1, phoneme_dict=phoneme_dict, black_list=black_list) extensions = [] if 'load_path' in kwargs: extensions.append(Load(kwargs['load_path'])) extensions.extend([ FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor, train_ctc_monitor, dev_ctc_monitor ]) if test_cost: test_monitor = DataStreamMonitoring( variables=[cost_train, cost_per_character], data_stream=test_stream, prefix="test") test_ctc_monitor = CTCMonitoring(x, input_mask, y_hat, eol_symbol, test_stream, prefix='test', every_n_epochs=1, phoneme_dict=phoneme_dict, black_list=black_list) extensions.append(test_monitor) extensions.append(test_ctc_monitor) #if not os.path.exists(experiment_path): # os.makedirs(experiment_path) #best_path = os.path.join(experiment_path, 'best/') #if not os.path.exists(best_path): # os.mkdir(best_path) #best_path = os.path.join(best_path, 'model.bin') extensions.append(EarlyStopping(to_watch, patience, '/dev/null')) extensions.extend([ProgressBar(), Printing()]) # ------------------------------------------------------------------------ # Main Loop main_loop = MainLoop(model=model, data_stream=train_stream, algorithm=algorithm, extensions=extensions) print "Building time: %f" % (time.time() - t0) # if write_predictions: # with open('predicted.txt', 'w') as f_pred: # with open('targets.txt', 'w') as f_targets: # evaluator = CTCEvaluator( # eol_symbol, x, input_mask, y_hat, phoneme_dict, black_list) # evaluator.evaluate(dev_stream, file_pred=f_pred, # file_targets=f_targets) # return main_loop.run()
parameters=cg.parameters, step_rule=CompositeRule(step_rules)) from blocks.extensions import Timing, FinishAfter, Printing, ProgressBar from blocks.extensions.monitoring import TrainingDataMonitoring from fuel.streams import DataStream from fuel.schemes import SequentialScheme from blocks.main_loop import MainLoop from blocks.extensions.saveload import Checkpoint from blocks.model import Model main_loop = MainLoop(algorithm=algorithm, data_stream=DataStream.default_stream( dataset=train_dataset, iteration_scheme=SequentialScheme( train_dataset.num_examples, batch_size=10)), model=Model(y_est), extensions=[ Timing(), FinishAfter(after_n_epochs=200), TrainingDataMonitoring(variables=[cost], prefix="train", after_epoch=True), Printing(), ProgressBar(), Checkpoint(path="./checkpoint.zip") ]) main_loop.run()
def main(config): # Create Theano variables logger.info('Creating theano variables') source_char_seq = tensor.lmatrix('source_char_seq') source_sample_matrix = tensor.btensor3('source_sample_matrix') source_char_aux = tensor.bmatrix('source_char_aux') source_word_mask = tensor.bmatrix('source_word_mask') target_char_seq = tensor.lmatrix('target_char_seq') target_char_aux = tensor.bmatrix('target_char_aux') target_char_mask = tensor.bmatrix('target_char_mask') target_sample_matrix = tensor.btensor3('target_sample_matrix') target_word_mask = tensor.bmatrix('target_word_mask') target_resample_matrix = tensor.btensor3('target_resample_matrix') target_prev_char_seq = tensor.lmatrix('target_prev_char_seq') target_prev_char_aux = tensor.bmatrix('target_prev_char_aux') src_vocab = _ensure_special_tokens(pickle.load( open(config['src_vocab'], 'rb')), bos_idx=0, eos_idx=config['src_vocab_size'] - 1, unk_idx=config['unk_id']) trg_vocab = _ensure_special_tokens(pickle.load( open(config['trg_vocab'], 'rb')), bos_idx=0, eos_idx=config['trg_vocab_size'] - 1, unk_idx=config['unk_id']) target_bos_idx = trg_vocab[config['bos_token']] target_space_idx = trg_vocab[' '] logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['src_dgru_nhids'], config['enc_nhids'], config['src_dgru_depth'], config['bidir_encoder_depth']) decoder = Decoder(config['trg_vocab_size'], config['dec_embed'], config['trg_dgru_nhids'], config['trg_igru_nhids'], config['dec_nhids'], config['enc_nhids'] * 2, config['transition_depth'], config['trg_igru_depth'], config['trg_dgru_depth'], target_space_idx, target_bos_idx) representation = encoder.apply(source_char_seq, source_sample_matrix, source_char_aux, source_word_mask) cost = decoder.cost(representation, source_word_mask, target_char_seq, target_sample_matrix, target_resample_matrix, target_char_aux, target_char_mask, target_word_mask, target_prev_char_seq, target_prev_char_aux) # Set up model logger.info("Building model") training_model = Model(cost) # Set extensions logger.info("Initializing extensions") # Reload model if necessary extensions = [LoadNMT(config['saveto'])] # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=training_model, algorithm=None, data_stream=None, extensions=extensions) for extension in main_loop.extensions: extension.main_loop = main_loop main_loop._run_extensions('before_training') char_embedding = encoder.decimator.apply(source_char_seq.T, source_sample_matrix, source_char_aux.T) embedding(Model(char_embedding), src_vocab)
on_unused_sources='warn') # enrich the logged information extensions.append(Timing(every_n_batches=100)) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions) # Train! main_loop.run() training_cost = create_model(train_encoder, train_decoder) # Set up training model logger.info("Building model") train_model = Model(training_cost) dev_stream = get_dev_stream(**exp_config) main(train_model, training_cost, exp_config, masked_stream, dev_stream=dev_stream, use_bokeh=True)
def get_confidence_function(exp_config): # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target_suffix') target_sentence_mask = tensor.matrix('target_suffix_mask') target_prefix = tensor.lmatrix('target_prefix') target_prefix_mask = tensor.matrix('target_prefix_mask') logger.info('Creating computational graph') # build the model encoder = BidirectionalEncoder(exp_config['src_vocab_size'], exp_config['enc_embed'], exp_config['enc_nhids']) # Note: the 'min_risk' kwarg tells the decoder which sequence_generator and cost_function to use decoder = NMTPrefixDecoder(exp_config['trg_vocab_size'], exp_config['dec_embed'], exp_config['dec_nhids'], exp_config['enc_nhids'] * 2, loss_function='cross_entropy') # rename to match baseline NMT systems decoder.name = 'decoder' predictions, merged_states = decoder.prediction_tags( encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask, target_prefix, target_prefix_mask) # WORKING: also get the softmax prediction feature # WORKING: add features for source len, prefix len, position in suffix (position in suffix only makes sense if we're training on predictions) p_shape = predictions.shape predictions = predictions.reshape([p_shape[0] * p_shape[1], p_shape[2]]) prediction_softmax = tensor.nnet.nnet.softmax( predictions.reshape([p_shape[0] * p_shape[1], p_shape[2]])).reshape(p_shape) prediction_feature = prediction_softmax.max(axis=-1)[:, :, None] all_features = tensor.concatenate([merged_states, prediction_feature], axis=-1) confidence_output = decoder.sequence_generator.confidence_predictions( all_features) logger.info('Creating computational graph') confidence_model = Model(confidence_output) # Note that the parameters of this model must be pretrained, otherwise this doesn't make sense param_values = LoadNMT.load_parameter_values( exp_config['saved_parameters'], brick_delimiter=None) LoadNMT.set_model_parameters(confidence_model, param_values) confidence_param_values = LoadNMT.load_parameter_values( exp_config['confidence_saved_parameters'], brick_delimiter=None) LoadNMT.set_model_parameters(confidence_model, confidence_param_values) confidence_function = confidence_model.get_theano_function() return confidence_function
x = x.swapaxes(0, 1) y = y.swapaxes(0, 1) in_size = num_features out_size = num_features y_hat, cost, cells = nn_fprop(x, y, in_size, out_size, hidden_size[network_mode], num_layers, layer_models[network_mode][0], 'MDN', training=False) main_loop = MainLoop(algorithm=None, data_stream=None, model=Model(cost), extensions=[saveload.Load(save_path[network_mode])]) for extension in main_loop.extensions: extension.main_loop = main_loop main_loop._run_extensions('before_training') bin_model = main_loop.model print 'Model loaded. Building prediction function...' hiddens = [] initials = [] for i in range(num_layers): brick = [ b for b in bin_model.get_top_bricks() if b.name == layer_models[network_mode][i] + str(i) + '-' ][0] hiddens.extend(
def main(config, tr_stream, dev_stream, source_vocab, target_vocab, use_bokeh=False): # add the tags from this function to the IMT datastream # prediction function signature # [target_suffix, source_mask, source, target_prefix_mask, target_prefix, target_suffix_mask] prediction_function = get_prediction_function(exp_config=config) tr_stream = Mapping( tr_stream, CallPredictionFunctionOnStream(prediction_function, [1, 0, 5, 4, 7, 6]), #tr_stream = Mapping(tr_stream, CallFunctionOnStream(prediction_function, [6, 1, 0, 5, 4, 7]), add_sources=('predictions', 'orig_readouts', 'prediction_tags')) # now datastream has 11 things import ipdb ipdb.set_trace() # WORKING: call prediction function twice to get new readouts on predictions instead of reference suffs # the only difference is the index of the suffix tr_stream = Mapping(tr_stream, CallPredictionFunctionOnStream(prediction_function, [1, 0, 5, 4, 7, 8]), add_sources=('dummy_predictions', 'readouts', 'dummy_prediction_tags')) import ipdb ipdb.set_trace() # Create the prediction confidence model # the first draft of this model uses the readout output (before the post-merge step) as the per-timestep state vector # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') # Note that the _names_ are changed from normal NMT # for IMT training, we use only the suffix as the reference target_sentence = tensor.lmatrix('target_suffix') target_sentence_mask = tensor.matrix('target_suffix_mask') target_prefix = tensor.lmatrix('target_prefix') target_prefix_mask = tensor.matrix('target_prefix_mask') # symbolic variable which tags each timestep as GOOD/BAD # Note: later this might be tags for a hypothesis i.e. from TER(p), right now the timesteps are actually determined by the reference # By zipping the confidence model output with the reference, we get the model's confidence that this reference word # will be predicted correctly prediction_tags = tensor.matrix('prediction_tags') readouts = tensor.tensor3('readouts') # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) decoder = NMTPrefixDecoder(config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2, loss_function='cross_entropy') # rename to match baseline NMT systems decoder.name = 'decoder' cost = decoder.confidence_cost( encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask, target_prefix, target_prefix_mask, readouts, prediction_tags) # WORKING: add l2 regularization logger.info('Creating computational graph') # working: implement cost for confidence model cg = ComputationGraph(cost) # INITIALIZATION logger.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() #cost_cg = ComputationGraph(cost) if config['l2_reg']: l2_reg_alpha = config['l2_reg_alpha'] model_weights = VariableFilter(roles=[WEIGHT])(cg.variables) for W in model_weights: cost = cost + (l2_reg_alpha * (W**2).sum()) # do we need to name the cost variable again? cost.name = 'cost' cg = ComputationGraph(cost) # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog # this is the probability of dropping out, so you probably want to make it <=0.5 logger.info('Applying dropout') dropout_inputs = [ x for x in cg.intermediary_variables if x.name in set([ 'confidence_model1_apply_output', 'confidence_model2_apply_output', 'confidence_model3_apply_output' ]) ] # if x.name == 'maxout_apply_output'] # if x.name == 'maxout_apply_output'] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # WORKING: implement confidence -- remove all params except output model cost_model = Model(cost) model_params = cost_model.get_parameter_dict() trainable_params = cg.parameters import ipdb ipdb.set_trace() print('trainable params') #params_to_remove = [model_params[k] for k in model_params.keys() if 'confidence' not in k] #for p in params_to_remove: # trainable_params.remove(p) # target_embeddings = model.get_parameter_dict()['/target_recurrent_lm_with_alignments/target_embeddings.W'] # trainable_params.remove(source_embeddings) # trainable_params.remove(target_embeddings) # END WORKING: implement confidence -- remove all params except output model # TODO: fixed dropout mask for recurrent params? # Print shapes # shapes = [param.get_value().shape for param in cg.parameters] # logger.info("Parameter shapes: ") # for shape, count in Counter(shapes).most_common(): # logger.info(' {:15}: {}'.format(shape, count)) # logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names # enc_dec_param_dict = merge(Selector(encoder).get_parameters(), # Selector(decoder).get_parameters()) # logger.info("Parameter names: ") # for name, value in enc_dec_param_dict.items(): # logger.info(' {:15}: {}'.format(value.get_value().shape, name)) # logger.info("Total number of parameters: {}" # .format(len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # create the training directory, and copy this config there if directory doesn't exist if not os.path.isdir(config['saveto']): os.makedirs(config['saveto']) shutil.copy(config['config_file'], config['saveto']) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), # TrainingDataMonitoring(trainable_params, after_batch=True), # Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] # WORKING: confidence prediction #monitor everything that could possibly be relevant # Set up the sampling graph for validation during training # Theano variables for the sampling graph # Note this also loads the model parameters sampling_vars = load_params_and_get_beam_search(config, encoder=encoder, decoder=decoder) beam_search, search_model, samples, sampling_input, sampling_prefix = sampling_vars #if config['hook_samples'] >= 1: # logger.info("Building sampler") # extensions.append( # Sampler(model=search_model, data_stream=tr_stream, # hook_samples=config['hook_samples'], # every_n_batches=config['sampling_freq'], # src_vocab=source_vocab, # trg_vocab=target_vocab, # src_vocab_size=config['src_vocab_size'])) # Add early stopping based on bleu #if config['bleu_script'] is not None: # logger.info("Building bleu validator") # extensions.append( # BleuValidator(sampling_input, sampling_prefix, samples=samples, config=config, # model=search_model, data_stream=dev_stream, # src_vocab=source_vocab, # trg_vocab=target_vocab, # normalize=config['normalized_bleu'], # every_n_batches=config['bleu_val_freq'])) # TODO: add first-word accuracy validation # TODO: add IMT meteor early stopping #if config.get('imt_f1_validation', None) is not None: # logger.info("Building imt F1 validator") # extensions.append( # IMT_F1_Validator(sampling_input, sampling_prefix, # samples=samples, # config=config, # model=search_model, data_stream=dev_stream, # src_vocab=source_vocab, # trg_vocab=target_vocab, # normalize=config['normalized_bleu'], # every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # TODO: hacking here: get the predictions of the confidence model using the `readouts` source of the data_stream # Note that the parameters of this model must be pretrained, otherwise this doesn't make sense # confidence_predictions = decoder.get_confidence(readouts) # confidence_prediction_model = Model(confidence_predictions) # # confidence_param_values = LoadNMT.load_parameter_values(config['confidence_saved_parameters'], brick_delimiter=None) # LoadNMT.set_model_parameters(confidence_prediction_model, confidence_param_values) # # confidence_prediction_func = confidence_prediction_model.get_theano_function() # import ipdb; ipdb.set_trace() # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( # Plot(config['model_save_directory'], channels=[['decoder_confidence_cost_cost']], Plot(config['model_save_directory'], channels=[['cost']], every_n_batches=10)) # Set up training algorithm logger.info("Initializing training algorithm") # WORKING: implement confidence model # if there is dropout or random noise, we need to use the output of the modified graph algorithm = GradientDescent( cost=cg.outputs[0], parameters=trainable_params, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ]), # eval(config['step_rule'])(), RemoveNotFinite()]), # step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]), on_unused_sources='warn') #if config['dropout'] < 1.0: # algorithm = GradientDescent( # cost=cg.outputs[0], parameters=trainable_params, # step_rule=CompositeRule([StepClipping(config['step_clipping']), # eval(config['step_rule'])(), RemoveNotFinite()]), # # step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]), # on_unused_sources='warn' # ) #else: # algorithm = GradientDescent( # cost=cost, parameters=cg.parameters, # step_rule=CompositeRule([StepClipping(config['step_clipping']), # eval(config['step_rule'])()]), # on_unused_sources='warn' # ) # END WORKING: implement confidence model import ipdb ipdb.set_trace() # enrich the logged information extensions.append(Timing(every_n_batches=100)) # WORKING: debugging confidence # get theano function from model # WORKING: implement word-level confidence cost # @application(inputs=['representation', 'source_sentence_mask', # 'target_sentence_mask', 'target_sentence', 'target_prefix_mask', 'target_prefix'], # outputs=['cost']) # def confidence_cost(self, representation, source_sentence_mask, # target_sentence, target_sentence_mask, target_prefix, target_prefix_mask): logger.info('Creating theano variables') # WORKING: 26.9.16 -- get confidence outputs directly from (source, prefix, suffix) inputs # This is equivalent to forced alignment --> confidence scores # Note: but this section should probably be in "evaluate" mode, not here in "train" # source_sentence = tensor.lmatrix('source') # source_sentence_mask = tensor.matrix('source_mask') # Note that the _names_ are changed from normal NMT # for IMT training, we use only the suffix as the reference #target_sentence = tensor.lmatrix('target_suffix') #target_sentence_mask = tensor.matrix('target_suffix_mask') # TODO: change names back to *_suffix, there is currently a theano function name error # TODO: in the GradientDescent Algorithm #target_prefix = tensor.lmatrix('target_prefix') #target_prefix_mask = tensor.matrix('target_prefix_mask') # confidence_output = decoder.confidence_cost( # encoder.apply(source_sentence, source_sentence_mask), # source_sentence_mask, target_sentence, target_sentence_mask, # target_prefix, target_prefix_mask) # confidence_model = Model(confidence_output) # t_cost_func = confidence_model.get_theano_function() # inputs # [source_mask, source, target_prefix_mask, target_prefix, target_suffix_mask, target_suffix] #import ipdb;ipdb.set_trace() # get the right args from the datastream # TODO: just print source, prefix, suffix, prediction, correct to new files -- this makes sure everything is aligned # OUTPUT_DIR = '/media/1tb_drive/imt_models/word_prediction_accuracy_experiments/en-de/exp_1' # for the_file in os.listdir(OUTPUT_DIR): # file_path = os.path.join(OUTPUT_DIR, the_file) # try: # if os.path.isfile(file_path): # os.unlink(file_path) # except Exception as e: # print(e) # # def write_file_truncate_mask(filename, data, mask, mode='a'): # ''' data is list of list ''' # # assert len(data) == len(mask) # with codecs.open(filename, mode, encoding='utf8') as out: # for l, m in zip(data, mask): # output = u' '.join(l[:int(m.sum())]) + u'\n' # out.write(output) # logger.info('Wrote file: {}'.format(filename)) # # # target_ivocab = {k:v.decode('utf8') for v,k in target_vocab.items()} # source_ivocab = {k:v.decode('utf8') for v,k in source_vocab.items()} # import ipdb; ipdb.set_trace() # tag_ivocab = {1: 'True', 0: 'False'} # # test_iter = tr_stream.get_epoch_iterator() # it = 0 # for t_source, t_source_mask, t_target, t_target_mask, t_target_prefix, t_target_prefix_mask, t_target_suffix, t_target_suffix_mask in test_iter: # if it <= 1000: # it += 1 # t_cost = t_cost_func(t_source_mask, t_source, t_target_prefix_mask, t_target_prefix, t_target_suffix_mask, t_target_suffix) # readouts = t_cost[0] # preds = readouts.argmax(axis=2) # correct = preds.T == t_target_suffix # # # source_output = os.path.join(OUTPUT_DIR,'sources.en') # prefix_output = os.path.join(OUTPUT_DIR,'prefixes.de') # suffix_output = os.path.join(OUTPUT_DIR,'suffixes.de') # prediction_output = os.path.join(OUTPUT_DIR,'predictions.de') # correct_output = os.path.join(OUTPUT_DIR,'prefix_word_prediction_acc.out') # # source_text = [[source_ivocab[w] for w in s] for s in t_source] # prefix_text = [[target_ivocab[w] for w in s] for s in t_target_prefix] # suffix_text = [[target_ivocab[w] for w in s] for s in t_target_suffix] # pred_text = [[target_ivocab[w] for w in s] for s in preds.T] # correct_text = [[tag_ivocab[w] for w in s] for s in correct] # # # for triple in zip([source_output, prefix_output, suffix_output, prediction_output, correct_output], # [source_text, prefix_text, suffix_text, pred_text, correct_text], # [t_source_mask, t_target_prefix_mask, t_target_suffix_mask, t_target_suffix_mask, t_target_suffix_mask]): # write_file_truncate_mask(*triple) # else: # break # # import ipdb; ipdb.set_trace() #t_cost = t_cost_func(t_source, t_target_prefix) #t_cost = t_cost_func(t_target_suffix, t_source_mask, t_source, t_target_prefix_mask, t_target_prefix, t_target_suffix_mask) #t_cost = t_cost_func(t_source_mask, t_source, t_target_prefix_mask, t_target_prefix, t_target_suffix_mask, t_target_suffix) # return confidence_cost, flat_y, confidence_logits, readouts #predictions = t_cost[0].argmax(axis=2) # TODO: next step -- print gradients and weights during training find out where nan is coming from # TODO: look at the gradient of this function with respect to parameters? -- see here: http://deeplearning.net/software/theano/tutorial/gradients.html # TODO: function which adds right/wrong tags for model predictions to the datastream. In this case we can learn a simple linear model as a baseline # TODO: print predictions for each batch for each timestep to file -- _dont shuffle_ so that we get the right order # import ipdb;ipdb.set_trace() # from blocks reverse_words example # observables = [ # cost, min_energy, max_energy, mean_activation, # batch_size, max_length, cost_per_character, # algorithm.total_step_norm, algorithm.total_gradient_norm] # for name, parameter in trainable_params.items(): # observables.append(parameter.norm(2).copy(name + "_norm")) # observables.append(algorithm.gradients[parameter].norm(2).copy( # name + "_grad_norm")) for i, (k, v) in enumerate(algorithm.updates): v.name = k.name + '_{}'.format(i) aux_vars = [v for v in cg.auxiliary_variables[-3:]] # import ipdb; ipdb.set_trace() extensions.extend([ TrainingDataMonitoring([cost], after_batch=True), # TrainingDataMonitoring([v for k,v in algorithm.updates[:2]], after_batch=True), # TrainingDataMonitoring(aux_vars, after_batch=True), # TrainingDataMonitoring(trainable_params, after_batch=True), Printing(after_batch=True) ]) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions) import ipdb ipdb.set_trace() # Train! main_loop.run()
def main(argv): name = argv[1] files = map(lambda p: join(folder, p), listdir(folder)) file = next(filter(lambda n: name in n, files)) print(file) p = load_parameter_values(file) net = net_dvc((128, 128)) x = tensor.tensor4('image_features') y_hat = net.apply(x) g = Model(y_hat) for k, v in p.items(): p[k] = v.astype('float32') g.set_parameter_values(p) a, t, v = get_dvc((128, 128), trainning=False, shortcut=False) run = function([x], y_hat) def run_test(data): res = [] for i in data.get_epoch_iterator(): res.extend(run(i[0])) return res def max_index(l): if l[0] > l[1]: return 0 else: return 1 def write_kaggle(f, l): f.write("id,label\n") for i, e in enumerate(l, start=1): f.write(str(i) + "," + str(e) + "\n") def kaggle(file, data): write_kaggle(file, map(max_index, run_test(data))) def accuracy(data): res = [] true = [] for i in data.get_epoch_iterator(): res.extend(run(i[0])) true.extend(i[1]) res = map(max_index, res) total = 0 equal = 0 for r, t in zip(res, true): total += 1 equal += 1 if r == t else 0 return equal / total print("Training accuracy: ", accuracy(a)) print("Test accuracy: ", accuracy(v)) kaggle_file = join(result_folder, name + ".kaggle") print(kaggle_file) with open(kaggle_file, 'w') as f: kaggle(f, t)
def evaluate_extractive_qa(config, tar_path, part, num_examples, dest_path, qids=None, dataset=None): if not dest_path: dest_path = os.path.join(os.path.dirname(tar_path), 'predictions.json') log_path = os.path.splitext(dest_path)[0] + '_log.json' if qids: qids = qids.split(',') if dataset: dataset = SQuADDataset(dataset, ('all', )) c = config data, qam = initialize_data_and_model(c) costs = qam.apply_with_default_vars() cg = Model(costs) with open(tar_path) as src: cg.set_parameter_values(load_parameters(src)) predicted_begins, = VariableFilter(name='predicted_begins')(cg) predicted_ends, = VariableFilter(name='predicted_ends')(cg) compute = {'begins': predicted_begins, 'ends': predicted_ends} if c['coattention']: d2q_att_weights, = VariableFilter(name='d2q_att_weights')(cg) q2d_att_weights, = VariableFilter(name='q2d_att_weights')(cg) compute.update({'d2q': d2q_att_weights, 'q2d': q2d_att_weights}) compute['costs'] = costs predict_func = theano.function(qam.input_vars.values(), compute) logger.debug("Ready to evaluate") done_examples = 0 num_correct = 0 def print_stats(): print('EXACT MATCH RATIO: {}'.format(num_correct / float(done_examples))) predictions = {} log = {} stream = data.get_stream(part, batch_size=1, shuffle=part == 'train', raw_text=True, q_ids=True, dataset=dataset) for example in stream.get_epoch_iterator(as_dict=True): if done_examples == num_examples: break q_id = vec2str(example['q_ids'][0]) if qids and not q_id in qids: continue example['contexts_text'] = [map(vec2str, example['contexts_text'][0])] example['questions_text'] = [ map(vec2str, example['questions_text'][0]) ] feed = dict(example) del feed['q_ids'] del feed['contexts_text'] del feed['questions_text'] del feed['contexts_text_mask'] result = predict_func(**feed) correct_answer_span = slice(example['answer_begins'][0], example['answer_ends'][0]) predicted_answer_span = slice(result['begins'][0], result['ends'][0]) correct_answer = example['contexts_text'][0][correct_answer_span] answer = example['contexts_text'][0][predicted_answer_span] is_correct = correct_answer_span == predicted_answer_span context = example['contexts_text'][0] question = example['questions_text'][0] context_def_map = example['contexts_def_map'] # pretty print outcome = 'correct' if is_correct else 'wrong' print('#{}'.format(done_examples)) print(u"CONTEXT:", detokenize(context)) print(u"QUESTION:", detokenize(question)) print(u"RIGHT ANSWER: {}".format(detokenize(correct_answer))) print( u"ANSWER (span=[{}, {}], {}):".format(predicted_answer_span.start, predicted_answer_span.stop, outcome), detokenize(answer)) print(u"COST: {}".format(float(result['costs'][0]))) print(u"DEFINITIONS AVAILABLE FOR:") for pos in set(context_def_map[:, 1]): print(context[pos]) print() # update statistics done_examples += 1 num_correct += is_correct # save the results predictions[q_id] = detokenize(answer) log_entry = { 'context': context, 'question': question, 'answer': answer, 'correct_answer': correct_answer, 'cost': float(result['costs'][0]) } if c['coattention']: log_entry['d2q'] = cPickle.dumps(result['d2q'][0]) log_entry['q2d'] = cPickle.dumps(result['q2d'][0]) log[q_id] = log_entry if done_examples % 100 == 0: print_stats() print_stats() with open(log_path, 'w') as dst: json.dump(log, dst, indent=2, sort_keys=True) with open(dest_path, 'w') as dst: json.dump(predictions, dst, indent=2, sort_keys=True)