def __init__(self, exp_config):
    self.beam_search, self.sampling_input = load_params_and_get_beam_search(exp_config)
    self.exp_config = exp_config
    # how many hyps should be output (only used in file prediction mode)
    self.n_best = exp_config.get('n_best', 1)

    self.source_lang = exp_config.get('source_lang', 'en')
    self.target_lang = exp_config.get('target_lang', 'es')

    tokenize_script = exp_config.get('tokenize_script', None)
    detokenize_script = exp_config.get('detokenize_script', None)
    if tokenize_script is not None and detokenize_script is not None:
        self.tokenizer_cmd = [tokenize_script, '-l', self.source_lang,
                              '-q', '-', '-no-escape', '1']
        self.detokenizer_cmd = [detokenize_script, '-l', self.target_lang, '-q', '-']
    else:
        self.tokenizer_cmd = None
        self.detokenizer_cmd = None

    # this index will get overwritten with the EOS token by _ensure_special_tokens
    # IMPORTANT: the index must be created in the same way it was for training,
    # otherwise the predicted indices will be nonsense
    # Make sure that src_vocab_size and trg_vocab_size are correct in your configuration
    self.src_eos_idx = exp_config['src_vocab_size'] - 1
    self.trg_eos_idx = exp_config['trg_vocab_size'] - 1
    self.unk_idx = exp_config['unk_id']

    # Get vocabularies and inverse indices
    # Note: _ensure_special_tokens will _overwrite_ anything at bos_idx, eos_idx, and unk_idx
    self.src_vocab = _ensure_special_tokens(
        pickle.load(open(exp_config['src_vocab'])),
        bos_idx=0, eos_idx=self.src_eos_idx, unk_idx=self.unk_idx)
    self.src_ivocab = {v: k for k, v in self.src_vocab.items()}
    self.trg_vocab = _ensure_special_tokens(
        pickle.load(open(exp_config['trg_vocab'])),
        bos_idx=0, eos_idx=self.trg_eos_idx, unk_idx=self.unk_idx)
    self.trg_ivocab = {v: k for k, v in self.trg_vocab.items()}
def get_dev_stream_with_context_features(val_context_features=None, val_set=None,
                                         src_vocab=None, src_vocab_size=30000,
                                         unk_id=1, **kwargs):
    """Setup development set stream if necessary."""

    def _get_np_array(filename):
        return numpy.load(filename)['arr_0']

    dev_stream = None
    if val_set is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)

        dev_dataset = TextFile([val_set], src_vocab, None)

        # now add the source with the image features
        # create the image datastream (iterate over a file line-by-line)
        con_features = _get_np_array(val_context_features)
        con_feature_dataset = IterableDataset(con_features)
        valid_image_stream = DataStream(con_feature_dataset)

        # dev_stream = DataStream(dev_dataset)
        dev_stream = Merge([dev_dataset.get_example_stream(),
                            valid_image_stream],
                           ('source', 'initial_context'))
        # dev_stream = dev_stream.get_example_stream()

    return dev_stream
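# A minimal usage sketch (not part of the original module): it shows how the dev
# stream above could be built and inspected. The file paths are hypothetical
# placeholders for whatever the experiment config provides.
def _example_dev_stream_with_context_features():
    dev_stream = get_dev_stream_with_context_features(
        val_context_features='data/dev.context.npz',  # hypothetical .npz containing 'arr_0'
        val_set='data/dev.src',                        # hypothetical tokenized dev source file
        src_vocab='data/vocab.src.pkl',                # hypothetical pickled {token: index} dict
        src_vocab_size=30000,
        unk_id=1)
    # each example pairs one source index sequence with one context feature vector
    for source, initial_context in dev_stream.get_epoch_iterator():
        print(len(source), initial_context.shape)
        break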
def get_dev_stream_with_prefix_file(val_set=None, val_set_grndtruth=None,
                                    val_set_prefixes=None, val_set_suffixes=None,
                                    src_vocab=None, src_vocab_size=30000,
                                    trg_vocab=None, trg_vocab_size=30000,
                                    unk_id=1, return_vocab=False, **kwargs):
    """Setup development stream with user-provided source, target, prefixes, and suffixes."""

    dev_stream = None
    if val_set is not None and val_set_grndtruth is not None and \
            val_set_prefixes is not None and val_set_suffixes is not None:

        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        trg_vocab = _ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict)
            else cPickle.load(open(trg_vocab)),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

        # Note: the user should have already provided the EOS token in the data representation
        # Note: for the suffix. The reason we need EOS tokens in the reference file is that IMT
        # Note: systems need to evaluate metrics which count prediction of the </S> token,
        # Note: and evaluation scripts are called on the files
        dev_source_dataset = TextFile([val_set], src_vocab,
                                      bos_token='<S>',
                                      eos_token='</S>',
                                      unk_token='<UNK>')
        dev_target_dataset = TextFile([val_set_grndtruth], trg_vocab,
                                      bos_token='<S>',
                                      eos_token='</S>',
                                      unk_token='<UNK>')
        dev_prefix_dataset = TextFile([val_set_prefixes], trg_vocab,
                                      bos_token='<S>',
                                      eos_token=None,
                                      unk_token='<UNK>')
        dev_suffix_dataset = TextFile([val_set_suffixes], trg_vocab,
                                      bos_token=None,
                                      eos_token=None,
                                      unk_token='<UNK>')

        dev_stream = Merge([dev_source_dataset.get_example_stream(),
                            dev_target_dataset.get_example_stream(),
                            dev_prefix_dataset.get_example_stream(),
                            dev_suffix_dataset.get_example_stream()],
                           ('source', 'target', 'target_prefix', 'target_suffix'))

    if return_vocab:
        return dev_stream, src_vocab, trg_vocab
    else:
        return dev_stream
def get_dev_stream_with_prefixes(val_set=None, val_set_grndtruth=None,
                                 src_vocab=None, src_vocab_size=30000,
                                 trg_vocab=None, trg_vocab_size=30000,
                                 unk_id=1, return_vocab=False, **kwargs):
    """Setup development set stream if necessary."""

    dev_stream = None
    if val_set is not None and val_set_grndtruth is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        trg_vocab = _ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict)
            else cPickle.load(open(trg_vocab)),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

        dev_source_dataset = TextFile([val_set], src_vocab,
                                      bos_token='<S>',
                                      eos_token='</S>',
                                      unk_token='<UNK>')
        dev_target_dataset = TextFile([val_set_grndtruth], trg_vocab,
                                      bos_token='<S>',
                                      eos_token='</S>',
                                      unk_token='<UNK>')

        dev_stream = Merge([dev_source_dataset.get_example_stream(),
                            dev_target_dataset.get_example_stream()],
                           ('source', 'target'))

        # now add prefixes and suffixes to this stream
        dev_stream = Mapping(dev_stream,
                             PrefixSuffixStreamTransformer(
                                 sample_ratio=kwargs.get('dev_sample_ratio', 1.)),
                             add_sources=('target_prefix', 'target_suffix'))

        dev_stream = Mapping(dev_stream, CopySourceAndTargetToMatchPrefixes(dev_stream))

        # changing stream.produces_examples is a little hack which lets us use Unpack to flatten
        dev_stream.produces_examples = False
        # flatten the stream back out into (source, target, target_prefix, target_suffix)
        dev_stream = Unpack(dev_stream)

    if return_vocab:
        return dev_stream, src_vocab, trg_vocab
    else:
        return dev_stream
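# A usage sketch (assumption, not in the original source): each (source, target)
# pair in the dev stream is expanded into (source, target, target_prefix,
# target_suffix) examples by the transformers above. The paths are hypothetical.
def _example_dev_stream_with_prefixes():
    dev_stream, src_vocab, trg_vocab = get_dev_stream_with_prefixes(
        val_set='data/dev.src',            # hypothetical dev source
        val_set_grndtruth='data/dev.trg',  # hypothetical dev reference
        src_vocab='data/vocab.src.pkl',
        trg_vocab='data/vocab.trg.pkl',
        src_vocab_size=30000,
        trg_vocab_size=30000,
        unk_id=1,
        return_vocab=True)
    for source, target, target_prefix, target_suffix in dev_stream.get_epoch_iterator():
        # prefix and suffix together cover the reference target
        print(len(source), len(target), len(target_prefix), len(target_suffix))
        break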
def main(mode, config, use_bokeh=False):

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2)

    if mode == "train":

        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')

        # Get training and development set streams
        tr_stream = get_tr_stream(**config)
        dev_stream = get_dev_stream(**config)

        # Get cost of the model
        cost = decoder.cost(
            encoder.apply(source_sentence, source_sentence_mask),
            source_sentence_mask, target_sentence, target_sentence_mask)

        logger.info('Creating computational graph')
        cg = ComputationGraph(cost)

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [x for x in cg.intermediary_variables
                              if x.name == 'maxout_apply_output']
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(
                cg, enc_params + dec_params, config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                                   Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info('    {:15}: {}'.format(value.get_value().shape, name))
        logger.info("Total number of parameters: {}"
                    .format(len(enc_dec_param_dict)))

        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([cost], after_batch=True),
            Printing(after_batch=True),
            CheckpointNMT(config['saveto'],
                          every_n_batches=config['save_freq'])
        ]

        # Set up beam search and sampling computation graphs if necessary
        if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
            logger.info("Building sampling model")
            sampling_representation = encoder.apply(
                sampling_input, tensor.ones(sampling_input.shape))
            generated = decoder.generate(
                sampling_input, sampling_representation)
            search_model = Model(generated)
            _, samples = VariableFilter(
                bricks=[decoder.sequence_generator], name="outputs")(
                    ComputationGraph(generated[1]))

            # Add sampling
            if config['hook_samples'] >= 1:
                logger.info("Building sampler")
                extensions.append(
                    Sampler(model=search_model, data_stream=tr_stream,
                            hook_samples=config['hook_samples'],
                            every_n_batches=config['sampling_freq'],
                            src_vocab_size=config['src_vocab_size']))

            # Add early stopping based on bleu
            if config['bleu_script'] is not None:
                logger.info("Building bleu validator")
                extensions.append(
                    BleuValidator(sampling_input, samples=samples, config=config,
                                  model=search_model, data_stream=dev_stream,
                                  normalize=config['normalized_bleu'],
                                  every_n_batches=config['bleu_val_freq']))

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En', channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                     eval(config['step_rule'])()]))

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(
            model=training_model,
            algorithm=algorithm,
            data_stream=tr_stream,
            extensions=extensions)

        # Train!
        main_loop.run()

    elif mode == 'translate':

        # Create Theano variables
        logger.info('Creating theano variables')
        sampling_input = tensor.lmatrix('source')

        # Get test set stream
        test_stream = get_dev_stream(
            config['test_set'], config['src_vocab'],
            config['src_vocab_size'], config['unk_id'])
        ftrans = open(config['test_set'] + '.trans.out', 'w')

        # Helper utilities
        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1

        # Get beam search
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        loader = LoadNMT(config['saveto'])
        loader.set_model_parameters(model, loader.load_parameters())

        # Get target vocabulary
        trg_vocab = _ensure_special_tokens(
            pickle.load(open(config['trg_vocab'], 'rb')), bos_idx=0,
            eos_idx=trg_eos_idx, unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}

        logger.info("Started translation: ")
        total_cost = 0.0

        for i, line in enumerate(test_stream.get_epoch_iterator()):

            seq = sutils._oov_to_unk(
                line[0], config['src_vocab_size'], unk_idx)
            input_ = numpy.tile(seq, (config['beam_size'], 1))

            # draw sample, checking to ensure we don't get an empty string back
            trans, costs = \
                beam_search.search(
                    input_values={sampling_input: input_},
                    max_length=3*len(seq), eol_symbol=src_eos_idx,
                    ignore_first_eol=True)

            # normalize costs according to the sequence lengths
            if config['normalized_bleu']:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            best = numpy.argsort(costs)[0]
            try:
                total_cost += costs[best]
                trans_out = trans[best]

                # convert idx to words
                trans_out = sutils._idx_to_word(trans_out, trg_ivocab)

            except ValueError:
                logger.info(
                    "Can NOT find a translation for line: {}".format(i+1))
                trans_out = '<UNK>'

            print(trans_out, file=ftrans)

            if i != 0 and i % 100 == 0:
                logger.info(
                    "Translated {} lines of test set...".format(i))

        logger.info("Total cost of the test: {}".format(total_cost))
        ftrans.close()
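# The translate branch above picks the best beam hypothesis by (optionally
# length-normalized) cost. A self-contained sketch of that selection step,
# using made-up hypotheses and costs purely for illustration:
def _example_select_best_hypothesis():
    import numpy
    trans = [[12, 7, 4, 29999], [12, 7, 29999]]  # two hypothetical target index sequences
    costs = numpy.array([8.4, 6.9])              # total negative log-probability of each
    normalize = True
    if normalize:
        lengths = numpy.array([len(s) for s in trans])
        costs = costs / lengths                  # per-token cost, so long hypotheses aren't unfairly penalized
    best = numpy.argsort(costs)[0]
    return trans[best], costs[best]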
logger.info("Creating Sampling Model...") sampling_model = Model(generated) # TODO: update clients with sampling_context_input return sampling_model, sampling_source_input, sampling_context_input, encoder, decoder sample_model, theano_sampling_source_input, theano_sampling_context_input, train_encoder, train_decoder = \ get_sampling_model_and_input(exp_config) trg_vocab = cPickle.load(open(exp_config['trg_vocab'])) trg_vocab_size = exp_config['trg_vocab_size'] - 1 src_vocab = cPickle.load(open(exp_config['src_vocab'])) src_vocab_size = exp_config['src_vocab_size'] - 1 src_vocab = _ensure_special_tokens(src_vocab, bos_idx=0, eos_idx=src_vocab_size, unk_idx=exp_config['unk_id']) trg_vocab = _ensure_special_tokens(trg_vocab, bos_idx=0, eos_idx=trg_vocab_size, unk_idx=exp_config['unk_id']) theano_sample_func = sample_model.get_theano_function() sampling_func = SampleFunc(theano_sample_func, trg_vocab) src_stream = get_textfile_stream(source_file=exp_config['src_data'], src_vocab=exp_config['src_vocab'], src_vocab_size=exp_config['src_vocab_size']) trg_stream = get_textfile_stream(source_file=exp_config['trg_data'], src_vocab=exp_config['trg_vocab'],
def __init__(self, exp_config):
    theano_variables = load_params_and_get_beam_search(
        exp_config, brick_delimiter=exp_config.get('brick_delimiter', None))
    # beam_search, search_model, samples, sampling_input, sampling_prefix = sampling_vars
    self.beam_search, search_model, samples, self.source_sampling_input, self.target_sampling_input = \
        theano_variables

    self.exp_config = exp_config
    # how many hyps should be output (only used in file prediction mode)
    self.n_best = exp_config.get('n_best', 1)

    self.source_lang = exp_config.get('source_lang', 'en')
    self.target_lang = exp_config.get('target_lang', 'es')

    # persistent tokenizers and detokenizers
    tokenize_script = exp_config.get('tokenize_script', None)
    detokenize_script = exp_config.get('detokenize_script', None)
    if tokenize_script is not None and detokenize_script is not None:
        # Note: the '-b' option is _essential_ here, otherwise the tokenizer will hang forever
        self.source_tokenizer_cmd = [tokenize_script, '-l', self.source_lang,
                                     '-q', '-', '-b', '-no-escape', '1']
        self.target_tokenizer_cmd = [tokenize_script, '-l', self.target_lang,
                                     '-q', '-', '-b', '-no-escape', '1']
        self.detokenizer_cmd = [detokenize_script, '-l', self.target_lang,
                                '-q', '-', '-b']

        self.source_tokenizer = Popen(self.source_tokenizer_cmd,
                                      stdin=PIPE, stdout=PIPE, bufsize=1)
        self.target_tokenizer = Popen(self.target_tokenizer_cmd,
                                      stdin=PIPE, stdout=PIPE, bufsize=1)
        self.target_detokenizer = Popen(self.detokenizer_cmd,
                                        stdin=PIPE, stdout=PIPE, bufsize=1)
    else:
        self.source_tokenizer_cmd = None
        self.target_tokenizer_cmd = None
        self.detokenizer_cmd = None

    # persistent subword encoding
    subword_codes = exp_config.get('subword_codes', None)
    self.BPE = None
    if subword_codes is not None:
        from lib.apply_bpe import BPE
        self.BPE = BPE(codecs.open(subword_codes, encoding='utf8'))

    # the maximum length of predictions -- this can be shortened to make prediction more efficient
    self.max_length = exp_config.get('n_steps', None)

    # this index will get overwritten with the EOS token by _ensure_special_tokens
    # IMPORTANT: the index must be created in the same way it was for training,
    # otherwise the predicted indices will be nonsense
    # Make sure that src_vocab_size and trg_vocab_size are correct in your configuration
    self.src_eos_idx = exp_config['src_vocab_size'] - 1
    self.trg_eos_idx = exp_config['trg_vocab_size'] - 1
    self.unk_idx = exp_config['unk_id']

    # Get vocabularies and inverse indices
    self.src_vocab = _ensure_special_tokens(
        pickle.load(open(exp_config['src_vocab'])),
        bos_idx=0, eos_idx=self.src_eos_idx, unk_idx=self.unk_idx)
    self.src_ivocab = {v: k for k, v in self.src_vocab.items()}
    self.trg_vocab = _ensure_special_tokens(
        pickle.load(open(exp_config['trg_vocab'])),
        bos_idx=0, eos_idx=self.trg_eos_idx, unk_idx=self.unk_idx)
    self.trg_ivocab = {v: k for k, v in self.trg_vocab.items()}
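# A hedged sketch (not part of the original class) of how the persistent Moses
# tokenizer processes started above can be used: write one line to stdin, flush,
# and read the tokenized line back from stdout. `tokenizer` is any of the Popen
# handles created in __init__; the helper name `_tokenize_line` is hypothetical.
def _tokenize_line(tokenizer, text):
    tokenizer.stdin.write(text.strip() + '\n')
    tokenizer.stdin.flush()
    # '-b' turns off Moses' output buffering, so one line in yields one line out
    return tokenizer.stdout.readline().strip()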
def get_tr_stream_with_context_features(src_vocab, trg_vocab, src_data, trg_data,
                                        context_features, src_vocab_size=30000,
                                        trg_vocab_size=30000, unk_id=1, seq_len=50,
                                        batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    def _get_np_array(filename):
        return numpy.load(filename)['arr_0']

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    # TODO: doesn't the TextFile stream do this anyway?
    stream = Mapping(stream, _oov_to_unk(src_vocab_size=src_vocab_size,
                                         trg_vocab_size=trg_vocab_size,
                                         unk_id=unk_id))

    # now add the source with the image features
    # create the image datastream (iterate over a file line-by-line)
    train_features = _get_np_array(context_features)
    train_feature_dataset = IterableDataset(train_features)
    train_image_stream = DataStream(train_feature_dataset)

    stream = Merge([stream, train_image_stream],
                   ('source', 'target', 'initial_context'))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1],
        mask_sources=('source', 'target'))

    return masked_stream, src_vocab, trg_vocab
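# A usage sketch (assumption, not in the original module): build the training
# stream from hypothetical paths and inspect one padded batch. After
# PaddingWithEOS, the masked sources come with companion *_mask arrays.
def _example_tr_stream_with_context_features():
    masked_stream, src_vocab, trg_vocab = get_tr_stream_with_context_features(
        src_vocab='data/vocab.src.pkl',              # hypothetical pickled vocab
        trg_vocab='data/vocab.trg.pkl',
        src_data='data/train.src',                   # hypothetical tokenized training text
        trg_data='data/train.trg',
        context_features='data/train.context.npz',   # hypothetical .npz with 'arr_0'
        batch_size=80)
    batch = next(masked_stream.get_epoch_iterator(as_dict=True))
    for name, array in batch.items():
        print(name, getattr(array, 'shape', None))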
def get_tr_stream_with_prefixes(src_vocab, trg_vocab, src_data, trg_data,
                                src_vocab_size=30000, trg_vocab_size=30000,
                                unk_id=1, seq_len=50, batch_size=80,
                                sort_k_batches=12, **kwargs):
    """Prepares the IMT training data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # TODO: should training stream actually have begin and end tokens?
    # Note: this actually depends upon how the system was pre-trained, but systems used for
    # Note: initialization should _always_ have BOS tokens

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab,
                           bos_token='<S>', eos_token='</S>', unk_token='<UNK>')
    trg_dataset = TextFile([trg_data], trg_vocab,
                           bos_token='<S>', eos_token='</S>', unk_token='<UNK>')

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    # TODO: doesn't the TextFile stream do this anyway?
    stream = Mapping(stream, _oov_to_unk(src_vocab_size=src_vocab_size,
                                         trg_vocab_size=trg_vocab_size,
                                         unk_id=unk_id))

    stream = Mapping(stream,
                     PrefixSuffixStreamTransformer(
                         sample_ratio=kwargs.get('train_sample_ratio', 1.)),
                     add_sources=('target_prefix', 'target_suffix'))

    stream = Mapping(stream, CopySourceAndTargetToMatchPrefixes(stream))

    # changing stream.produces_examples is a little hack which lets us use Unpack to flatten
    stream.produces_examples = False
    # flatten the stream back out into (source, target, target_prefix, target_suffix)
    stream = Unpack(stream)

    # Now make a very big batch that we can shuffle
    shuffle_batch_size = kwargs['shuffle_batch_size']
    stream = Batch(stream, iteration_scheme=ConstantScheme(shuffle_batch_size))
    stream = ShuffleBatchTransformer(stream)

    # unpack it again
    stream = Unpack(stream)

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    # TODO: is it correct to blindly pad the target_prefix and the target_suffix?
    configurable_padding_args = {
        'suffix_length': kwargs.get('suffix_length', None),
        'truncate_sources': kwargs.get('truncate_sources', [])
    }
    logger.info('Training suffix length is: {}'.format(
        configurable_padding_args['suffix_length']))
    logger.info('I will mask the following sources after <suffix_length>: {}'.format(
        configurable_padding_args['truncate_sources']))

    masked_stream = PaddingWithEOS(
        stream,
        [src_vocab_size - 1, trg_vocab_size - 1, trg_vocab_size - 1, trg_vocab_size - 1],
        mask_sources=('source', 'target', 'target_prefix', 'target_suffix'),
        **configurable_padding_args)

    return masked_stream, src_vocab, trg_vocab
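# A usage sketch (assumption, not in the original module): the IMT training
# stream requires an explicit `shuffle_batch_size` kwarg, since the function
# indexes kwargs directly. Paths and sizes below are hypothetical.
def _example_tr_stream_with_prefixes():
    masked_stream, src_vocab, trg_vocab = get_tr_stream_with_prefixes(
        src_vocab='data/vocab.src.pkl',
        trg_vocab='data/vocab.trg.pkl',
        src_data='data/train.src',
        trg_data='data/train.trg',
        batch_size=80,
        shuffle_batch_size=8000)  # size of the big batch that is shuffled before re-batching
    batch = next(masked_stream.get_epoch_iterator(as_dict=True))
    # expect source/target plus target_prefix/target_suffix, each with a *_mask
    for name, array in batch.items():
        print(name, array.shape)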