import tempfile

from machine_translation.stream import get_tr_stream, get_dev_stream


def test_stream():
    # Dummy vocabulary
    vocab = {'<S>': 0, '</S>': 1, '<UNK>': 2}
    with tempfile.NamedTemporaryFile() as src_data:
        with tempfile.NamedTemporaryFile() as trg_data:
            get_tr_stream(
                src_vocab=vocab, trg_vocab=vocab,
                src_data=src_data.name, trg_data=trg_data.name)
    with tempfile.NamedTemporaryFile() as val_set:
        get_dev_stream(val_set=val_set.name, src_vocab=vocab)
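# A minimal sketch of exercising the stream with non-empty data, assuming the
# data files hold one whitespace-tokenized sentence per line and that
# `get_tr_stream` returns a Fuel data stream as in the test above
# (`smoke_test_stream_with_data` and the sample sentences are illustrative,
# not part of the original test suite):
def smoke_test_stream_with_data():
    vocab = {'<S>': 0, '</S>': 1, '<UNK>': 2}
    with tempfile.NamedTemporaryFile(mode='w') as src_data, \
            tempfile.NamedTemporaryFile(mode='w') as trg_data:
        src_data.write('a b c\n')
        trg_data.write('d e f\n')
        src_data.flush()
        trg_data.flush()
        stream = get_tr_stream(
            src_vocab=vocab, trg_vocab=vocab,
            src_data=src_data.name, trg_data=trg_data.name)
        # Each batch should carry (source, source_mask, target, target_mask)
        batch = next(stream.get_epoch_iterator())
        assert len(batch) > 0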
"""Machine Translation by Jointly Learning to Align and Translate."""
import argparse
import logging
import pprint

import configurations

from machine_translation import main
from machine_translation.stream import get_tr_stream, get_dev_stream

logger = logging.getLogger(__name__)

# Get the arguments
parser = argparse.ArgumentParser()
parser.add_argument("--proto", default="get_config_en2zh",
                    help="Prototype config to use for config")
parser.add_argument("--bokeh", default=False, action="store_true",
                    help="Use bokeh server for plotting")
args = parser.parse_args()

if __name__ == "__main__":
    # Get configurations for model
    configuration = getattr(configurations, args.proto)()
    logger.info("Model options:\n{}".format(pprint.pformat(configuration)))
    # Get data streams and call main
    main(configuration, get_tr_stream(**configuration),
         get_dev_stream(**configuration), args.bokeh)
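# Typical invocation, assuming this script lives at the repository root (the
# file name `train.py` is illustrative; the prototype name comes from the
# argument parser above):
#
#     python train.py --proto get_config_en2zh --bokeh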
def main(mode, config, use_bokeh=False):

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2)

    if mode == "train":

        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')

        # Get training and development set streams
        tr_stream = get_tr_stream(**config)
        dev_stream = get_dev_stream(**config)

        # Get cost of the model
        cost = decoder.cost(
            encoder.apply(source_sentence, source_sentence_mask),
            source_sentence_mask, target_sentence, target_sentence_mask)

        logger.info('Creating computational graph')
        cg = ComputationGraph(cost)

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [x for x in cg.intermediary_variables
                              if x.name == 'maxout_apply_output']
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(
                cg, enc_params + dec_params, config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                                   Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info('    {:15}: {}'.format(value.get_value().shape, name))
        logger.info("Total number of parameters: {}"
                    .format(len(enc_dec_param_dict)))

        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([cost], after_batch=True),
            Printing(after_batch=True),
            CheckpointNMT(config['saveto'],
                          every_n_batches=config['save_freq'])
        ]

        # Set up beam search and sampling computation graphs if necessary
        if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
            logger.info("Building sampling model")
            sampling_representation = encoder.apply(
                sampling_input, tensor.ones(sampling_input.shape))
            generated = decoder.generate(
                sampling_input, sampling_representation)
            search_model = Model(generated)
            _, samples = VariableFilter(
                bricks=[decoder.sequence_generator], name="outputs")(
                    ComputationGraph(generated[1]))

        # Add sampling
        if config['hook_samples'] >= 1:
            logger.info("Building sampler")
            extensions.append(
                Sampler(model=search_model, data_stream=tr_stream,
                        hook_samples=config['hook_samples'],
                        every_n_batches=config['sampling_freq'],
                        src_vocab_size=config['src_vocab_size']))

        # Add early stopping based on bleu
        if config['bleu_script'] is not None:
            logger.info("Building bleu validator")
            extensions.append(
                BleuValidator(sampling_input, samples=samples, config=config,
                              model=search_model, data_stream=dev_stream,
                              normalize=config['normalized_bleu'],
                              every_n_batches=config['bleu_val_freq']))

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En', channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                     eval(config['step_rule'])()]))

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(
            model=training_model,
            algorithm=algorithm,
            data_stream=tr_stream,
            extensions=extensions)

        # Train!
        main_loop.run()

    elif mode == 'translate':

        # Create Theano variables
        logger.info('Creating theano variables')
        sampling_input = tensor.lmatrix('source')

        # Get test set stream
        test_stream = get_dev_stream(
            config['test_set'], config['src_vocab'],
            config['src_vocab_size'], config['unk_id'])
        ftrans = open(config['test_set'] + '.trans.out', 'w')

        # Helper utilities
        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1

        # Get beam search
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        loader = LoadNMT(config['saveto'])
        loader.set_model_parameters(model, loader.load_parameters())

        # Get target vocabulary
        trg_vocab = _ensure_special_tokens(
            pickle.load(open(config['trg_vocab'], 'rb')), bos_idx=0,
            eos_idx=trg_eos_idx, unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}

        logger.info("Started translation: ")
        total_cost = 0.0

        for i, line in enumerate(test_stream.get_epoch_iterator()):

            seq = sutils._oov_to_unk(
                line[0], config['src_vocab_size'], unk_idx)
            input_ = numpy.tile(seq, (config['beam_size'], 1))

            # draw sample, checking to ensure we don't get an empty string
            # back
            trans, costs = beam_search.search(
                input_values={sampling_input: input_},
                max_length=3 * len(seq), eol_symbol=src_eos_idx,
                ignore_first_eol=True)

            # normalize costs according to the sequence lengths
            if config['normalized_bleu']:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            best = numpy.argsort(costs)[0]
            try:
                total_cost += costs[best]
                trans_out = trans[best]
                # convert idx to words
                trans_out = sutils._idx_to_word(trans_out, trg_ivocab)
            except ValueError:
                logger.info(
                    "Can NOT find a translation for line: {}".format(i + 1))
                trans_out = '<UNK>'

            print(trans_out, file=ftrans)

            if i != 0 and i % 100 == 0:
                logger.info("Translated {} lines of test set...".format(i))

        logger.info("Total cost of the test: {}".format(total_cost))
        ftrans.close()
def run(mode, config_obj, bokeh):

    if mode == 'train':
        # Get data streams and call main
        train_stream, src_vocab, trg_vocab = get_tr_stream(**config_obj)
        dev_stream = get_dev_stream(**config_obj)
        main(config_obj, train_stream, dev_stream, bokeh,
             src_vocab=src_vocab, trg_vocab=trg_vocab)
    elif mode == 'predict':
        predictor = NMTPredictor(config_obj)
        predictor.predict_file(config_obj['test_set'],
                               config_obj.get('translated_output_file', None))
    # TODO: let user configure which evaluation metrics to use
    elif mode == 'evaluate':
        logger.info("Started Evaluation: ")
        model_name = config_obj.get('model_name', 'default_model')
        # TODO: we need a way to keep track of the evaluations from all
        # models, but they are running async
        evaluation_report_path = os.path.join(config_obj['saveto'],
                                              'evaluation_reports')
        # load existing evaluation info if this model has already been
        # evaluated
        if not os.path.isdir(evaluation_report_path):
            os.makedirs(evaluation_report_path)
        val_start_time = time.time()
        evaluation_report = []

        # translate if necessary, write output file, call external evaluation
        # tools and show output
        translated_output_file = config_obj.get('translated_output_file',
                                                None)
        if (translated_output_file is not None
                and os.path.isfile(translated_output_file)):
            logger.info(
                '{} already exists, so I\'m evaluating the BLEU score of '
                'this file with respect to the reference that you '
                'provided: {}'.format(translated_output_file,
                                      config_obj['test_gold_refs']))
        else:
            predictor = NMTPredictor(config_obj)
            logger.info('Translating: {}'.format(config_obj['test_set']))
            translated_output_file = predictor.predict_file(
                config_obj['test_set'], translated_output_file)
            logger.info('Translated: {}, output was written to: {}'.format(
                config_obj['test_set'], translated_output_file))

        # If this is a subword system, and user asked for normalization, do it
        if config_obj.get('normalize_subwords', False):
            with codecs.open(translated_output_file,
                             encoding='utf8') as output:
                lines = output.readlines()
            with codecs.open(translated_output_file, 'w',
                             encoding='utf8') as output:
                for line in lines:
                    # sed "s/@@ //g"
                    output.write(re.sub(r'@@ ', '', line))

        # if user wants BOS and/or EOS tokens cut off, do it
        if config_obj.get('remove_bos', False):
            with codecs.open(translated_output_file,
                             encoding='utf8') as output:
                lines = output.readlines()
            with codecs.open(translated_output_file, 'w',
                             encoding='utf8') as output:
                for line in lines:
                    output.write(re.sub(r'^' + config_obj['bos_token'] + ' ',
                                        '', line))
        if config_obj.get('remove_eos', False):
            with codecs.open(translated_output_file,
                             encoding='utf8') as output:
                lines = output.readlines()
            with codecs.open(translated_output_file, 'w',
                             encoding='utf8') as output:
                for line in lines:
                    output.write(re.sub(config_obj['eos_token'], '', line))

        # BLEU
        # get gold refs
        lowercase = config_obj.get('lowercase_bleu', False)
        if lowercase:
            logger.info('BLEU will be evaluated in lowercase mode')
            multibleu_cmd = ['perl', config_obj['bleu_script'], '-lc',
                             config_obj['test_gold_refs'], '<']
        else:
            logger.info('BLEU will be evaluated in case-sensitive mode')
            multibleu_cmd = ['perl', config_obj['bleu_script'],
                             config_obj['test_gold_refs'], '<']

        mb_subprocess = Popen(multibleu_cmd, stdin=PIPE, stdout=PIPE)
        with codecs.open(translated_output_file, encoding='utf8') as hyps:
            for l in hyps.read().strip().split('\n'):
                # send the line to the BLEU script
                print(l.encode('utf8'), file=mb_subprocess.stdin)
                mb_subprocess.stdin.flush()

        # send end of file, read output
        mb_subprocess.stdin.close()
        stdout = mb_subprocess.stdout.readline()
        logger.info(stdout)
        out_parse = re.match(r'BLEU = [-.0-9]+', stdout)
        logger.info("Validation Took: {} minutes".format(
            float(time.time() - val_start_time) / 60.))
        assert out_parse is not None

        # extract the score
        bleu_score = float(out_parse.group()[6:])
        logger.info('BLEU SCORE: {}'.format(bleu_score))
        mb_subprocess.terminate()
        evaluation_report.append(u'{} {} {}'.format('bleu', bleu_score,
                                                    model_name))

        # Meteor
        meteor_directory = config_obj.get('meteor_directory', None)
        if meteor_directory is not None:
            target_language = config_obj.get('target_lang', 'de')
            # java -Xmx2G -jar meteor-*.jar test reference -l en -norm
            meteor_cmd = [
                'java', '-Xmx4G', '-jar',
                os.path.join(meteor_directory, 'meteor-1.5.jar'),
                translated_output_file, config_obj['test_gold_refs'],
                '-l', target_language, '-norm'
            ]
            meteor_output = check_output(meteor_cmd)
            meteor_score = float(
                meteor_output.strip().split('\n')[-1].split()[-1])
            logger.info('METEOR SCORE: {}'.format(meteor_score))
            evaluation_report.append(u'{} {} {}'.format('meteor',
                                                        meteor_score,
                                                        model_name))

        # touch a file for each row in evaluation_report; the file name is
        # the result
        for l in evaluation_report:
            open(os.path.join(evaluation_report_path, l), 'w').close()
        logger.info('Wrote evaluation report files to: {}'.format(
            evaluation_report_path))

    elif mode == 'server':
        import sys
        sys.path.append('.')
        from server import run_nmt_server
        # start restful server and log its port
        predictor = NMTPredictor(config_obj)
        run_nmt_server(predictor)
    else:
        print('ERROR: mode unknown: {}'.format(mode))
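# For reference, multi-bleu.perl prints a summary line such as
# "BLEU = 27.53, 60.3/33.9/21.7/14.6 (BP=1.000, ratio=0.998, ...)"; the
# regex in `run` above captures only the leading score. A minimal sketch of
# that extraction as a standalone helper (`parse_multibleu_line` is a
# hypothetical name, not part of this codebase):
import re


def parse_multibleu_line(line):
    # Capture the aggregate score at the start of the multi-bleu output line
    match = re.match(r'BLEU = ([-.0-9]+)', line)
    return float(match.group(1)) if match is not None else None

# parse_multibleu_line('BLEU = 27.53, 60.3/33.9/21.7/14.6 (BP=1.000)')
# -> 27.53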
algorithm = GradientDescent(
    cost=cost, parameters=cg.parameters,
    step_rule=CompositeRule([StepClipping(config['step_clipping']),
                             eval(config['step_rule'])()]),
    on_unused_sources='warn')

# enrich the logged information
extensions.append(Timing(every_n_batches=100))

# Initialize main loop
logger.info("Initializing main loop")
main_loop = MainLoop(model=model, algorithm=algorithm,
                     data_stream=tr_stream, extensions=extensions)

# Train!
main_loop.run()


training_cost = create_model(train_encoder, train_decoder)

# Set up training model
logger.info("Building model")
train_model = Model(training_cost)

dev_stream = get_dev_stream(**exp_config)

main(train_model, training_cost, exp_config, masked_stream,
     dev_stream=dev_stream, use_bokeh=True)
_, samples = VariableFilter(
    bricks=[decoder.sequence_generator], name="outputs")(
        ComputationGraph(generated[1]))  # generated[1] is next_outputs

# Add sampling
if config['hook_samples'] >= 1:
    logger.info("Building sampler")
    extensions.append(
        Sampler(model=search_model, data_stream=tr_stream,
                hook_samples=config['hook_samples'],
                every_n_batches=config['sampling_freq'],
                src_vocab_size=config['src_vocab_size']))

# Add early stopping based on bleu
if config['bleu_script'] is not None:
    logger.info("Building bleu validator")
    dev_stream = get_dev_stream(val_set=config['src_val'], **config)
    extensions.append(
        BleuValidator(sampling_input, samples=samples, config=config,
                      model=search_model, data_stream=dev_stream,
                      ground_truth=config['trg_val'],
                      normalize=config['normalized_bleu'],
                      val_out=config['val_dev_out'],
                      val_best_out=config['val_best_dev_out'],
                      every_n_batches=config['bleu_val_freq'],
                      bleu_out=config['val_bleu_out']))

if config['bleu_script'] is not None and config['train_val']:
    # compute bleu score for a small subset of the training data
    dev_stream = get_dev_stream(val_set=config['src_data_sample'], **config)
    extensions.append(
        BleuEvaluator(sampling_input, samples=samples, config=config,
                      model=search_model, data_stream=dev_stream,
                      every_n_batches=config['bleu_val_freq']))