import tempfile

from stream import get_tr_stream, get_dev_stream


def test_stream():
    # Dummy vocabulary
    vocab = {"<S>": 0, "</S>": 1, "<UNK>": 2}

    with tempfile.NamedTemporaryFile() as src_data:
        with tempfile.NamedTemporaryFile() as trg_data:
            get_tr_stream(src_vocab=vocab, trg_vocab=vocab,
                          src_data=src_data.name, trg_data=trg_data.name)

    with tempfile.NamedTemporaryFile() as val_set:
        get_dev_stream(val_set=val_set.name, src_vocab=vocab)
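# A hedged sanity-check sketch built on the same API as test_stream above: it
# writes one illustrative parallel sentence pair into the temp files and pulls
# a single batch, assuming the object returned by get_tr_stream exposes
# get_epoch_iterator() (as the training loop below relies on). The data and
# assertion here are placeholders, not part of the original test.
def test_stream_yields_batches():
    vocab = {"<S>": 0, "</S>": 1, "<UNK>": 2}
    with tempfile.NamedTemporaryFile(mode="w") as src_data, \
            tempfile.NamedTemporaryFile(mode="w") as trg_data:
        src_data.write("a b c\n")
        trg_data.write("d e f\n")
        src_data.flush()
        trg_data.flush()
        tr_stream = get_tr_stream(src_vocab=vocab, trg_vocab=vocab,
                                  src_data=src_data.name, trg_data=trg_data.name)
        for batch in tr_stream.get_epoch_iterator():
            # Expected to carry source/target ids and their masks.
            assert len(batch) > 0
            break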
f_init, f_next = trans.build_sample()
logger.info('Built sampling functions: f_init, f_next')

src_vocab = pickle.load(open(config['src_vocab'], 'rb'))
trg_vocab = pickle.load(open(config['trg_vocab'], 'rb'))
src_vocab = ensure_special_tokens(src_vocab, bos_idx=0,
                                  eos_idx=config['src_vocab_size'] - 1,
                                  unk_idx=config['unk_id'])
trg_vocab = ensure_special_tokens(trg_vocab, bos_idx=0,
                                  eos_idx=config['trg_vocab_size'] - 1,
                                  unk_idx=config['unk_id'])
trg_vocab_reverse = {index: word for word, index in trg_vocab.items()}
src_vocab_reverse = {index: word for word, index in src_vocab.items()}
logger.info('Loaded dictionaries: src vocab size {}, trg vocab size {}.'.format(
    len(src_vocab), len(trg_vocab)))

tr_stream = get_tr_stream(**config)
dev_stream = get_dev_stream(**config)

logger.info('Start training')
batch_count = 0
val_time = 0
best_score = 0.
for epoch in range(config['max_epoch']):
    for tr_data in tr_stream.get_epoch_iterator():
        batch_count += 1
        tr_fn(*tr_data)
        # Periodically sample translations to monitor training progress
        if batch_count % config['sampling_freq'] == 0:
            trans_sample(tr_data[0], tr_data[2], f_init, f_next,
                         config['hook_samples'], src_vocab_reverse,
                         trg_vocab_reverse, batch_count)
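# A minimal sketch of what ensure_special_tokens is assumed to do, inferred
# from the call sites above and the dummy vocabulary in test_stream: evict
# whatever currently occupies the special ids, then pin <S>, </S> and <UNK>
# to the requested positions. The project's actual helper may differ.
def ensure_special_tokens_sketch(vocab, bos_idx=0, eos_idx=0, unk_idx=1):
    # Drop tokens that collide with the reserved indices.
    for token in [k for k, v in vocab.items() if v in (bos_idx, eos_idx, unk_idx)]:
        del vocab[token]
    # Pin the special tokens to their reserved ids.
    vocab['<S>'] = bos_idx
    vocab['</S>'] = eos_idx
    vocab['<UNK>'] = unk_idx
    return vocab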
def init():
    parser = argparse.ArgumentParser()
    parser.add_argument('--proto', default='get_config_cs2en',
                        help='Prototype config')
    args = parser.parse_args()
    config = getattr(configurations, args.proto)()
    main(config, get_tr_stream(**config), get_dev_stream(**config))
def main():
    # Set parameters
    config = getattr(configurations, "get_config_cs2en")()
    logger.info("Model options:\n{}".format(pprint.pformat(config)))
    tr_stream = get_tr_stream(**config)

    # Create Theano variables
    logger.info("Creating theano variables")
    source_sentence0 = tensor.lmatrix("source0")
    source_sentence_mask0 = tensor.matrix("source0_mask")
    target_sentence0 = tensor.lmatrix("target0")
    target_sentence_mask0 = tensor.matrix("target0_mask")
    source_sentence1 = tensor.lmatrix("source1")
    source_sentence_mask1 = tensor.matrix("source1_mask")
    target_sentence1 = tensor.lmatrix("target1")
    target_sentence_mask1 = tensor.matrix("target1_mask")
    source_sentence2 = tensor.lmatrix("source2")
    source_sentence_mask2 = tensor.matrix("source2_mask")
    target_sentence2 = tensor.lmatrix("target2")
    target_sentence_mask2 = tensor.matrix("target2_mask")
    sampling_input0 = tensor.lmatrix("input0")
    sampling_input1 = tensor.lmatrix("input1")
    sampling_input2 = tensor.lmatrix("input2")
    sampling_hstates0 = tensor.fmatrix("hstates0")
    sampling_hstates1 = tensor.fmatrix("hstates1")
    sampling_hstates2 = tensor.fmatrix("hstates2")
    sampling_lastrep0 = tensor.tensor3("lastrep0")
    sampling_lastrep1 = tensor.tensor3("lastrep1")
    hstates = theano.shared(value=numpy.zeros((config["enc_nhids"]),
                                              dtype=theano.config.floatX),
                            name="hstates")

    # Get vocab
    sources = get_attr_rec(tr_stream, "data_stream")
    src_vocab = sources.data_streams[0].dataset.dictionary
    trg_vocab = sources.data_streams[1].dataset.dictionary

    # Construct model
    logger.info("Building PoemModel")
    block0 = PoemBlock(config=config, blockid="block0", name="poemblock0")
    block1 = PoemBlock(config=config, blockid="block1", name="poemblock1")
    block2 = PoemBlock(config=config, blockid="block2", name="poemblock2")

    cost0, hsta0, rep0 = block0.cost(
        source_sentence0, source_sentence_mask0, source_sentence_mask1, source_sentence_mask0,
        target_sentence0, target_sentence_mask0,
        hstates, lastrep0=None, lastrep1=None)
    cost1, hsta1, rep1 = block1.cost(
        source_sentence1, source_sentence_mask0, source_sentence_mask1, source_sentence_mask1,
        target_sentence1, target_sentence_mask1,
        hsta0, lastrep0=rep0, lastrep1=None)
    cost2, hsta2, rep2 = block2.cost(
        source_sentence2, source_sentence_mask0, source_sentence_mask1, source_sentence_mask2,
        target_sentence2, target_sentence_mask2,
        hsta1, lastrep0=rep0, lastrep1=rep1)

    cost = cost0 + cost1 + cost2
    cost.name = "total_cost"
    logger.info("Creating computational graph")
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info("Initializing model")
    block0.set_initw(IsotropicGaussian(config["weight_scale"]))
    block0.set_initb(Constant(0))
    block0.push_initialization_config()
    block0.set_specialinit(Orthogonal(), Orthogonal())
    block0.initialize()
    block1.set_initw(IsotropicGaussian(config["weight_scale"]))
    block1.set_initb(Constant(0))
    block1.push_initialization_config()
    block1.set_specialinit(Orthogonal(), Orthogonal())
    block1.initialize()
    block2.set_initw(IsotropicGaussian(config["weight_scale"]))
    block2.set_initb(Constant(0))
    block2.push_initialization_config()
    block2.set_specialinit(Orthogonal(), Orthogonal())
    block2.initialize()

    # Apply dropout for regularization
    if config["dropout"] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logger.info("Applying dropout")
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == "maxout_apply_output"]
        cg = apply_dropout(cg, dropout_inputs, config["dropout"])

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info("    {:15}: {}".format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    param_dict = Selector(block0).get_parameters()
    logger.info("Parameter names: ")
    for name, value in param_dict.items():
        logger.info("    {:15}: {}".format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}".format(len(param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # logger.info(cg.auxiliary_variables)
    # logger.info("______________________________")
    """
    weights = ""
    for va in cg.auxiliary_variables:
        if va.name == "sequence_generator_block0_cost_matrix_weighted_averages":
            weights = va
    weightsize = weights.shape
    weightsize.name = "weightsize"

    states = ""
    for va in cg.auxiliary_variables:
        if va.name == "sequence_generator_block0_cost_matrix_states":
            states = va
    statesize = states.shape
    statesize.name = "statesize"

    rep = ""
    for va in cg.auxiliary_variables:
        if va.name == "poemblock0_cost_block0hstatesRepeat":
            rep = va
    repsize = rep.shape
    repsize.name = "repsize"
    """

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config["finish_after"]),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config["saveto"], every_n_batches=config["save_freq"]),
    ]

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config["step_clipping"]),
                                 eval(config["step_rule"])()]))

    # Reload model if necessary
    if config["reload"]:
        extensions.append(LoadNMT(config["saveto"]))

    # Add sampling
    if config["hook_samples"] >= 1:
        logger.info("Building sampler")
        generated0 = block0.mygenerate(sampling_input0, sampling_hstates0)
        search_model0 = Model(generated0)
        generated1 = block1.mygenerate(sampling_input1, sampling_hstates1,
                                       sampling_lastrep0)
        search_model1 = Model(generated1)
        generated2 = block2.mygenerate(sampling_input2, sampling_hstates2,
                                       sampling_lastrep0, sampling_lastrep1)
        search_model2 = Model(generated2)
        extensions.append(
            Sampler(config=config,
                    model0=search_model0,
                    model1=search_model1,
                    model2=search_model2,
                    data_stream=tr_stream,
                    hook_samples=config["hook_samples"],
                    every_n_batches=config["sampling_freq"],
                    src_vocab_size=config["src_vocab_size"]))
        logger.info("End of building sampler")

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
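# An illustrative configuration dict covering the keys that main() above reads.
# The values are placeholders for the sketch only; the project's real settings
# come from configurations.get_config_cs2en().
example_config = {
    "enc_nhids": 1000,          # encoder hidden size used for the shared hstates
    "weight_scale": 0.01,       # std of the IsotropicGaussian initializer
    "dropout": 0.5,             # values < 1.0 enable dropout on the maxout output
    "finish_after": 1000000,    # FinishAfter(after_n_batches=...)
    "saveto": "model",          # checkpoint directory for CheckpointNMT / LoadNMT
    "save_freq": 500,           # CheckpointNMT(every_n_batches=...)
    "step_clipping": 1.0,       # StepClipping threshold
    "step_rule": "AdaDelta",    # name eval()'d to build the step rule
    "reload": True,             # append LoadNMT when resuming training
    "hook_samples": 2,          # number of sampled sentences per sampling hook
    "sampling_freq": 50,        # Sampler(every_n_batches=...)
    "src_vocab_size": 30000,    # passed through to the Sampler
}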
import argparse
import logging
import pprint

import config
from __init__ import main
from lexicon import create_dictionary_from_lexicon, create_dictionary_from_punctuation_marks
from stream import get_tr_stream, get_dev_stream

logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.INFO)

# Get the arguments
parser = argparse.ArgumentParser()
parser.add_argument("--proto", default="get_config",
                    help="Prototype config to use")
parser.add_argument("--bokeh", default=False, action="store_true",
                    help="Use bokeh server for plotting")
args = parser.parse_args()

if __name__ == "__main__":
    config = getattr(config, args.proto)()
    # logger.info("Model options:\n{}".format(pprint.pformat(config)))
    data_path = "%s/data_global_cmvn_with_phones_alignment_pitch_features.h5" % config["data_dir"]
    tr_stream = get_tr_stream(data_path, config["src_eos_idx"],
                              config["phones"]["sil"], config["trg_eos_idx"],
                              seq_len=config["seq_len"],
                              batch_size=config["batch_size"],
                              sort_k_batches=config["sort_k_batches"])
    dev_stream = get_dev_stream(data_path)
    main(config, tr_stream, dev_stream, args.bokeh)
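# Example invocation of the runner above. The script name "train.py" is
# illustrative; the --proto value must name a callable in the config module.
#
#     python train.py --proto get_config --bokeh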
            samples=samples,
            config=config,
            model=search_model,
            data_stream=dev_stream,
            normalize=config['normalized_bleu'],
            every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()


if __name__ == "__main__":
    assert sys.version_info >= (3, 4)

    # Get configurations for model
    configuration = configurations.get_config()
    logger.info("Model options:\n{}".format(pprint.pformat(configuration)))

    # Get data streams and call main
    main(configuration,
         get_tr_stream(**configuration),
         get_dev_stream(**configuration))