import tempfile

# get_tr_stream / get_dev_stream come from the project's stream module
# (matching the imports in the entry script further below).
from stream import get_tr_stream, get_dev_stream


def test_stream():
    # Dummy vocabulary
    vocab = {"<S>": 0, "</S>": 1, "<UNK>": 2}
    # Empty temp files stand in for the parallel source/target corpora
    with tempfile.NamedTemporaryFile() as src_data:
        with tempfile.NamedTemporaryFile() as trg_data:
            get_tr_stream(src_vocab=vocab, trg_vocab=vocab, src_data=src_data.name, trg_data=trg_data.name)
    # The dev stream needs only a validation set and the source vocabulary
    with tempfile.NamedTemporaryFile() as val_set:
        get_dev_stream(val_set=val_set.name, src_vocab=vocab)
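The test above only checks that the two constructors run on empty files. A minimal sketch of actually pulling a batch, assuming get_tr_stream accepts exactly the kwargs used here and returns a Fuel stream exposing get_epoch_iterator() (as the training loop in the next example relies on):

    import tempfile
    from stream import get_tr_stream

    vocab = {"<S>": 0, "</S>": 1, "<UNK>": 2}
    with tempfile.NamedTemporaryFile(mode="w") as src_data, \
            tempfile.NamedTemporaryFile(mode="w") as trg_data:
        # One dummy parallel sentence pair; unseen tokens map to <UNK>
        src_data.write("hello world\n")
        trg_data.write("bonjour monde\n")
        src_data.flush()
        trg_data.flush()
        stream = get_tr_stream(src_vocab=vocab, trg_vocab=vocab,
                               src_data=src_data.name, trg_data=trg_data.name)
        for batch in stream.get_epoch_iterator():
            print(batch)  # layout depends on the stream's declared sources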
Example #2
    f_init, f_next = trans.build_sample()
    logger.info('finished building sampling model: f_init, f_next')

    src_vocab = pickle.load(open(config['src_vocab'], 'rb'))
    trg_vocab = pickle.load(open(config['trg_vocab'], 'rb'))
    src_vocab = ensure_special_tokens(src_vocab,
                                      bos_idx=0, eos_idx=config['src_vocab_size'] - 1,
                                      unk_idx=config['unk_id'])
    trg_vocab = ensure_special_tokens(trg_vocab,
                                      bos_idx=0, eos_idx=config['trg_vocab_size'] - 1,
                                      unk_idx=config['unk_id'])
    trg_vocab_reverse = {index: word for word, index in trg_vocab.iteritems()}
    src_vocab_reverse = {index: word for word, index in src_vocab.iteritems()}
    logger.info('loaded dictionaries: src vocab size {}, trg vocab size {}.'.format(len(src_vocab), len(trg_vocab)))

    tr_stream = get_tr_stream(**config)
    dev_stream = get_dev_stream(**config)
    logger.info('start training!!!')
    batch_count = 0

    val_time = 0
    best_score = 0.
    for epoch in range(config['max_epoch']):
        for tr_data in tr_stream.get_epoch_iterator():
            batch_count += 1
            tr_fn(*tr_data)

            # sample
            if batch_count % config['sampling_freq'] == 0:
                trans_sample(tr_data[0], tr_data[2], f_init, f_next, config['hook_samples'],
                             src_vocab_reverse, trg_vocab_reverse, batch_count)
def init():
    parser = argparse.ArgumentParser()
    parser.add_argument('--proto', default='get_config_cs2en', help='Prototype config')
    args = parser.parse_args()
    config = getattr(configurations, args.proto)()
    main(config, get_tr_stream(**config), get_dev_stream(**config))
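ensure_special_tokens itself is not shown in any of these snippets. A minimal sketch of the behaviour its call sites imply (pinning <S>, </S> and <UNK> to fixed indices), purely illustrative:

    def ensure_special_tokens(vocab, bos_idx=0, eos_idx=0, unk_idx=1):
        # Illustrative sketch only: force the reserved tokens onto the
        # requested ids so the model and the data pipeline agree on them.
        vocab['<S>'] = bos_idx
        vocab['</S>'] = eos_idx
        vocab['<UNK>'] = unk_idx
        return vocab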
Example #4
def main():

    # set para
    config = getattr(configurations, "get_config_cs2en")()
    logger.info("Model options:\n{}".format(pprint.pformat(config)))
    tr_stream = get_tr_stream(**config)

    # Create Theano variables
    logger.info("Creating theano variables")

    source_sentence0 = tensor.lmatrix("source0")
    source_sentence_mask0 = tensor.matrix("source0_mask")
    target_sentence0 = tensor.lmatrix("target0")
    target_sentence_mask0 = tensor.matrix("target0_mask")

    source_sentence1 = tensor.lmatrix("source1")
    source_sentence_mask1 = tensor.matrix("source1_mask")
    target_sentence1 = tensor.lmatrix("target1")
    target_sentence_mask1 = tensor.matrix("target1_mask")

    source_sentence2 = tensor.lmatrix("source2")
    source_sentence_mask2 = tensor.matrix("source2_mask")
    target_sentence2 = tensor.lmatrix("target2")
    target_sentence_mask2 = tensor.matrix("target2_mask")

    sampling_input0 = tensor.lmatrix("input0")
    sampling_input1 = tensor.lmatrix("input1")
    sampling_input2 = tensor.lmatrix("input2")

    sampling_hstates0 = tensor.fmatrix("hstates0")
    sampling_hstates1 = tensor.fmatrix("hstates1")
    sampling_hstates2 = tensor.fmatrix("hstates2")

    sampling_lastrep0 = tensor.tensor3("lastrep0")
    sampling_lastrep1 = tensor.tensor3("lastrep1")

    # Zero vector (length enc_nhids) shared as the initial hidden state of
    # the first block
    hstates = theano.shared(value=numpy.zeros((config["enc_nhids"],), dtype=theano.config.floatX), name="hstates")

    # Get vocab
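    # get_attr_rec presumably follows the chain of "data_stream" attributes
    # down to the innermost merged stream; its component streams expose the
    # datasets that carry the vocabulary dictionaries.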
    sources = get_attr_rec(tr_stream, "data_stream")
    src_vocab = sources.data_streams[0].dataset.dictionary
    trg_vocab = sources.data_streams[1].dataset.dictionary

    # Construct model
    logger.info("Building PoemModel")

    block0 = PoemBlock(config=config, blockid="block0", name="poemblock0")
    block1 = PoemBlock(config=config, blockid="block1", name="poemblock1")
    block2 = PoemBlock(config=config, blockid="block2", name="poemblock2")

    cost0, hsta0, rep0 = block0.cost(
        source_sentence0,
        source_sentence_mask0,
        source_sentence_mask1,
        source_sentence_mask0,
        target_sentence0,
        target_sentence_mask0,
        hstates,
        lastrep0=None,
        lastrep1=None,
    )

    cost1, hsta1, rep1 = block1.cost(
        source_sentence1,
        source_sentence_mask0,
        source_sentence_mask1,
        source_sentence_mask1,
        target_sentence1,
        target_sentence_mask1,
        hsta0,
        lastrep0=rep0,
        lastrep1=None,
    )

    cost2, hsta2, rep2 = block2.cost(
        source_sentence2,
        source_sentence_mask0,
        source_sentence_mask1,
        source_sentence_mask2,
        target_sentence2,
        target_sentence_mask2,
        hsta1,
        lastrep0=rep0,
        lastrep1=rep1,
    )

    cost = cost0 + cost1 + cost2
    cost.name = "total_cost"

    logger.info("Creating computational graph")

    cg = ComputationGraph(cost)

    # Initialize model
    logger.info("Initializing model")
    # Identical initialization for all three blocks
    for block in (block0, block1, block2):
        block.set_initw(IsotropicGaussian(config["weight_scale"]))
        block.set_initb(Constant(0))
        block.push_initialization_config()
        block.set_specialinit(Orthogonal(), Orthogonal())
        block.initialize()

    # apply dropout for regularization
    if config["dropout"] < 1.0:
        # dropout is applied to the output of maxout, as in GroundHog
        logger.info("Applying dropout")
        dropout_inputs = [x for x in cg.intermediary_variables if x.name == "maxout_apply_output"]
        cg = apply_dropout(cg, dropout_inputs, config["dropout"])

    # Print shapes

    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info("    {:15}: {}".format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names

    param_dict = Selector(block0).get_parameters()
    logger.info("Parameter names: ")
    for name, value in param_dict.items():
        logger.info("    {:15}: {}".format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}".format(len(param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # logger.info(cg.auxiliary_variables)
    # logger.info("______________________________")

    """
    weights = ""
    for va in cg.auxiliary_variables:
        if va.name == "sequence_generator_block0_cost_matrix_weighted_averages":
            weights = va

    weightsize = weights.shape
    weightsize.name = "weightsize"

    states = ""
    for va in cg.auxiliary_variables:
        if va.name == "sequence_generator_block0_cost_matrix_states":
            states = va

    statesize = states.shape
    statesize.name = "statesize"

    rep = ""
    for va in cg.auxiliary_variables:
        if va.name == "poemblock0_cost_block0hstatesRepeat":
            rep = va

    repsize = rep.shape
    repsize.name = "repsize"

    """

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config["finish_after"]),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config["saveto"], every_n_batches=config["save_freq"]),
    ]

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    algorithm = GradientDescent(
        cost=cost,
        parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config["step_clipping"]), eval(config["step_rule"])()]),
    )

    # Reload model if necessary
    if config["reload"]:
        extensions.append(LoadNMT(config["saveto"]))

    # Add sampling

    if config["hook_samples"] >= 1:
        logger.info("Building sampler")

        generated0 = block0.mygenerate(sampling_input0, sampling_hstates0)
        search_model0 = Model(generated0)

        generated1 = block1.mygenerate(sampling_input1, sampling_hstates1, sampling_lastrep0)
        search_model1 = Model(generated1)

        generated2 = block2.mygenerate(sampling_input2, sampling_hstates2, sampling_lastrep0, sampling_lastrep1)
        search_model2 = Model(generated2)

        extensions.append(
            Sampler(
                config=config,
                model0=search_model0,
                model1=search_model1,
                model2=search_model2,
                data_stream=tr_stream,
                hook_samples=config["hook_samples"],
                every_n_batches=config["sampling_freq"],
                src_vocab_size=config["src_vocab_size"],
            )
        )

        logger.info("End of building sampler")

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions)

    # Train!
    main_loop.run()
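A design note on the step rule above: eval(config["step_rule"])() instantiates whatever class name the config string carries. An explicit registry avoids eval; a minimal sketch, assuming the standard Blocks step rules:

    from blocks.algorithms import AdaDelta, Adam, Scale

    # Hypothetical stand-in for eval(config["step_rule"])()
    STEP_RULES = {"AdaDelta": AdaDelta, "Adam": Adam, "Scale": Scale}

    def make_step_rule(name):
        return STEP_RULES[name]()

The entry script below is a separate file: it builds both streams from a single HDF5 dataset and hands them to a main() imported from the package.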
import argparse
import logging
import pprint

import config

from __init__ import main
from lexicon import create_dictionary_from_lexicon, create_dictionary_from_punctuation_marks
from stream import get_tr_stream, get_dev_stream

logger = logging.getLogger(__name__)
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.INFO)

# Get the arguments
parser = argparse.ArgumentParser()
parser.add_argument("--proto",  default="get_config", help="Prototype config to use for config")
parser.add_argument("--bokeh",  default=False, action="store_true", help="Use bokeh server for plotting")
args = parser.parse_args()


if __name__ == "__main__":
    config = getattr(config, args.proto)()
    #logger.info("Model options:\n{}".format(pprint.pformat(config)))

    data_path = "%s/data_global_cmvn_with_phones_alignment_pitch_features.h5" % config["data_dir"]
    tr_stream = get_tr_stream(
        data_path, config["src_eos_idx"], config["phones"]["sil"], config["trg_eos_idx"],
        seq_len=config["seq_len"], batch_size=config["batch_size"],
        sort_k_batches=config["sort_k_batches"])
    dev_stream = get_dev_stream(data_path)
    main(config, tr_stream, dev_stream, args.bokeh)
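Both streams here come from one HDF5 file. A quick way to check what that file provides before training, sketched with Fuel's H5PYDataset (assuming the file was packed in Fuel's format; the printed source names are whatever the file declares):

    from fuel.datasets import H5PYDataset

    dataset = H5PYDataset(data_path, which_sets=("train",))
    print(dataset.provides_sources)  # e.g. features, phones, targets
    print(dataset.num_examples)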
Example #6
                          samples=samples,
                          config=config,
                          model=search_model,
                          data_stream=dev_stream,
                          normalize=config['normalized_bleu'],
                          every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()


if __name__ == "__main__":
    assert sys.version_info >= (3, 4)
    # Get configurations for model
    configuration = configurations.get_config()
    logger.info("Model options:\n{}".format(pprint.pformat(configuration)))
    # Get data streams and call main
    main(configuration, get_tr_stream(**configuration),
         get_dev_stream(**configuration))
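All of these entry points follow the same pattern: a flat config dict drives stream construction and training. A minimal sketch of such a prototype, listing only keys that the snippets above actually read (the values are illustrative placeholders):

    def get_config():
        config = {}
        # Vocabulary and special tokens
        config['src_vocab_size'] = 30000
        config['trg_vocab_size'] = 30000
        config['unk_id'] = 1
        # Optimization
        config['step_rule'] = 'AdaDelta'
        config['step_clipping'] = 1.0
        config['weight_scale'] = 0.01
        config['dropout'] = 0.5
        # Schedule and checkpointing
        config['max_epoch'] = 5
        config['finish_after'] = 1000000
        config['save_freq'] = 500
        config['saveto'] = 'model'
        config['reload'] = True
        # Monitoring and validation
        config['hook_samples'] = 2
        config['sampling_freq'] = 50
        config['bleu_val_freq'] = 5000
        config['normalized_bleu'] = True
        return config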