Example #1
def init_params_envsim(p):

    p = param_init_fflayer(options={},
                           params=p,
                           prefix='es_1',
                           nin=action_size + state_size,
                           nout=nfe,
                           ortho=False,
                           batch_norm=True)
    p = param_init_fflayer(options={},
                           params=p,
                           prefix='es_2',
                           nin=nfe,
                           nout=nfe,
                           ortho=False,
                           batch_norm=True)
    p = param_init_fflayer(options={},
                           params=p,
                           prefix='es_state',
                           nin=nfe,
                           nout=state_size,
                           ortho=False,
                           batch_norm=False)
    p = param_init_fflayer(options={},
                           params=p,
                           prefix='es_reward',
                           nin=nfe,
                           nout=reward_size,
                           ortho=False,
                           batch_norm=False)

    return init_tparams(p)
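
Note: this initializer depends on module-level settings (action_size, state_size, reward_size, nfe) and helpers (param_init_fflayer, init_tparams) defined elsewhere in its codebase. A minimal sketch of plausible values, purely as an assumption for context:

# Hypothetical environment dimensions and hidden width; the real values live
# in the surrounding module, not here.
state_size = 4
action_size = 1
reward_size = 1
nfe = 512

tparams = init_params_envsim({})   # dict of Theano shared variables keyed by the 'es_*' prefixes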
Example #2
def load_model(
        embed_map=None,
        path_to_model=PATH_TO_MODEL,            # model opts (.pkl)
        path_to_params=PATH_TO_PARAMS,          # model params (.npz)
        path_to_dictionary=PATH_TO_DICTIONARY,
        path_to_word2vec=PATH_TO_WORD2VEC
        ):
    """
    Load all model components + apply vocab expansion
    """
    # Load the worddict
    print 'Loading dictionary...'
    with open(path_to_dictionary, 'rb') as f:
        worddict = pkl.load(f)

    # Create inverted dictionary
    print 'Creating inverted dictionary...'
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # Load model options
    print 'Loading model options...'
    with open(path_to_model, 'rb') as f:
        options = pkl.load(f)

    # Load parameters
    print 'Loading model parameters...'
    params = init_params(options)
    params = load_params(path_to_params, params)
    tparams = init_tparams(params)

    # Extractor functions
    print 'Compiling encoder...'
    trng = RandomStreams(1234)
    trng, x, x_mask, ctx, emb = build_encoder(tparams, options)
    f_enc = theano.function([x, x_mask], ctx, name='f_enc')
    f_emb = theano.function([x], emb, name='f_emb')
    trng, embedding, x_mask, ctxw2v = build_encoder_w2v(tparams, options)
    f_w2v = theano.function([embedding, x_mask], ctxw2v, name='f_w2v')

    # Load word2vec, if applicable
    if embed_map is None:
        print 'Loading word2vec embeddings...'
        embed_map = load_googlenews_vectors(path_to_word2vec)

    # Lookup table using vocab expansion trick
    print 'Creating word lookup tables...'
    table = lookup_table(options, embed_map, worddict, word_idict, f_emb)

    # Store everything we need in a dictionary
    print 'Packing up...'
    model = {}
    model['options'] = options
    model['table'] = table
    model['f_w2v'] = f_w2v

    return model
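
For context, a minimal sketch of how the returned dictionary might be consumed downstream; the tensor shapes and the 620-dimensional fallback are assumptions, and the real encode() pipeline lives elsewhere in the skip-thoughts code:

import numpy

model = load_model()
table, f_w2v = model['table'], model['f_w2v']

words = 'a quick example sentence'.split()
dim = table[words[0]].shape[0] if words[0] in table else 620  # word-vector width (620 is an assumption)

# Build a (timesteps x batch x dim) embedding tensor from the vocabulary-expanded
# table, plus an all-ones mask, then run the word2vec-input encoder.
embedding = numpy.zeros((len(words), 1, dim), dtype='float32')
for i, w in enumerate(words):
    if w in table:
        embedding[i, 0] = table[w]
x_mask = numpy.ones((len(words), 1), dtype='float32')

sentence_vector = f_w2v(embedding, x_mask)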
Example #3
def init_params_policy(p):

    p = param_init_fflayer(options={},
                           params=p,
                           prefix='pn_1',
                           nin=state_size,
                           nout=nfp,
                           ortho=False,
                           batch_norm=False)

    p = param_init_fflayer(options={},
                           params=p,
                           prefix='pn_2',
                           nin=nfp,
                           nout=nfp,
                           ortho=False,
                           batch_norm=False)

    p = param_init_fflayer(options={},
                           params=p,
                           prefix='pn_3_mu',
                           nin=nfp,
                           nout=action_size,
                           ortho=False,
                           batch_norm=False)
    p = param_init_fflayer(options={},
                           params=p,
                           prefix='pn_3_sigma',
                           nin=nfp,
                           nout=action_size,
                           ortho=False,
                           batch_norm=False)

    return init_tparams(p)
Example #4
def load_model(embed_map=None, pickle_table=None):
    """
    Load all model components + apply vocab expansion
    """
    # Load the worddict
    print 'Loading dictionary...'
    with open(path_to_dictionary, 'rb') as f:
        worddict = pkl.load(f)

    # Create inverted dictionary
    print 'Creating inverted dictionary...'
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # Load model options
    print 'Loading model options...'
    with open('%s.pkl' % path_to_model, 'rb') as f:
        options = pkl.load(f)

    # Load parameters
    print 'Loading model parameters...'
    params = init_params(options)
    params = load_params(path_to_model, params)
    tparams = init_tparams(params)

    # Extractor functions
    print 'Compiling encoder...'
    trng = RandomStreams(1234)
    trng, x, x_mask, ctx, emb = build_encoder(tparams, options)
    f_enc = theano.function([x, x_mask], ctx, name='f_enc')
    f_emb = theano.function([x], emb, name='f_emb')
    trng, embedding, x_mask, ctxw2v = build_encoder_w2v(tparams, options)
    f_w2v = theano.function([embedding, x_mask], ctxw2v, name='f_w2v')

    # Load word2vec, if applicable
    if embed_map is None:
        print 'Loading word2vec embeddings...'
        embed_map = load_googlenews_vectors(path_to_word2vec)

    # Lookup table using vocab expansion trick
    if pickle_table:
        t = numpy.load(pickle_table)
        table = OrderedDict()
        for k, v in t:
            table[k] = v
    else:
        print 'Creating word lookup tables...'
        table = lookup_table(options, embed_map, worddict, word_idict, f_emb)

    # Store everything we need in a dictionary
    print 'Packing up...'
    model = {}
    model['options'] = options
    model['table'] = table
    model['f_w2v'] = f_w2v

    return model
Example #5
def init_params():
    params = {}

    params = param_init_gru({}, params, prefix='gru', nin=1, dim=512)

    #params = param_init_fflayer({},params,prefix='ff_0',nin=512+1,nout=512,batch_norm=False)
    params = param_init_fflayer({},
                                params,
                                prefix='ff_1',
                                nin=512,
                                nout=512,
                                batch_norm=False)
    params = param_init_fflayer({},
                                params,
                                prefix='ff_2',
                                nin=512,
                                nout=512,
                                batch_norm=False)
    params = param_init_fflayer({},
                                params,
                                prefix='ff_3',
                                nin=512,
                                nout=1,
                                batch_norm=False)

    return init_tparams(params)
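
To see what this initializer actually creates, one can simply iterate over the returned shared variables (the '<prefix>_W'/'<prefix>_b' naming is how these Theano helpers usually behave, stated here as an assumption rather than documentation):

tparams = init_params()
for name, value in tparams.iteritems():
    print name, value.get_value().shape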
Example #6
def load_model(
        path_to_model='/home/shunan/Code/skip-thoughts/experiments/amazon/amazon_model_bi.npz',
        path_to_dictionary='/home/shunan/Code/skip-thoughts/experiments/amazon/word_dicts.pkl',
        embed_map=None):
    """
    Load all model components + apply vocab expansion
    """
    # Load the worddict
    print 'Loading dictionary...'
    with open(path_to_dictionary, 'rb') as f:
        worddict = pkl.load(f)

    # Create inverted dictionary
    print 'Creating inverted dictionary...'
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # Load model options
    print 'Loading model options...'
    with open('%s.pkl' % path_to_model, 'rb') as f:
        options = pkl.load(f)

    # Load parameters
    print 'Loading model parameters...'
    params = init_params(options)
    params = load_params(path_to_model, params)
    tparams = init_tparams(params)

    # Extractor functions
    print 'Compiling encoder...'
    trng = RandomStreams(1234)
    trng, x, x_mask, ctx, emb = build_encoder(tparams, options)
    f_enc = theano.function([x, x_mask], ctx, name='f_enc')
    f_emb = theano.function([x], emb, name='f_emb')
    trng, embedding, x_mask, ctxw2v = build_encoder_w2v(tparams, options)
    f_w2v = theano.function([embedding, x_mask], ctxw2v, name='f_w2v')

    # Load word2vec, if applicable
    # if embed_map == None:
    #     print 'Loading word2vec embeddings...'
    #     embed_map = load_googlenews_vectors(path_to_word2vec)

    # Lookup table using vocab expansion trick
    print 'Creating word lookup tables...'
    table = lookup_table(options, embed_map, worddict, word_idict, f_emb)

    # Store everything we need in a dictionary
    print 'Packing up...'
    model = {}
    model['options'] = options
    model['table'] = table
    model['f_w2v'] = f_w2v

    # model is just a dict.
    return model
Example #7
    def __init__(self, num_hidden, num_features, seq_length, mb_size, tf_states, rf_states):
        
        tf_states = T.specify_shape(tf_states, (seq_length, mb_size, num_features))
        rf_states = T.specify_shape(rf_states, (seq_length, mb_size, num_features))

        hidden_state_features = T.specify_shape(T.concatenate([tf_states, rf_states], axis = 1), (seq_length, mb_size * 2, num_features))

        gru_params_1 = init_tparams(param_init_gru(None, {}, prefix = "gru1", dim = num_hidden, nin = num_features))
        #gru_params_2 = init_tparams(param_init_gru(None, {}, prefix = "gru2", dim = num_hidden, nin = num_hidden + num_features))
        #gru_params_3 = init_tparams(param_init_gru(None, {}, prefix = "gru3", dim = num_hidden, nin = num_hidden + num_features))

        gru_1_out = gru_layer(gru_params_1, hidden_state_features, None, prefix = 'gru1')[0]
        #gru_2_out = gru_layer(gru_params_2, T.concatenate([gru_1_out, hidden_state_features], axis = 2), None, prefix = 'gru2', backwards = True)[0]
        #gru_3_out = gru_layer(gru_params_3, T.concatenate([gru_2_out, hidden_state_features], axis = 2), None, prefix = 'gru3')[0]

        final_out_recc = T.specify_shape(T.mean(gru_1_out, axis = 0), (mb_size * 2, num_hidden))

        h_out_1 = DenseLayer((mb_size * 2, num_hidden), num_units = num_hidden, nonlinearity=lasagne.nonlinearities.rectify)
        #h_out_2 = DenseLayer((mb_size * 2, num_hidden), num_units = num_hidden, nonlinearity=lasagne.nonlinearities.rectify)
        #h_out_3 = DenseLayer((mb_size * 2, num_hidden), num_units = num_hidden, nonlinearity=lasagne.nonlinearities.rectify)
        h_out_4 = DenseLayer((mb_size * 2, num_hidden), num_units = 1, nonlinearity=None)

        h_out_1_value = h_out_1.get_output_for(final_out_recc)
        h_out_4_value = h_out_4.get_output_for(h_out_1_value)

        raw_y = h_out_4_value
        #raw_y = T.clip(h_out_4_value, -10.0, 10.0)
        classification = T.nnet.sigmoid(raw_y)

        #tf comes before rf.  
        p_real = classification[:mb_size]
        p_gen = classification[mb_size:]

        #bce = lambda r,t: t * T.nnet.softplus(-r) + (1 - t) * (r + T.nnet.softplus(-r))

        self.d_cost_real = bce(p_real, 0.9 * T.ones(p_real.shape)).mean()
        self.d_cost_gen = bce(p_gen, 0.1 + T.zeros(p_gen.shape)).mean()
        self.g_cost_d = bce(p_gen, 0.9 * T.ones(p_gen.shape)).mean()
        self.d_cost = self.d_cost_real + self.d_cost_gen
        self.g_cost = self.g_cost_d


        self.classification = classification

        self.params = []
        self.params += lasagne.layers.get_all_params(h_out_4,trainable=True)
        #self.params += lasagne.layers.get_all_params(h_out_3,trainable=True)
        #self.params += lasagne.layers.get_all_params(h_out_2,trainable=True)
        self.params += lasagne.layers.get_all_params(h_out_1,trainable=True)

        self.params += gru_params_1.values()
        #self.params += gru_params_2.values()
        #self.params += gru_params_3.values()

        self.accuracy = T.mean(T.eq(T.ones(p_real.shape).flatten(), T.gt(p_real, 0.5).flatten())) + T.mean(T.eq(T.ones(p_gen.shape).flatten(), T.lt(p_gen, 0.5).flatten()))
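
The constructor above calls a helper `bce` that is only hinted at by the commented-out lambda; since it is applied to sigmoid outputs rather than raw logits, a plausible stand-in (an assumption, not necessarily the original import) is Theano's built-in binary cross-entropy:

import theano.tensor as T

# bce(output, target): element-wise binary cross-entropy on probabilities,
# matching how bce is called on slices of `classification` above.
bce = T.nnet.binary_crossentropy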
Example #8
def load_model(path_to_model=default_model):
    """
    Load all model components
    """
    print path_to_model

    # Load the worddict
    print 'Loading dictionary...'
    with open(path_to_model + '.dictionary.pkl', 'rb') as f:
        worddict = pkl.load(f)

    # Create inverted dictionary
    print 'Creating inverted dictionary...'
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # Load model options
    print 'Loading model options...'
    with open(path_to_model + '.pkl', 'rb') as f:
        options = pkl.load(f)

    # Load parameters
    print 'Loading model parameters...'
    params = init_params(options)
    params = load_params(path_to_model, params)
    tparams = init_tparams(params)

    # Extractor functions

    print 'Compiling image encoder...'
    trng, [im], images = build_image_encoder(tparams, options)
    f_ienc = theano.function([im], images, name='f_ienc')

    print 'Compiling sentence encoder...'
    trng = RandomStreams(1234)
    trng, [x, x_mask], sentences = build_sentence_encoder(tparams, options)
    f_senc = theano.function([x, x_mask], sentences, name='f_senc')

    # Store everything we need in a dictionary
    print 'Packing up...'
    model = {}
    model['options'] = options
    model['worddict'] = worddict
    model['word_idict'] = word_idict
    model['f_senc'] = f_senc
    model['f_ienc'] = f_ienc
    return model
Example #9
def load_model():
    """
    Load all model components
    """
    print path_to_model

    # Load the worddict
    print "Loading dictionary..."
    with open("%s.dictionary.pkl" % path_to_model, "rb") as f:
        worddict = pkl.load(f)

    # Create inverted dictionary
    print "Creating inverted dictionary..."
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = "<eos>"
    word_idict[1] = "UNK"

    # Load model options
    print "Loading model options..."
    with open("%s.pkl" % path_to_model, "rb") as f:
        options = pkl.load(f)

    # Load parameters
    print "Loading model parameters..."
    params = init_params(options)
    params = load_params(path_to_model, params)
    tparams = init_tparams(params)

    # Extractor functions
    print "Compiling sentence encoder..."
    trng = RandomStreams(1234)
    trng, [x, x_mask], sentences = build_sentence_encoder(tparams, options)
    f_senc = theano.function([x, x_mask], sentences, name="f_senc")

    print "Compiling image encoder..."
    trng, [im], images = build_image_encoder(tparams, options)
    f_ienc = theano.function([im], images, name="f_ienc")

    # Store everything we need in a dictionary
    print "Packing up..."
    model = {}
    model["options"] = options
    model["worddict"] = worddict
    model["word_idict"] = word_idict
    model["f_senc"] = f_senc
    model["f_ienc"] = f_ienc
    return model
Example #10
def load_model(path_to_model=default_model):
    """
    Load all model components
    """
    print path_to_model

    # Load the worddict
    print 'Loading dictionary...'
    with open('%s.dictionary.pkl'%path_to_model, 'rb') as f:
        worddict = pkl.load(f)

    # Create inverted dictionary
    print 'Creating inverted dictionary...'
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # Load model options
    print 'Loading model options...'
    with open('%s.pkl'%path_to_model, 'rb') as f:
        options = pkl.load(f)

    # Load parameters
    print 'Loading model parameters...'
    params = init_params(options)
    params = load_params(path_to_model, params)
    tparams = init_tparams(params)

    # Extractor functions
    print 'Compiling sentence encoder...'
    trng = RandomStreams(1234)
    trng, [x, x_mask], sentences = build_sentence_encoder(tparams, options)
    f_senc = theano.function([x, x_mask], sentences, name='f_senc')

    print 'Compiling image encoder...'
    trng, [im], images = build_image_encoder(tparams, options)
    f_ienc = theano.function([im], images, name='f_ienc')

    # Store everything we need in a dictionary
    print 'Packing up...'
    model = {}
    model['options'] = options
    model['worddict'] = worddict
    model['word_idict'] = word_idict
    model['f_senc'] = f_senc
    model['f_ienc'] = f_ienc
    return model
Example #11
def load_model(
        path_to_model=PATH_TO_MODEL,            # model opts (.pkl)
        path_to_params=PATH_TO_PARAMS,          # model params (.npz)
        path_to_dictionary=PATH_TO_DICTIONARY
        ):
    """
    Load a trained model for decoding
    """
    # Load the worddict
    print 'Loading dictionary...'
    with open(path_to_dictionary, 'rb') as f:
        worddict = pkl.load(f)

    # Create inverted dictionary
    print 'Creating inverted dictionary...'
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # Load model options
    print 'Loading model options...'
    with open('%s.pkl'%path_to_model, 'rb') as f:
        options = pkl.load(f)

    # Load parameters
    print 'Loading model parameters...'
    params = init_params(options)
    params = load_params(path_to_params, params)
    tparams = init_tparams(params)

    # Sampler.
    trng = RandomStreams(1234)
    f_init, f_next = build_sampler(tparams, options, trng)

    # Pack everything up
    dec = dict()
    dec['options'] = options
    dec['trng'] = trng
    dec['worddict'] = worddict
    dec['word_idict'] = word_idict
    dec['tparams'] = tparams
    dec['f_init'] = f_init
    dec['f_next'] = f_next
    return dec
Example #12
def load_model(
        path_to_model=PATH_TO_MODEL,  # model opts (.pkl)
        path_to_params=PATH_TO_PARAMS,  # model params (.npz)
        path_to_dictionary=PATH_TO_DICTIONARY):
    """
    Load a trained model for decoding
    """
    # Load the worddict
    print 'Loading dictionary...'
    with open(path_to_dictionary, 'rb') as f:
        worddict = pkl.load(f)

    # Create inverted dictionary
    print 'Creating inverted dictionary...'
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # Load model options
    print 'Loading model options...'
    with open('%s.pkl' % path_to_model, 'rb') as f:
        options = pkl.load(f)

    # Load parameters
    print 'Loading model parameters...'
    params = init_params(options)
    params = load_params(path_to_params, params)
    tparams = init_tparams(params)

    # Sampler.
    trng = RandomStreams(1234)
    f_init, f_next = build_sampler(tparams, options, trng)

    # Pack everything up
    dec = dict()
    dec['options'] = options
    dec['trng'] = trng
    dec['worddict'] = worddict
    dec['word_idict'] = word_idict
    dec['tparams'] = tparams
    dec['f_init'] = f_init
    dec['f_next'] = f_next
    return dec
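
A brief, illustrative look at what the returned decoder dictionary exposes; the actual sampling loop (beam or greedy search over f_init/f_next) lives in the surrounding decoding code and is not reproduced here:

dec = load_model()

f_init, f_next = dec['f_init'], dec['f_next']   # sampler pair built by build_sampler
word_of = dec['word_idict']                     # index -> token, with 0 = '<eos>' and 1 = 'UNK'
trng = dec['trng']                              # the RandomStreams object used at build time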
Example #13
def init_params_forward():

    p = {}

    param_init_lngru({}, params=p, prefix='gru1', nin=1024, dim=1024)

    tparams = init_tparams(p)

    tparams['W1'] = theano.shared(
        0.03 * rng.normal(0, 1, size=(1, 1024, 1024)).astype('float32'))
    tparams['W2'] = theano.shared(
        0.03 * rng.normal(0, 1, size=(1, 1024, 1024)).astype('float32'))
    tparams['Wy'] = theano.shared(
        0.03 * rng.normal(0, 1, size=(1, 1024, 11)).astype('float32'))
    tparams['W0'] = theano.shared(
        0.03 * rng.normal(0, 1, size=(1024 + nf, 1024)).astype('float32'))

    return tparams
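
This initializer also relies on module-level state not shown here; a minimal sketch of plausible definitions (the names `rng` and `nf` come from the snippet, the values are assumptions):

import numpy

rng = numpy.random.RandomState(1234)   # NumPy RNG behind the Gaussian initialization above
nf = 48                                # extra feature width concatenated into W0's input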
Example #14
def load_model(save_dir, model_name, best=True):
    """
    Load all model components
    Inputs are only save_dir and model_name, since filenames are assumed to follow the usual convention
    """

    model_options = {}
    model_options['save_dir'] = save_dir
    model_options['model_name'] = model_name

    # Load model
    print 'Loading model'
    opt_filename_reload = get_opt_filename(model_options, previous=True)
    with open(opt_filename_reload, 'rb') as f:
        model = pkl.load(f)

    options = model['options']

    # Load parameters
    print 'Loading model parameters...'
    params = init_params(options)
    params_filename = get_npz_filename(model_options, best=best, previous=True)
    params = load_params(params_filename, params)
    tparams = init_tparams(params)

    # Extractor functions
    print 'Compiling sentence encoder...'
    [x, x_mask], sentences = build_sentence_encoder(tparams, options)
    f_senc = theano.function([x, x_mask], sentences, name='f_senc')

    print 'Compiling image encoder...'
    [im], images = build_image_encoder(tparams, options)
    f_ienc = theano.function([im], images, name='f_ienc')

    print 'Compiling error computation...'
    [s, im], errs = build_errors(options)
    f_err = theano.function([s, im], errs, name='f_err')

    # Store everything we need in a dictionary
    print 'Packing up...'
    model['f_senc'] = f_senc
    model['f_ienc'] = f_ienc
    model['f_err'] = f_err
    return model
Example #15
def load_model_path(path_to_model):
    """
    Load all model components
    """
    print path_to_model

    # Load model
    print 'Loading model'
    with open(path_to_model + '.pkl', 'rb') as f:
        model = pkl.load(f)

    options = model['options']

    # Load parameters
    print 'Loading model parameters...'
    params = init_params(options)

    params = load_params(path_to_model + '.npz', params)
    tparams = init_tparams(params)

    # Extractor functions
    print 'Compiling sentence encoder...'
    [x, x_mask], sentences = build_sentence_encoder(tparams, options)
    f_senc = theano.function([x, x_mask], sentences, name='f_senc')

    print 'Compiling image encoder...'
    [im], images = build_image_encoder(tparams, options)
    f_ienc = theano.function([im], images, name='f_ienc')

    print 'Compiling error computation...'
    [s, im], errs = build_errors(options)
    f_err = theano.function([s, im], errs, name='f_err')

    # Store everything we need in a dictionary
    print 'Packing up...'
    model['f_senc'] = f_senc
    model['f_ienc'] = f_ienc
    model['f_err'] = f_err
    return model
Example #16
def load_model(path_to_model):
    """
    Load all model components
    """
    print path_to_model

    # Load model
    print 'Loading model'
    with open(path_to_model + '.pkl', 'rb') as f:
        model = pkl.load(f)

    options = model['options']

    # Load parameters
    print 'Loading model parameters...'
    params = init_params(options)
    params = load_params(path_to_model + '.npz', params)
    tparams = init_tparams(params)

    # Extractor functions
    print 'Compiling sentence encoder...'
    [x, x_mask], sentences = build_sentence_encoder(tparams, options)
    f_senc = theano.function([x, x_mask], sentences, name='f_senc')

    print 'Compiling image encoder...'
    [im], images = build_image_encoder(tparams, options)
    f_ienc = theano.function([im], images, name='f_ienc')

    print 'Compiling error computation...'
    [s, im], errs = build_errors(options)
    f_err = theano.function([s,im], errs, name='f_err')

    # Store everything we need in a dictionary
    print 'Packing up...'
    model['f_senc'] = f_senc
    model['f_ienc'] = f_ienc
    model['f_err'] = f_err
    return model
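
A hedged usage sketch for the three compiled functions; the dtypes, the 'dim_image' option key, and the idea that f_err consumes the already-encoded vectors are assumptions about the surrounding visual-semantic-embedding code:

import numpy

model = load_model('/path/to/model')     # hypothetical path, without the .pkl/.npz suffix
options = model['options']

n_timesteps, n_samples = 7, 2
x = numpy.zeros((n_timesteps, n_samples), dtype='int64')        # word indices
x_mask = numpy.ones((n_timesteps, n_samples), dtype='float32')  # sequence mask
im = numpy.zeros((n_samples, options['dim_image']), dtype='float32')

s_emb = model['f_senc'](x, x_mask)   # sentence embeddings
i_emb = model['f_ienc'](im)          # image embeddings
errs = model['f_err'](s_emb, i_emb)  # ranking errors between the two modalities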
Example #17
def train(
        random_seed=1234,
        dim_word=256,  # word vector dimensionality
        ctx_dim=-1,  # context vector dimensionality, auto set
        dim=1000,  # the number of LSTM units
        n_layers_out=1,
        n_layers_init=1,
        encoder='none',
        encoder_dim=100,
        prev2out=False,
        ctx2out=False,
        patience=10,
        max_epochs=5000,
        dispFreq=100,
        decay_c=0.,
        alpha_c=0.,
        alpha_entropy_r=0.,
        lrate=0.01,
        selector=False,
        n_words=100000,
        maxlen=100,  # maximum length of the description
        optimizer='adadelta',
        clip_c=2.,
        batch_size=64,
        valid_batch_size=64,
        save_model_dir='/data/lisatmp3/yaoli/exp/capgen_vid/attention/test/',
        validFreq=10,
        saveFreq=10,  # save the parameters after every saveFreq updates
        sampleFreq=10,  # generate some samples after every sampleFreq updates
        metric='blue',
        dataset='youtube2text',
        video_feature='googlenet',
        use_dropout=False,
        reload_=False,
        from_dir=None,
        K=10,
        OutOf=240,
        verbose=True,
        debug=True):
    rng_numpy, rng_theano = utils.get_two_rngs()

    model_options = locals().copy()
    if 'self' in model_options:
        del model_options['self']
    with open('%smodel_options.pkl' % save_model_dir, 'wb') as f:
        pkl.dump(model_options, f)

    # instance model
    layers = Layers()
    model = Model()

    print 'Loading data'
    engine = data_engine.Movie2Caption('attention', dataset, video_feature,
                                       batch_size, valid_batch_size, maxlen,
                                       n_words, K, OutOf)
    model_options['ctx_dim'] = engine.ctx_dim
    model_options['n_words'] = engine.n_words
    print 'n_words:', model_options['n_words']

    # set test values, for debugging
    idx = engine.kf_train[0]
    [x_tv, mask_tv, ctx_tv, ctx_mask_tv
     ] = data_engine.prepare_data(engine,
                                  [engine.train[index] for index in idx])

    print 'init params'
    t0 = time.time()
    params = model.init_params(model_options)

    # reloading
    if reload_:
        model_saved = from_dir + '/model_best_so_far.npz'
        assert os.path.isfile(model_saved)
        print "Reloading model params..."
        params = utils.load_params(model_saved, params)

    tparams = utils.init_tparams(params)

    trng, use_noise, \
          x, mask, ctx, mask_ctx, \
          cost, extra = \
          model.build_model(tparams, model_options)
    alphas = extra[1]
    betas = extra[2]
    print 'building sampler'
    f_init, f_next = model.build_sampler(tparams, model_options, use_noise,
                                         trng)
    # before any regularizer
    print 'building f_log_probs'
    f_log_probs = theano.function([x, mask, ctx, mask_ctx],
                                  -cost,
                                  profile=False,
                                  on_unused_input='ignore')

    cost = cost.mean()
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    if alpha_c > 0.:
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * ((1. - alphas.sum(0))**2).sum(-1).mean()
        cost += alpha_reg

    if alpha_entropy_r > 0:
        alpha_entropy_r = theano.shared(numpy.float32(alpha_entropy_r),
                                        name='alpha_entropy_r')
        alpha_reg_2 = alpha_entropy_r * (-tensor.sum(
            alphas * tensor.log(alphas + 1e-8), axis=-1)).sum(-1).mean()
        cost += alpha_reg_2
    else:
        alpha_reg_2 = tensor.zeros_like(cost)
    print 'building f_alpha'
    f_alpha = theano.function([x, mask, ctx, mask_ctx], [alphas, betas],
                              name='f_alpha',
                              on_unused_input='ignore')

    print 'compute grad'
    grads = tensor.grad(cost, wrt=utils.itemlist(tparams))
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c,
                              g))
        grads = new_grads

    lr = tensor.scalar(name='lr')
    print 'build train fns'
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads,
                                              [x, mask, ctx, mask_ctx], cost,
                                              extra + grads)

    print 'compilation took %.4f sec' % (time.time() - t0)
    print 'Optimization'

    history_errs = []
    # reload history
    if reload_:
        print 'loading history error...'
        history_errs = numpy.load(
            from_dir + 'model_best_so_far.npz')['history_errs'].tolist()

    bad_counter = 0

    processes = None
    queue = None
    rqueue = None
    shared_params = None

    uidx = 0
    uidx_best_blue = 0
    uidx_best_valid_err = 0
    estop = False
    best_p = utils.unzip(tparams)
    best_blue_valid = 0
    best_valid_err = 999
    alphas_ratio = []
    for eidx in xrange(max_epochs):
        n_samples = 0
        train_costs = []
        grads_record = []
        print 'Epoch ', eidx
        for idx in engine.kf_train:
            tags = [engine.train[index] for index in idx]
            n_samples += len(tags)
            uidx += 1
            use_noise.set_value(1.)

            pd_start = time.time()
            x, mask, ctx, ctx_mask = data_engine.prepare_data(engine, tags)
            pd_duration = time.time() - pd_start
            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                continue

            ud_start = time.time()
            rvals = f_grad_shared(x, mask, ctx, ctx_mask)
            cost = rvals[0]
            probs = rvals[1]
            alphas = rvals[2]
            betas = rvals[3]
            grads = rvals[4:]
            grads, NaN_keys = utils.grad_nan_report(grads, tparams)
            if len(grads_record) >= 5:
                del grads_record[0]
            grads_record.append(grads)
            if NaN_keys != []:
                print 'grads contain NaN'
                import pdb
                pdb.set_trace()
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected in cost'
                import pdb
                pdb.set_trace()
            # update params
            f_update(lrate)
            ud_duration = time.time() - ud_start

            if eidx == 0:
                train_error = cost
            else:
                train_error = train_error * 0.95 + cost * 0.05
            train_costs.append(cost)

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Train cost mean so far', \
                  train_error, 'fetching data time spent (sec)', pd_duration, \
                  'update time spent (sec)', ud_duration, 'save_dir', save_model_dir
                alphas, betas = f_alpha(x, mask, ctx, ctx_mask)
                counts = mask.sum(0)
                betas_mean = (betas * mask).sum(0) / counts
                betas_mean = betas_mean.mean()
                print 'alpha ratio %.3f, betas mean %.3f' % (
                    alphas.min(-1).mean() /
                    (alphas.max(-1)).mean(), betas_mean)
                l = 0
                for vv in x[:, 0]:
                    if vv == 0:
                        break
                    if vv in engine.word_idict:
                        print '(', numpy.round(betas[l, 0],
                                               3), ')', engine.word_idict[vv],
                    else:
                        print '(', numpy.round(betas[l, 0], 3), ')', 'UNK',
                    l += 1
                print '(', numpy.round(betas[l, 0], 3), ')'

            if numpy.mod(uidx, saveFreq) == 0:
                pass

            if numpy.mod(uidx, sampleFreq) == 0:
                use_noise.set_value(0.)
                print '------------- sampling from train ----------'
                x_s = x
                mask_s = mask
                ctx_s = ctx
                ctx_mask_s = ctx_mask
                model.sample_execute(engine, model_options, tparams, f_init,
                                     f_next, x_s, ctx_s, ctx_mask_s, trng)
                print '------------- sampling from valid ----------'
                idx = engine.kf_valid[numpy.random.randint(
                    1,
                    len(engine.kf_valid) - 1)]
                tags = [engine.valid[index] for index in idx]
                x_s, mask_s, ctx_s, mask_ctx_s = data_engine.prepare_data(
                    engine, tags)
                model.sample_execute(engine, model_options, tparams, f_init,
                                     f_next, x_s, ctx_s, mask_ctx_s, trng)

            if validFreq != -1 and numpy.mod(uidx, validFreq) == 0:
                t0_valid = time.time()
                alphas, _ = f_alpha(x, mask, ctx, ctx_mask)
                ratio = alphas.min(-1).mean() / (alphas.max(-1)).mean()
                alphas_ratio.append(ratio)
                numpy.savetxt(save_model_dir + 'alpha_ratio.txt', alphas_ratio)

                current_params = utils.unzip(tparams)
                numpy.savez(save_model_dir + 'model_current.npz',
                            history_errs=history_errs,
                            **current_params)

                use_noise.set_value(0.)
                train_err = -1
                train_perp = -1
                valid_err = -1
                valid_perp = -1
                test_err = -1
                test_perp = -1
                if not debug:
                    # first compute train cost
                    if 0:
                        print 'computing cost on trainset'
                        train_err, train_perp = model.pred_probs(
                            engine,
                            'train',
                            f_log_probs,
                            verbose=model_options['verbose'])
                    else:
                        train_err = 0.
                        train_perp = 0.
                    if 1:
                        print 'validating...'
                        valid_err, valid_perp = model.pred_probs(
                            engine,
                            'valid',
                            f_log_probs,
                            verbose=model_options['verbose'],
                        )
                    else:
                        valid_err = 0.
                        valid_perp = 0.
                    if 1:
                        print 'testing...'
                        test_err, test_perp = model.pred_probs(
                            engine,
                            'test',
                            f_log_probs,
                            verbose=model_options['verbose'])
                    else:
                        test_err = 0.
                        test_perp = 0.

                mean_ranking = 0
                blue_t0 = time.time()
                scores, processes, queue, rqueue, shared_params = \
                    metrics.compute_score(
                    model_type='attention',
                    model_archive=current_params,
                    options=model_options,
                    engine=engine,
                    save_dir=save_model_dir,
                    beam=5, n_process=5,
                    whichset='both',
                    on_cpu=False,
                    processes=processes, queue=queue, rqueue=rqueue,
                    shared_params=shared_params, metric=metric,
                    one_time=False,
                    f_init=f_init, f_next=f_next, model=model
                    )
                '''
                 {'blue': {'test': [-1], 'valid': [77.7, 60.5, 48.7, 38.5, 38.3]},
                 'alternative_valid': {'Bleu_3': 0.40702270203174923,
                 'Bleu_4': 0.29276570520368456,
                 'CIDEr': 0.25247168210607884,
                 'Bleu_2': 0.529069629270047,
                 'Bleu_1': 0.6804308797115253,
                 'ROUGE_L': 0.51083584331688392},
                 'meteor': {'test': [-1], 'valid': [0.282787550236724]}}
                '''

                valid_B1 = scores['valid']['Bleu_1']
                valid_B2 = scores['valid']['Bleu_2']
                valid_B3 = scores['valid']['Bleu_3']
                valid_B4 = scores['valid']['Bleu_4']
                valid_Rouge = scores['valid']['ROUGE_L']
                valid_Cider = scores['valid']['CIDEr']
                valid_meteor = scores['valid']['METEOR']
                test_B1 = scores['test']['Bleu_1']
                test_B2 = scores['test']['Bleu_2']
                test_B3 = scores['test']['Bleu_3']
                test_B4 = scores['test']['Bleu_4']
                test_Rouge = scores['test']['ROUGE_L']
                test_Cider = scores['test']['CIDEr']
                test_meteor = scores['test']['METEOR']
                print 'computing meteor/blue score used %.4f sec, '\
                  'blue score: %.1f, meteor score: %.1f'%(
                time.time()-blue_t0, valid_B4, valid_meteor)
                history_errs.append([
                    eidx, uidx, train_err, train_perp, valid_perp, test_perp,
                    valid_err, test_err, valid_B1, valid_B2, valid_B3,
                    valid_B4, valid_meteor, valid_Rouge, valid_Cider, test_B1,
                    test_B2, test_B3, test_B4, test_meteor, test_Rouge,
                    test_Cider
                ])
                numpy.savetxt(save_model_dir + 'train_valid_test.txt',
                              history_errs,
                              fmt='%.3f')
                print 'save validation results to %s' % save_model_dir
                # save best model according to the best blue or meteor
                if len(history_errs) > 1 and \
                  valid_B4 > numpy.array(history_errs)[:-1,11].max():
                    print 'Saving to %s...' % save_model_dir,
                    numpy.savez(save_model_dir +
                                'model_best_blue_or_meteor.npz',
                                history_errs=history_errs,
                                **best_p)
                if len(history_errs) > 1 and \
                  valid_err < numpy.array(history_errs)[:-1,6].min():
                    best_p = utils.unzip(tparams)
                    bad_counter = 0
                    best_valid_err = valid_err
                    uidx_best_valid_err = uidx

                    print 'Saving to %s...' % save_model_dir,
                    numpy.savez(save_model_dir + 'model_best_so_far.npz',
                                history_errs=history_errs,
                                **best_p)
                    with open('%smodel_options.pkl' % save_model_dir,
                              'wb') as f:
                        pkl.dump(model_options, f)
                    print 'Done'
                elif len(history_errs) > 1 and \
                    valid_err >= numpy.array(history_errs)[:-1,6].min():
                    bad_counter += 1
                    print 'history best ', numpy.array(history_errs)[:,
                                                                     6].min()
                    print 'bad_counter ', bad_counter
                    print 'patience ', patience
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if test_B4 > 0.52 and test_meteor > 0.32:
                    print 'Saving to %s...' % save_model_dir,
                    numpy.savez(save_model_dir + 'model_' + str(uidx) + '.npz',
                                history_errs=history_errs,
                                **current_params)

                print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err, \
                  'best valid err so far',best_valid_err
                print 'valid took %.2f sec' % (time.time() - t0_valid)
                # end of validation
            if debug:
                break
        if estop:
            break
        if debug:
            break

        # end for loop over minibatches
        print 'This epoch has seen %d samples, train cost %.2f' % (
            n_samples, numpy.mean(train_costs))
    # end for loop over epochs
    print 'Optimization ended.'
    if best_p is not None:
        utils.zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid_err = 0
    test_err = 0
    if not debug:
        #if valid:
        valid_err, valid_perp = model.pred_probs(
            engine, 'valid', f_log_probs, verbose=model_options['verbose'])
        #if test:
        #test_err, test_perp = self.pred_probs(
        #    'test', f_log_probs,
        #    verbose=model_options['verbose'])


    print 'stopped at epoch %d, minibatch %d, '\
      'current Train %.2f, current Valid %.2f, current Test %.2f '%(
          eidx,uidx,numpy.mean(train_err),numpy.mean(valid_err),numpy.mean(test_err))
    params = copy.copy(best_p)
    numpy.savez(save_model_dir + 'model_best.npz',
                train_err=train_err,
                valid_err=valid_err,
                test_err=test_err,
                history_errs=history_errs,
                **params)

    if history_errs != []:
        history = numpy.asarray(history_errs)
        best_valid_idx = history[:, 6].argmin()
        numpy.savetxt(save_model_dir + 'train_valid_test.txt',
                      history,
                      fmt='%.4f')
        print 'final best exp ', history[best_valid_idx]

    return train_err, valid_err, test_err
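
Since this training entry point is driven entirely by keyword arguments, a typical call just overrides a handful of them; the output directory below is a placeholder, not a value from the original experiments:

train_err, valid_err, test_err = train(
    dataset='youtube2text',
    video_feature='googlenet',
    save_model_dir='/path/to/experiment/',   # hypothetical output directory
    batch_size=64,
    max_epochs=500,
    reload_=False,
    debug=False)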
Example #18
def load_model(path_to_model=None,
               path_to_model_options=None,
               path_to_dictionaries=None,
               verbose=True):
    """
    Load all model components
    """
    if verbose:
        print path_to_model

    if path_to_model_options is None:
        path_to_model_options = "%s.pkl" % path_to_model

    # Load model options
    if verbose:
        print 'Loading model options...'

    with open(path_to_model_options, 'rb') as f:
        options = pkl.load(f)

    langs = options['langs']
    worddicts = []
    iworddicts = []
    if verbose:
        print 'Loading dictionaries and creating inverted dictionaries...'

    path_to_dictionary = []
    if path_to_dictionaries is None:
        for lang in langs:
            path_to_dictionary.append("%s.dictionary-%s.pkl" %
                                      (path_to_model, lang))
    else:
        path_to_dictionary = path_to_dictionaries

    for lang_idx, lang in enumerate(langs):
        # Load the worddict
        with open(path_to_dictionary[lang_idx], 'rb') as f:
            worddict_lang = pkl.load(f)

        # Create inverted dictionaries
        word_idict_lang = dict()
        for kk, vv in worddict_lang.iteritems():
            word_idict_lang[vv] = kk
        word_idict_lang[0] = '<eos>'
        word_idict_lang[1] = 'UNK'

        worddicts.append(worddict_lang)
        iworddicts.append(word_idict_lang)

    # Load parameters
    if verbose:
        print 'Loading model parameters...'
    params = init_params(options)

    params = load_params(path_to_model, params)
    tparams = init_tparams(params)

    # Extractor functions
    if verbose:
        print 'Compiling sentence encoders...'
    trng = RandomStreams(1234)
    trng, in_outs = build_sentence_encoders(tparams, options)
    #trng, [xs, xmasks, sentences_all] = build_sentence_encoders(tparams, options)
    f_senc_all = []
    for i, lang in enumerate(langs):
        #x, xmask, sents = xs[i], xmasks[i], sentences_all[i]
        [x, xmask], sents = in_outs[i]
        f_senc_lang = theano.function([x, xmask],
                                      sents,
                                      name='f_senc_%s' % lang)
        f_senc_all.append(f_senc_lang)

    if verbose:
        print 'Compiling image encoder...'
    trng, [im], images = build_image_encoder(tparams, options)
    f_ienc = theano.function([im], images, name='f_ienc')

    # Store everything we need in a dictionary
    if verbose:
        print 'Packing up...'
    model = {}
    model['options'] = options
    model['f_ienc'] = f_ienc
    # TODO fix this tparams/params everywhere else in the rest of the code
    model['tparams'] = params

    for lang_idx, lang in enumerate(langs):
        model['worddict_%s' % lang] = worddicts[lang_idx]
        model['iworddict_%s' % lang] = iworddicts[lang_idx]
        model['f_senc_%s' % lang] = f_senc_all[lang_idx]
    return model
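
An illustrative call against the multilingual model dictionary; the path, the index dtype, and the dummy two-token input are assumptions, chosen so the snippet exercises one sentence encoder per language:

import numpy

model = load_model(path_to_model='/path/to/model.npz')   # hypothetical path
langs = model['options']['langs']

# Encode a two-token dummy sentence with each language's encoder; index 1 is
# 'UNK' in the dictionaries built above, so no real corpus is needed.
x = numpy.ones((2, 1), dtype='int64')
x_mask = numpy.ones((2, 1), dtype='float32')
for lang in langs:
    vec = model['f_senc_%s' % lang](x, x_mask)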
Example #19
def train(
        dim_word_desc=400,  # word vector dimensionality
        dim_word_q=400,
        dim_word_ans=600,
        dim_proj=300,
        dim=400,  # the number of LSTM units
        encoder_desc='lstm',
        encoder_desc_word='lstm',
        encoder_desc_sent='lstm',
        use_dq_sims=False,
        eyem=None,
        learn_h0=False,
        use_desc_skip_c_g=False,
        debug=False,
        encoder_q='lstm',
        patience=10,
        max_epochs=5000,
        dispFreq=100,
        decay_c=0.,
        alpha_c=0.,
        clip_c=-1.,
        lrate=0.01,
        n_words_q=49145,
        n_words_desc=115425,
        n_words_ans=409,
        pkl_train_files=None,
        pkl_valid_files=None,
        maxlen=2000,  # maximum length of the description
        optimizer='rmsprop',
        batch_size=2,
        vocab=None,
        valid_batch_size=16,
        use_elu_g=False,
        saveto='model.npz',
        model_dir=None,
        ms_nlayers=3,
        validFreq=1000,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        datasets=[None],
        truncate=400,
        momentum=0.9,
        use_bidir=False,
        cost_mask=None,
        valid_datasets=[
            '/u/yyu/stor/caglar/rc-data/cnn/cnn_test_data.h5',
            '/u/yyu/stor/caglar/rc-data/cnn/cnn_valid_data.h5'
        ],
        dropout_rate=0.5,
        use_dropout=True,
        reload_=True,
        **opt_ds):

    ensure_dir_exists(model_dir)
    mpath = os.path.join(model_dir, saveto)
    mpath_best = os.path.join(model_dir, prfx("best", saveto))
    mpath_last = os.path.join(model_dir, prfx("last", saveto))
    mpath_stats = os.path.join(model_dir, prfx("stats", saveto))

    # Model options
    model_options = locals().copy()
    model_options['use_sent_reps'] = opt_ds['use_sent_reps']
    stats = defaultdict(list)

    del model_options['eyem']
    del model_options['cost_mask']

    if cost_mask is not None:
        cost_mask = sharedX(cost_mask)

    # reload options and parameters
    if reload_:
        print "Reloading the model."
        if os.path.exists(mpath_best):
            print "Reloading the best model from %s." % mpath_best
            with open(os.path.join(mpath_best, '%s.pkl' % mpath_best),
                      'rb') as f:
                models_options = pkl.load(f)
            params = init_params(model_options)
            params = load_params(mpath_best, params)
        elif os.path.exists(mpath):
            print "Reloading the model from %s." % mpath
            with open(os.path.join(mpath, '%s.pkl' % mpath), 'rb') as f:
                models_options = pkl.load(f)
            params = init_params(model_options)
            params = load_params(mpath, params)
        else:
            raise IOError("Couldn't open the file.")
    else:
        print "Couldn't reload the models initializing from scratch."
        params = init_params(model_options)

    if datasets[0]:
        print "Short dataset", datasets[0]

    print 'Loading data'
    print 'Building model'
    if pkl_train_files is None or pkl_valid_files is None:
        train, valid, test = load_data(path=datasets[0],
                                       valid_path=valid_datasets[0],
                                       test_path=valid_datasets[1],
                                       batch_size=batch_size,
                                       **opt_ds)
    else:
        train, valid, test = load_pkl_data(train_file_paths=pkl_train_files,
                                           valid_file_paths=pkl_valid_files,
                                           batch_size=batch_size,
                                           vocab=vocab,
                                           eyem=eyem,
                                           **opt_ds)

    tparams = init_tparams(params)
    trng, use_noise, inps_d, \
                     opt_ret, \
                     cost, errors, ent_errors, ent_derrors, probs = \
                        build_model(tparams,
                                    model_options,
                                    prepare_data if not opt_ds['use_sent_reps'] \
                                            else prepare_data_sents,
                                    valid,
                                    cost_mask=cost_mask)

    alphas = opt_ret['dec_alphas']

    if opt_ds['use_sent_reps']:
        inps = [inps_d["desc"], \
                inps_d["word_mask"], \
                inps_d["q"], \
                inps_d['q_mask'], \
                inps_d['ans'], \
                inps_d['wlen'],
                inps_d['slen'], inps_d['qlen'],\
                inps_d['ent_mask']
                ]
    else:
        inps = [inps_d["desc"], \
                inps_d["word_mask"], \
                inps_d["q"], \
                inps_d['q_mask'], \
                inps_d['ans'], \
                inps_d['wlen'], \
                inps_d['qlen'], \
                inps_d['ent_mask']]

    outs = [cost, errors, probs, alphas]
    if ent_errors:
        outs += [ent_errors]

    if ent_derrors:
        outs += [ent_derrors]

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, outs, profile=profile)
    print 'Done'

    # Apply weight decay on the feed-forward connections
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.

        for kk, vv in tparams.iteritems():
            if "logit" in kk or "ff" in kk:
                weight_decay += (vv**2).sum()

        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer
    print 'Computing gradient...',
    grads = safe_grad(cost, itemlist(tparams))
    print 'Done'

    # Gradient clipping:
    if clip_c > 0.:
        g2 = get_norms(grads)
        for p, g in grads.iteritems():
            grads[p] = tensor.switch(g2 > (clip_c**2),
                                     (g / tensor.sqrt(g2 + 1e-8)) * clip_c, g)
    inps.pop()
    if optimizer.lower() == "adasecant":
        learning_rule = Adasecant(delta_clip=25.0,
                                  use_adagrad=True,
                                  grad_clip=0.25,
                                  gamma_clip=0.)
    elif optimizer.lower() == "rmsprop":
        learning_rule = RMSPropMomentum(init_momentum=momentum)
    elif optimizer.lower() == "adam":
        learning_rule = Adam()
    elif optimizer.lower() == "adadelta":
        learning_rule = AdaDelta()

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    learning_rule = None  # overrides the rule selected above, so eval(optimizer) below is always used

    if learning_rule:
        f_grad_shared, f_update = learning_rule.get_funcs(learning_rate=lr,
                                                          grads=grads,
                                                          inp=inps,
                                                          cost=cost,
                                                          errors=errors)
    else:
        f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps,
                                                  cost, errors)

    print 'Done'
    print 'Optimization'
    history_errs = []
    # reload history
    if reload_ and os.path.exists(mpath):
        history_errs = list(numpy.load(mpath)['history_errs'])

    best_p = None
    bad_count = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size

    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    best_found = False
    uidx = 0
    estop = False

    train_cost_ave, train_err_ave, \
            train_gnorm_ave = reset_train_vals()

    for eidx in xrange(max_epochs):
        n_samples = 0

        if train.done:
            train.reset()

        for d_, q_, a, em in train:
            n_samples += len(a)
            uidx += 1
            use_noise.set_value(1.)

            if opt_ds['use_sent_reps']:
                # To mask the description and the question.
                d, d_mask, q, q_mask, dlen, slen, qlen = prepare_data_sents(
                    d_, q_)

                if d is None:
                    print 'Minibatch with zero sample under length ', maxlen
                    uidx -= 1
                    continue

                ud_start = time.time()
                cost, errors, gnorm, pnorm = f_grad_shared(
                    d, d_mask, q, q_mask, a, dlen, slen, qlen)
            else:
                d, d_mask, q, q_mask, dlen, qlen = prepare_data(d_, q_)

                if d is None:
                    print 'Minibatch with zero sample under length ', maxlen
                    uidx -= 1
                    continue

                ud_start = time.time()
                cost, errors, gnorm, pnorm = f_grad_shared(
                    d, d_mask, q, q_mask, a, dlen, qlen)

            upnorm = f_update(lrate)
            ud = time.time() - ud_start

            # Collect the running ave train stats.
            train_cost_ave = running_ave(train_cost_ave, cost)
            train_err_ave = running_ave(train_err_ave, errors)
            train_gnorm_ave = running_ave(train_gnorm_ave, gnorm)

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                import ipdb
                ipdb.set_trace()

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, ' Update ', uidx, \
                        ' Cost ', cost, ' UD ', ud, \
                        ' UpNorm ', upnorm[0].tolist(), \
                        ' GNorm ', gnorm, \
                        ' Pnorm ', pnorm, 'Terrors ', errors

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                if best_p is not None and best_found:
                    numpy.savez(mpath_best,
                                history_errs=history_errs,
                                **best_p)
                    pkl.dump(model_options, open('%s.pkl' % mpath_best, 'wb'))
                else:
                    params = unzip(tparams)

                numpy.savez(mpath, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % mpath, 'wb'))
                pkl.dump(stats, open("%s.pkl" % mpath_stats, 'wb'))

                print 'Done'
                print_param_norms(tparams)

            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                if valid.done:
                    valid.reset()

                valid_costs, valid_errs, valid_probs, \
                        valid_alphas, error_ent, error_dent = eval_model(f_log_probs,
                                                  prepare_data if not opt_ds['use_sent_reps'] \
                                                    else prepare_data_sents,
                                                  model_options,
                                                  valid,
                                                  use_sent_rep=opt_ds['use_sent_reps'])

                valid_alphas_ = numpy.concatenate(
                    [va.argmax(0) for va in valid_alphas.tolist()], axis=0)
                valid_err = valid_errs.mean()
                valid_cost = valid_costs.mean()
                valid_alpha_ent = -negentropy(valid_alphas)

                mean_valid_alphas = valid_alphas_.mean()
                std_valid_alphas = valid_alphas_.std()

                mean_valid_probs = valid_probs.argmax(1).mean()
                std_valid_probs = valid_probs.argmax(1).std()

                history_errs.append([valid_cost, valid_err])

                stats['train_err_ave'].append(train_err_ave)
                stats['train_cost_ave'].append(train_cost_ave)
                stats['train_gnorm_ave'].append(train_gnorm_ave)

                stats['valid_errs'].append(valid_err)
                stats['valid_costs'].append(valid_cost)
                stats['valid_err_ent'].append(error_ent)
                stats['valid_err_desc_ent'].append(error_dent)

                stats['valid_alphas_mean'].append(mean_valid_alphas)
                stats['valid_alphas_std'].append(std_valid_alphas)
                stats['valid_alphas_ent'].append(valid_alpha_ent)

                stats['valid_probs_mean'].append(mean_valid_probs)
                stats['valid_probs_std'].append(std_valid_probs)

                if uidx == 0 or valid_err <= numpy.array(
                        history_errs)[:, 1].min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                    best_found = True
                else:
                    best_found = False

                if numpy.isnan(valid_err):
                    import ipdb
                    ipdb.set_trace()

                print "============================"
                print '\t>>>Valid error: ', valid_err, \
                        ' Valid cost: ', valid_cost
                print '\t>>>Valid pred mean: ', mean_valid_probs, \
                        ' Valid pred std: ', std_valid_probs
                print '\t>>>Valid alphas mean: ', mean_valid_alphas, \
                        ' Valid alphas std: ', std_valid_alphas, \
                        ' Valid alpha negent: ', valid_alpha_ent, \
                        ' Valid error ent: ', error_ent, \
                        ' Valid error desc ent: ', error_dent

                print "============================"
                print "Running average train stats "
                print '\t>>>Train error: ', train_err_ave, \
                        ' Train cost: ', train_cost_ave, \
                        ' Train grad norm: ', train_gnorm_ave
                print "============================"


                train_cost_ave, train_err_ave, \
                    train_gnorm_ave = reset_train_vals()

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid.reset()
    valid_cost, valid_error, valid_probs, \
            valid_alphas, error_ent, error_dent = eval_model(f_log_probs,
                                      prepare_data if not opt_ds['use_sent_reps'] \
                                           else prepare_data_sents,
                                      model_options, valid,
                                      use_sent_rep=opt_ds['use_sent_reps'])

    print " Final eval resuts: "
    print 'Valid error: ', valid_error.mean()
    print 'Valid cost: ', valid_cost.mean()
    print '\t>>>Valid pred mean: ', valid_probs.mean(), \
            ' Valid pred std: ', valid_probs.std(), \
            ' Valid error ent: ', error_ent

    if best_p is None:
        best_p = unzip(tparams)

    params = copy.copy(best_p)

    numpy.savez(mpath_last,
                zipped_params=best_p,
                history_errs=history_errs,
                **params)

    return valid_error.mean(), valid_cost.mean()
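The loop above relies on running_ave and reset_train_vals helpers that are not included in this snippet. The stand-ins below are a minimal sketch only, assuming an exponential moving average with an arbitrary decay of 0.9; the original helpers may be implemented differently.

# Hypothetical stand-ins for the running-average bookkeeping used above.
def running_ave(prev_ave, new_value, decay=0.9):
    # Exponential moving average, bootstrapped from the first observation.
    if prev_ave is None:
        return new_value
    return decay * prev_ave + (1. - decay) * new_value


def reset_train_vals():
    # Clear the running train cost / error / gradient-norm averages.
    return None, None, None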
Esempio n. 20
0
def load_model(path_to_model):
    """
    Load all model components
    """
    print path_to_model

    # Load model
    print 'Loading model'
    with open(path_to_model + '.pkl', 'rb') as f:
        model = pkl.load(f)

    options = model['options']
    options['use_topic'] = True
    # Load parameters
    print 'Loading model parameters...'
    params = init_params(options)
    params = load_params(path_to_model + '.npz', params)
    tparams = init_tparams(params)

    # Extractor functions
    print 'Compiling sentence encoder...'
    [x, x_mask], sentences = build_sentence_encoder(tparams, options)
    f_senc = theano.function([x, x_mask], sentences, name='f_senc')

    #print 'Compiling sentence encoder with topics...'
    #[x, x_mask, topics], sentences = build_sentence_encoder_with_topicvector(tparams, options)
    #f_senc_t = theano.function([x, x_mask, topics], sentences, name='f_senc_t')

    print 'Compiling image encoder...'
    [im], images = build_image_encoder(tparams, options)
    f_ienc = theano.function([im], images, name='f_ienc')

    print 'Compiling topic encoder...'
    [t], topics = build_topic_encoder(tparams, options)
    f_tenc = theano.function([t], topics, name='f_tenc')
    '''
    print 'Compiling topic_vector1 encoder...'
    [t], topics = build_topic_vector1_encoder(tparams, options)
    f_tv1enc = theano.function([t], topics, name='f_tv1enc')

    print 'Compiling topic_vector2 encoder...'
    [t], topics = build_topic_encoder(tparams, options)
    f_tv2enc = theano.function([t], topics, name='f_tv2enc')
    '''
    print 'Compiling error computation...'
    [s, im], errs = build_errors(options)
    f_err = theano.function([s, im], errs, name='f_err')
    '''
    [s, im, t], errs_t1 = build_errors_3level(options)
    f_err_t1 = theano.function([s,im,t], errs_t1, name='f_err_t1')
    
    [s_t, im_t], errs_t2 = build_errors_t2(options)
    f_err_t2 = theano.function([s_t, im_t], errs_t2, name='f_err_t2')
    '''
    # Store everything we need in a dictionary
    print 'Packing up...'
    model['f_senc'] = f_senc
    #model['f_senc_t'] = f_senc_t
    model['f_ienc'] = f_ienc
    model['f_tenc'] = f_tenc
    #model['f_tv1enc'] = f_tv1enc
    #model['f_tv2enc'] = f_tv2enc
    model['f_err'] = f_err
    #model['f_err_t1'] = f_err_t1
    #model['f_err_t2'] = f_err_t2

    return model
Esempio n. 21
0
    def __init__(self, num_hidden, num_features, mb_size,
                 hidden_state_features, target):
        self.mb_size = mb_size
        #self.seq_length = seq_length

        # the dropout rate argument is 1.0 here (an earlier setting used 0.8)
        hidden_state_features = dropout(hidden_state_features, 1.0)

        gru_params_1 = init_tparams(
            param_init_gru(None, {},
                           prefix="gru1",
                           dim=num_hidden,
                           nin=num_features))
        gru_params_2 = init_tparams(
            param_init_gru(None, {},
                           prefix="gru2",
                           dim=num_hidden,
                           nin=num_hidden + num_features))

        gru_1_out = gru_layer(gru_params_1,
                              hidden_state_features,
                              None,
                              prefix='gru1',
                              gradient_steps=100)[0]
        gru_2_out = gru_layer(gru_params_2,
                              T.concatenate([gru_1_out, hidden_state_features],
                                            axis=2),
                              None,
                              prefix='gru2',
                              backwards=True,
                              gradient_steps=100)[0]

        self.gru_1_out = gru_1_out

        final_out_recc = T.mean(gru_2_out, axis=0)

        h_out_1 = DenseLayer((mb_size * 2, num_hidden),
                             num_units=num_hidden,
                             nonlinearity=lasagne.nonlinearities.rectify)
        h_out_2 = DenseLayer((mb_size * 2, num_hidden),
                             num_units=num_hidden,
                             nonlinearity=lasagne.nonlinearities.rectify)
        h_out_4 = DenseLayer((mb_size * 2, num_hidden),
                             num_units=1,
                             nonlinearity=None)

        h_out_1_value = dropout(h_out_1.get_output_for(final_out_recc), 1.0)
        h_out_2_value = dropout(h_out_2.get_output_for(h_out_1_value), 1.0)
        h_out_4_value = h_out_4.get_output_for(h_out_2_value)

        raw_y = T.clip(h_out_4_value, -10.0, 10.0)

        classification = T.nnet.sigmoid(raw_y)

        self.accuracy = T.mean(
            T.eq(target,
                 T.gt(classification, 0.5).flatten()))

        p_real = classification[0:mb_size]
        p_gen = classification[mb_size:mb_size * 2]

        self.d_cost_real = bce(p_real, T.ones(p_real.shape)).mean()
        self.d_cost_gen = bce(p_gen, T.zeros(p_gen.shape)).mean()

        self.g_cost_real = bce(p_real, T.zeros(p_real.shape)).mean()
        self.g_cost_gen = bce(p_gen, T.ones(p_gen.shape)).mean()

        #self.g_cost = self.g_cost_gen
        self.g_cost = self.g_cost_real + self.g_cost_gen

        print "pulling both TF and PF togeher"

        self.d_cost = self.d_cost_real + self.d_cost_gen
        #if d_cost < 1.0, use g cost.

        self.d_cost = T.switch(
            T.gt(self.accuracy, 0.95) * T.gt(p_real.mean(), 0.99) *
            T.lt(p_gen.mean(), 0.01), 0.0, self.d_cost)
        '''
        gX = gen(Z, *gen_params)

        p_real = discrim(X, *discrim_params)
        p_gen = discrim(gX, *discrim_params)

        d_cost_real = bce(p_real, T.ones(p_real.shape)).mean()
        d_cost_gen = bce(p_gen, T.zeros(p_gen.shape)).mean()
        g_cost_d = bce(p_gen, T.ones(p_gen.shape)).mean()

        d_cost = d_cost_real + d_cost_gen
        g_cost = g_cost_d

        cost = [g_cost, d_cost, g_cost_d, d_cost_real, d_cost_gen]
        d_updates = d_updater(discrim_params, d_cost)
        g_updates = g_updater(gen_params, g_cost)

        '''

        self.classification = classification

        self.params = []
        self.params += lasagne.layers.get_all_params(h_out_4, trainable=True)
        self.params += lasagne.layers.get_all_params(h_out_1, trainable=True)
        self.params += lasagne.layers.get_all_params(h_out_2, trainable=True)

        #self.params += h_out_1.getParams() + h_out_2.getParams() + h_out_3.getParams()

        #        self.params += lasagne.layers.get_all_params(h_initial_1,trainable=True)
        #        self.params += lasagne.layers.get_all_params(h_initial_2,trainable=True)

        self.params += gru_params_1.values()
        self.params += gru_params_2.values()
        '''
        layerParams = c1.getParams()
        for paramKey in layerParams:
            self.params += [layerParams[paramKey]]
        layerParams = c2.getParams()
        for paramKey in layerParams:
            self.params += [layerParams[paramKey]]
        layerParams = c3.getParams()
        for paramKey in layerParams:
            self.params += [layerParams[paramKey]]

        '''

        #all_grads = T.grad(self.loss, self.params)
        #for j in range(0, len(all_grads)):
        #    all_grads[j] = T.switch(T.isnan(all_grads[j]), T.zeros_like(all_grads[j]), all_grads[j])
        #self.updates = lasagne.updates.adam(all_grads, self.params, learning_rate = 0.0001, beta1 = 0.5)
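The discriminator and generator costs above use a bce helper that is not shown in this snippet; in practice it may simply alias T.nnet.binary_crossentropy. The sketch below is a minimal assumed version of such a helper (the eps clamp is an added assumption for numerical stability, not taken from the original code).

import theano.tensor as T


def bce(predictions, targets, eps=1e-7):
    # Element-wise binary cross-entropy; callers above take .mean() themselves.
    predictions = T.clip(predictions, eps, 1.0 - eps)
    return -(targets * T.log(predictions) +
             (1.0 - targets) * T.log(1.0 - predictions))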
Esempio n. 22
0
            mask = mask.reshape((mask.shape[0], \
                                 mask.shape[1]*mask.shape[2])).dimshuffle(0, 1, 'x')
        elif mask.ndim == 2:
            mask = mask.dimshuffle(0, 1, 'x')

        rval, updates = theano.scan(_step,
                                    sequences=[mask, lstm_state_below],
                                    outputs_info=[init_state, init_memory],
                                    name=prfx(prefix, '_layers'),
                                    non_sequences=non_seqs,
                                    strict=True,
                                    n_steps=nsteps)
    return rval


if __name__ == "__main__":
    print "setting up gru"

    params = {}
    gru_params = param_init_gru({},
                                params=params,
                                prefix='gru',
                                nin=1024,
                                dim=1024)

    tparams = init_tparams(params)

    import numpy as np

    #h_out = gru_layer(tparams,state_below,options={},prefix='gru',mask=None,one_step=True,init_state=None,backwards=False)
Esempio n. 23
0
def main(model_path, data_base_path, option_path, saveto, k):

    # load model_options
    with io.open(option_path, encoding='utf8') as f:
        config = json.load(f)

    model_options = config['model']
    test_data_options = config['testdata']
    label_vocab_size = test_data_options['n_labels']
    assert 'reverse_labels' in config['data']
    reverse_labels = config['data']['reverse_labels']

    def join_data_base_path(data_base, options):
        for kk, vv in six.iteritems(options):
            if kk in ['src', 'trg', 'input_vocab', 'label_vocab']:
                options[kk] = os.path.join(data_base, options[kk])

        return options

    test_data_options = join_data_base_path(data_base_path, test_data_options)
    dicts_r, test_stream = load_test_data(**test_data_options)

    word_vocab = load_dict(test_data_options['input_vocab'])
    iword_vocab = dict((vv, kk) for kk, vv in six.iteritems(word_vocab))
    label_vocab = load_dict(test_data_options['label_vocab'],
                            dict_size=label_vocab_size,
                            include_unk=False,
                            reverse=reverse_labels)
    ilabel_vocab = dict((vv, kk) for kk, vv in six.iteritems(label_vocab))

    model_options['n_labels'] = len(label_vocab)

    LOGGER.info('Building model')
    params = init_params(model_options)

    LOGGER.info('Loading parameters from {}'.format(model_path))
    params = load_params(model_path, params)

    LOGGER.info('Initializing parameters')
    tparams = init_tparams(params)

    # use_noise is for dropout
    use_noise = theano.shared(numpy.float32(0.))
    trng = MRG_RandomStreams(1234)

    n_samples = 0

    LOGGER.info('Building sampler')
    f_sample_inits, f_sample_nexts \
        = build_sampler(tparams, model_options, trng, use_noise)

    results = dict()
    results['input_vocab'] = iword_vocab
    results['label_vocab'] = ilabel_vocab
    results['src'] = dict()
    results['predictions'] = dict()
    results['targets'] = dict()
    results['alignments'] = dict()

    for x, x_mask, y, y_mask in test_stream.get_epoch_iterator():
        orig_x = x
        if model_options['label_type'] == 'binary':
            y, y_mask = mul2bin(y, y_mask, model_options['n_bins'])

        x, x_mask = x.T, x_mask.T

        if model_options['enc_dir'] == 'none':
            x_mask[(x == 0) | (x == 1)] = 0.

        for jj in xrange(x.shape[1]):
            sample_encoder_inps = [x[:, jj][:, None], x_mask[:, jj][:, None]]

            solutions = gen_sample(tparams,
                                   f_sample_inits,
                                   f_sample_nexts,
                                   sample_encoder_inps,
                                   model_options,
                                   trng=trng,
                                   k=k,
                                   max_label_len=50,
                                   argmax=False)

            samples = solutions['samples']
            alignment = solutions['alignments']
            scores = solutions['scores']

            scores = scores / numpy.array([len(s) for s in samples])
            best_sample = samples[scores.argmin()]
            best_alignment = alignment[scores.argmin()]

            results['src'][n_samples + jj] = orig_x[jj]
            results['predictions'][n_samples + jj] = best_sample
            results['alignments'][n_samples + jj] = numpy.array(best_alignment)
            results['targets'][n_samples + jj] = y[jj, y_mask[jj] == 1]

        n_samples += x.shape[1]
        LOGGER.info('Number of processed instances: {}'.format(n_samples))

    LOGGER.info(
        'Making predictions successfully on {} instances'.format(n_samples))
    savez(results, saveto, 2)
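The best hypothesis above is selected after dividing each beam score (a total negative log-probability) by the hypothesis length, so longer samples are not penalized merely for having more tokens. A toy numpy illustration with made-up values:

import numpy

samples = [[5, 9, 2, 0], [7, 0]]      # invented label sequences
scores = numpy.array([4.0, 2.5])      # invented total negative log-probs

normalized = scores / numpy.array([len(s) for s in samples])
print normalized                      # [ 1.    1.25]
print samples[normalized.argmin()]    # the longer sequence wins here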
Esempio n. 24
0
def train(dim_word=100,  # word vector dimensionality
          dim=1000,  # the number of GRU units
          encoder='gru',
          patience=10,  # early stopping patience
          max_epochs=5000,
          finish_after=10000000,  # finish after this many updates
          dispFreq=100,
          decay_c=0.,  # L2 weight decay penalty
          lrate=0.01,
          n_words=100000,  # vocabulary size
          maxlen=100,  # maximum length of the description
          optimizer='rmsprop',
          batch_size=16,
          valid_batch_size=16,
          saveto='model.npz',
          validFreq=1000,
          saveFreq=1000,  # save the parameters after every saveFreq updates
          sampleFreq=100,  # generate some samples after every sampleFreq
          dataset='/data/lisatmp4/anirudhg/wiki.tok.txt.gz',
          valid_dataset='/data/lisatmp4/anirudhg/newstest2011.en.tok',
          dictionary='/data/lisatmp4/anirudhg/wiki.tok.txt.gz.pkl',
          use_dropout=False,
          reload_=False):

    # Model options
    model_options = locals().copy()

    # load dictionary
    with open(dictionary, 'rb') as f:
        worddicts = pkl.load(f)

    # invert dictionary
    worddicts_r = dict()
    for kk, vv in worddicts.iteritems():
        worddicts_r[vv] = kk

    # reload options
    if reload_ and os.path.exists(saveto):
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print 'Loading data'
    train = TextIterator(dataset,
                         dictionary,
                         n_words_source=n_words,
                         batch_size=batch_size,
                         maxlen=maxlen)
    valid = TextIterator(valid_dataset,
                         dictionary,
                         n_words_source=n_words,
                         batch_size=valid_batch_size,
                         maxlen=maxlen)

    print 'Building model'
    params = init_params(model_options)

    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    # create shared variables for parameters
    tparams = init_tparams(params)

    # build the symbolic computational graph
    trng, use_noise, \
        x, x_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask]

    print 'Building sampler'
    f_next = build_sampler(tparams, model_options, trng)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = getattr(optimizers, optimizer)(lr, tparams,
                                                             grads, inps, cost)

    print 'Done'

    print 'Optimization'

    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = list(numpy.load(saveto)['history_errs'])
    best_p = None
    bad_count = 0

    if validFreq == -1:
        validFreq = len(train[0])/batch_size
    if saveFreq == -1:
        saveFreq = len(train[0])/batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0])/batch_size

    # Training loop
    uidx = 0
    estop = False
    bad_counter = 0
    for eidx in xrange(max_epochs):
        n_samples = 0

        for x in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            # pad batch and create mask
            x, x_mask = prepare_data(x, maxlen=maxlen, n_words=n_words)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask)

            # do the update on parameters
            f_update(lrate)

            ud = time.time() - ud_start

            # check for bad numbers
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            # save the best model so far
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',

                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

            # generate some samples with the model and display them
            if numpy.mod(uidx, sampleFreq) == 0:
                # FIXME: random selection?
                for jj in xrange(5):
                    sample, score = gen_sample(tparams, f_next,
                                               model_options, trng=trng,
                                               maxlen=30, argmax=False)
                    print 'Sample ', jj, ': ',
                    ss = sample
                    for vv in ss:
                        if vv == 0:
                            break
                        if vv in worddicts_r:
                            print worddicts_r[vv],
                        else:
                            print 'UNK',
                    print

            # validate model on validation set and early stop if necessary
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs = pred_probs(f_log_probs, prepare_data,
                                        model_options, valid)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= \
                        numpy.array(history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if numpy.isnan(valid_err):
                    ipdb.set_trace()

                print 'Valid ', valid_err

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid_err = pred_probs(f_log_probs, prepare_data,
                           model_options, valid).mean()

    print 'Valid ', valid_err

    params = copy.copy(best_p)
    numpy.savez(saveto, zipped_params=best_p,
                history_errs=history_errs,
                **params)

    return valid_err
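The early-stopping rule in the validation block above only starts counting once more than patience validation errors have been recorded, and it increments bad_counter whenever the newest error fails to beat the best error observed at least patience validations earlier. A self-contained illustration with invented error values (patience is set to 2 here just for brevity):

import numpy

patience, bad_counter, estop = 2, 0, False
history_errs = []
for valid_err in [0.9, 0.5, 0.8, 0.8, 0.8, 0.8, 0.8]:
    history_errs.append(valid_err)
    if len(history_errs) > patience and valid_err >= \
            numpy.array(history_errs)[:-patience].min():
        bad_counter += 1
        if bad_counter > patience:
            estop = True
            break
print bad_counter, estop   # 3 True: stopped after repeatedly failing to beat 0.5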
Esempio n. 25
0
def train(worker, model_options, data_options,
          patience,  # early stopping patience
          max_epochs,
          finish_after,  # finish after this many updates
          decay_c,  # L2 regularization penalty
          alpha_c,  # alignment regularization
          clip_c,  # gradient clipping threshold
          lrate,  # learning rate
          optimizer,
          saveto,
          valid_freq,
          train_len,
          valid_sync,
          save_freq,   # save the parameters after every saveFreq updates
          sample_freq,   # generate some samples after every sampleFreq
          control_port,
          batch_port,
          log_port,
          reload_):

    LOGGER.info('Connecting to data socket ({}) and loading validation data'
                .format(batch_port))
    worker.init_mb_sock(batch_port)
    _, _, valid_stream = load_data(**data_options)

    LOGGER.info('Building model')
    params = init_params(model_options)
    # reload parameters
    experiment_id = worker.send_req('experiment_id')
    model_filename = '{}.model.npz'.format(experiment_id)
    saveto_filename = '{}.npz'.format(saveto)
    if reload_ and os.path.exists(saveto_filename):
        LOGGER.info('Loading parameters from {}'.format(saveto_filename))
        params = load_params(saveto_filename, params)

    LOGGER.info('Initializing parameters')
    tparams = init_tparams(params)
    alpha = worker.send_req('alpha')
    worker.init_shared_params(tparams.values(), param_sync_rule=EASGD(alpha))

    # use_noise is for dropout
    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    LOGGER.info('Building sampler')
    f_init, f_next = build_sampler(tparams, model_options, trng)

    # before any regularizer
    LOGGER.info('Building f_log_probs')
    f_log_probs = theano.function(inps, cost, profile=False)

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in six.iteritems(tparams):
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * ((tensor.cast(
            y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
            opt_ret['dec_alphas'].sum(0)) ** 2).sum(1).mean()
        cost += alpha_reg

    # Not used?
    # after all regularizers - compile the computational graph for cost
    # LOGGER.info('Building f_cost')
    # f_cost = theano.function(inps, cost, profile=False)

    LOGGER.info('Computing gradient')
    grads = tensor.grad(cost, wrt=itemlist(tparams))

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g ** 2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (clip_c ** 2), g / tensor.sqrt(
                g2) * clip_c, g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    LOGGER.info('Building optimizers')
    f_grad_shared, f_update = getattr(optimizers, optimizer)(lr, tparams,
                                                             grads, inps, cost)

    LOGGER.info('Optimization')

    log = RemoteLogger(port=log_port)
    train_start = time.clock()
    best_p = None

    # Making sure that the worker start training with the most recent params
    worker.copy_to_local()

    uidx = 0
    while True:
        step = worker.send_req('next')
        LOGGER.debug('Received command: {}'.format(step))
        if step == 'train':
            use_noise.set_value(1.)
            for i in xrange(train_len):
                x, x_mask, y, y_mask = worker.recv_mb()

                uidx += 1
                log_entry = {'iteration': uidx}

                # compute cost, grads and copy grads to shared variables
                update_start = time.clock()
                cost = f_grad_shared(x, x_mask, y, y_mask)
                f_update(lrate)

                log_entry['cost'] = float(cost)
                log_entry['average_source_length'] = \
                    float(x_mask.sum(0).mean())
                log_entry['average_target_length'] = \
                    float(y_mask.sum(0).mean())
                log_entry['update_time'] = time.clock() - update_start
                log_entry['train_time'] = time.clock() - train_start
                log_entry['time'] = time.time()
                log.log(log_entry)

            step = worker.send_req({'done': train_len})
            LOGGER.debug("Syncing with global params")
            worker.sync_params(synchronous=True)

        if step == 'valid':
            if valid_sync:
                worker.copy_to_local()
            use_noise.set_value(0.)
            valid_errs = pred_probs(f_log_probs, model_options, valid_stream)
            valid_err = float(valid_errs.mean())
            res = worker.send_req({'valid_err': valid_err})
            log.log({'validation_cost': valid_err,
                     'train_time': time.clock() - train_start,
                     'time': time.time()})

            if res == 'best' and saveto:
                best_p = unzip(tparams)
                save_params(best_p, model_filename, saveto_filename)

            if valid_sync:
                worker.copy_to_local()

        if step == 'stop':
            break

    # Release all shared resources.
    worker.close()
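The gradient clipping block in the example above rescales every gradient by clip_c / sqrt(g2) whenever the squared global norm g2 exceeds clip_c ** 2, so the clipped global norm equals clip_c exactly. A small numpy check with made-up gradient values:

import numpy

clip_c = 1.0
grads = [numpy.array([3.0, 0.0]), numpy.array([0.0, 4.0])]   # invented gradients

g2 = sum((g ** 2).sum() for g in grads)           # 25.0 > clip_c ** 2
if g2 > clip_c ** 2:
    grads = [g / numpy.sqrt(g2) * clip_c for g in grads]

print sum((g ** 2).sum() for g in grads)          # 1.0, i.e. clip_c ** 2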
Esempio n. 26
0
def train(
        dim_word=100,  # word vector dimensionality
        dim=1000,  # the number of GRU units
        encoder='gru',
        patience=10,  # early stopping patience
        max_epochs=5000,
        finish_after=10000000,  # finish after this many updates
        dispFreq=100,
        decay_c=0.,  # L2 weight decay penalty
        lrate=0.01,
        n_words=100000,  # vocabulary size
        maxlen=100,  # maximum length of the description
        optimizer='rmsprop',
        batch_size=16,
        valid_batch_size=16,
        saveto='model.npz',
        validFreq=1000,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        sampleFreq=100,  # generate some samples after every sampleFreq
        dataset='/data/lisatmp4/anirudhg/wiki.tok.txt.gz',
        valid_dataset='/data/lisatmp4/anirudhg/newstest2011.en.tok',
        dictionary='/data/lisatmp4/anirudhg/wiki.tok.txt.gz.pkl',
        use_dropout=False,
        reload_=False):

    # Model options
    model_options = locals().copy()

    # load dictionary
    with open(dictionary, 'rb') as f:
        worddicts = pkl.load(f)

    # invert dictionary
    worddicts_r = dict()
    for kk, vv in worddicts.iteritems():
        worddicts_r[vv] = kk

    # reload options
    if reload_ and os.path.exists(saveto):
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print 'Loading data'
    train = TextIterator(dataset,
                         dictionary,
                         n_words_source=n_words,
                         batch_size=batch_size,
                         maxlen=maxlen)
    valid = TextIterator(valid_dataset,
                         dictionary,
                         n_words_source=n_words,
                         batch_size=valid_batch_size,
                         maxlen=maxlen)

    print 'Building model'
    params = init_params(model_options)

    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    # create shared variables for parameters
    tparams = init_tparams(params)

    # build the symbolic computational graph
    trng, use_noise, \
        x, x_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask]

    print 'Building sampler'
    f_next = build_sampler(tparams, model_options, trng)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = getattr(optimizers, optimizer)(lr, tparams,
                                                             grads, inps, cost)

    print 'Done'

    print 'Optimization'

    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = list(numpy.load(saveto)['history_errs'])
    best_p = None
    bad_count = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0]) / batch_size

    f = open('workfile', 'w')
    # Training loop
    # generate some samples with the model and display them
    for jj in xrange(5000):
        sample, score = gen_sample(tparams,
                                   f_next,
                                   model_options,
                                   trng=trng,
                                   maxlen=30,
                                   argmax=False)
        print 'Sample ', jj, ': ',
        ss = sample
        for vv in ss:
            if vv == 0:
                break
            if vv in worddicts_r:
                print worddicts_r[vv],
                f.write(worddicts_r[vv])
                f.write(' ')
            else:
                print 'UNK',
                f.write('UNK')
                f.write(' ')

        print '\n'
        f.write('\n')

    f.close()
Esempio n. 27
0
def train(
        experiment_id,
        model_options,
        data_options,
        validation_options,
        patience,  # early stopping patience
        max_epochs,
        finish_after,  # finish after this many updates
        decay_c,  # L2 regularization penalty
        alpha_c,  # alignment regularization
        clip_c,  # gradient clipping threshold
        lrate,  # learning rate
        optimizer,
        saveto,
        valid_freq,
        eval_intv,  # time interval for evaluation in minutes
        save_freq,  # save the parameters after every saveFreq updates
        sample_freq,  # generate some samples after every sampleFreq
        reload_=False):

    worddicts_r, train_stream, valid_stream = load_data(**data_options)

    LOGGER.info('Building model')
    params = init_params(model_options)
    # reload parameters
    model_filename = '{}.model.npz'.format(experiment_id)
    model_option_filename = '{}.config.json'.format(experiment_id)
    saveto_filename = '{}.npz'.format(saveto)
    if reload_ and os.path.exists(saveto_filename):
        LOGGER.info('Loading parameters from {}'.format(saveto_filename))
        params = load_params(saveto_filename, params)

    LOGGER.info('Initializing parameters')
    tparams = init_tparams(params)

    # use_noise is for dropout
    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    LOGGER.info('Building sampler')
    f_init, f_next = build_sampler(tparams, model_options, trng)

    # before any regularizer
    LOGGER.info('Building f_log_probs')
    f_log_probs = theano.function(inps, cost, profile=False)

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in six.iteritems(tparams):
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
        cost += alpha_reg

    # Not used?
    # after all regularizers - compile the computational graph for cost
    # LOGGER.info('Building f_cost')
    # f_cost = theano.function(inps, cost, profile=False)

    LOGGER.info('Computing gradient')
    grads = tensor.grad(cost, wrt=itemlist(tparams))

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c,
                              g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    LOGGER.info('Building optimizers')
    f_grad_shared, f_update = getattr(optimizers, optimizer)(lr, tparams,
                                                             grads, inps, cost)

    LOGGER.info('Optimization')

    log = Logger(filename='{}.log.jsonl.gz'.format(experiment_id))

    # evaluation score will be stored into the following queue
    valid_ret_queue = Queue.Queue()
    process_queue = Queue.Queue()

    rt = prepare_validation_timer(tparams, process_queue, model_filename,
                                  model_option_filename, eval_intv,
                                  valid_ret_queue, **validation_options)
    rt.start()

    def _timer_signal_handler(signum, frame):
        LOGGER.info('Received SIGINT')
        LOGGER.info('Now attempting to stop the timer')
        rt.stop()

        LOGGER.info('Please wait for terminating all child processes')
        while not process_queue.empty():
            proc = process_queue.get()
            if proc.poll() is None:  # check if the process has terminated
                # child process is still working
                # LOGGER.info('Attempt to kill', proc.pid)
                # terminate it by sending an interrupt signal
                proc.send_signal(signal.SIGINT)

                # wait for child process while avoiding deadlock
                # ignore outputs
                proc.communicate()

        sys.exit(130)

    signal.signal(signal.SIGINT, _timer_signal_handler)

    train_start = time.clock()
    best_p = None
    best_score = 0
    bad_counter = 0

    uidx = 0
    estop = False

    for eidx in xrange(max_epochs):
        n_samples = 0

        for x, x_mask, y, y_mask in train_stream.get_epoch_iterator():
            n_samples += len(x)
            x, x_mask, y, y_mask = x.T, x_mask.T, y.T, y_mask.T

            use_noise.set_value(1.)

            uidx += 1
            log_entry = {'iteration': uidx, 'epoch': eidx}

            # compute cost, grads and copy grads to shared variables
            update_start = time.clock()
            cost = f_grad_shared(x, x_mask, y, y_mask)
            f_update(lrate)

            log_entry['cost'] = float(cost)
            log_entry['average_source_length'] = float(x_mask.sum(0).mean())
            log_entry['average_target_length'] = float(y_mask.sum(0).mean())
            log_entry['update_time'] = time.clock() - update_start
            log_entry['train_time'] = time.clock() - train_start

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if not numpy.isfinite(cost):
                LOGGER.error('NaN detected')
                return 1., 1., 1.

            # save the best model so far
            if numpy.mod(uidx, save_freq) == 0:
                LOGGER.info('Saving best model so far')

                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)

                # save params to exp_id.npz and symlink model.npz to it
                save_params(params, model_filename, saveto_filename)

            # generate some samples with the model and display them
            if numpy.mod(uidx, sample_freq) == 0:
                # FIXME: random selection?
                log_entry['samples'] = []
                for jj in xrange(numpy.minimum(5, x.shape[1])):
                    log_entry['samples'].append({
                        'source': '',
                        'truth': '',
                        'sample': ''
                    })
                    stochastic = True
                    sample, _, score = gen_sample(tparams,
                                                  f_init,
                                                  f_next,
                                                  x[:, jj][:, None],
                                                  model_options,
                                                  trng=trng,
                                                  k=1,
                                                  maxlen=30,
                                                  stochastic=stochastic,
                                                  argmax=False)
                    for vv in x[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[0]:
                            token = worddicts_r[0][vv]
                        else:
                            token = UNK_TOKEN
                        log_entry['samples'][-1]['source'] += token + ' '
                    for vv in y[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[1]:
                            token = worddicts_r[1][vv]
                        else:
                            token = UNK_TOKEN
                        log_entry['samples'][-1]['truth'] += token + ' '
                    if stochastic:
                        ss = sample
                    else:
                        score = score / numpy.array([len(s) for s in sample])
                        ss = sample[score.argmin()]
                    for vv in ss:
                        if vv == 0:
                            break
                        if vv in worddicts_r[1]:
                            token = worddicts_r[1][vv]
                        else:
                            token = UNK_TOKEN
                        log_entry['samples'][-1]['sample'] += token + ' '

            # validate model on validation set and early stop if necessary
            if numpy.mod(uidx, valid_freq) == 0:
                use_noise.set_value(0.)
                valid_errs = pred_probs(f_log_probs, model_options,
                                        valid_stream)
                valid_err = valid_errs.mean()
                log_entry['validation_cost'] = float(valid_err)

                if not numpy.isfinite(valid_err):
                    raise RuntimeError('NaN detected in validation error')

            # collect validation scores (e.g., BLEU) from the child thread
            if not valid_ret_queue.empty():

                (ret_model, scores) = valid_ret_queue.get()

                valid_bleu = scores[0]
                # LOGGER.info('BLEU on the validation set: %.2f' % valid_bleu)
                log_entry['validation_bleu'] = valid_bleu

                if valid_bleu > best_score:
                    best_p = ret_model
                    best_score = valid_bleu
                    bad_counter = 0
                else:
                    bad_counter += 1
                    if bad_counter > patience:
                        estop = True
                        break

            # finish after this many updates
            if uidx >= finish_after:
                LOGGER.info('Finishing after {} iterations'.format(uidx))
                estop = True
                break

            log.log(log_entry)

        LOGGER.info('Completed epoch, seen {} samples'.format(n_samples))

        if estop:
            log.log(log_entry)
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    LOGGER.info('Calculating validation cost')
    valid_err = pred_probs(f_log_probs, model_options, valid_stream).mean()

    if not best_p:
        best_p = unzip(tparams)

    params = copy.copy(best_p)
    save_params(params, model_filename, saveto_filename)

    rt.stop()

    return valid_err
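The alpha_c regularizer in the example above sums the attention weights that each source position receives across the whole target sequence and penalizes the squared gap between that coverage and the (floor-divided) target/source length ratio. A toy numpy rendering with invented shapes and values:

import numpy

alpha_c = 0.1
dec_alphas = numpy.full((4, 2, 3), 1.0 / 3)   # (target_len, batch, source_len)
y_len = numpy.array([4.0, 4.0])               # target lengths per batch entry
x_len = numpy.array([3.0, 3.0])               # source lengths per batch entry

coverage = dec_alphas.sum(0)                  # attention mass per source position
alpha_reg = alpha_c * (
    ((y_len // x_len)[:, None] - coverage) ** 2).sum(1).mean()
print alpha_reg                               # ~0.033: coverage is near the ratio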
Esempio n. 28
0
def test(model_options_file='model_options.pkl',
         model_file='model_best_so_far.npz'):
    from_dir = 'model_files/'
    print 'preparing reload'
    model_options = utils.load_pkl(from_dir + model_options_file)

    print 'Loading data'
    engine = data_engine.Movie2Caption(
        'attention', model_options['dataset'], model_options['video_feature'],
        model_options['batch_size'], model_options['valid_batch_size'],
        model_options['maxlen'], model_options['n_words'], model_options['K'],
        model_options['OutOf'])

    print 'init params'
    t0 = time.time()
    model = Model()
    params = model.init_params(model_options)

    model_saved = from_dir + model_file
    assert os.path.isfile(model_saved)
    print "Reloading model params..."
    params = utils.load_params(model_saved, params)
    tparams = utils.init_tparams(params)
    print tparams.keys()

    print 'building sampler'
    use_noise = theano.shared(numpy.float32(0.))
    use_noise.set_value(0.)
    trng = RandomStreams(1234)
    f_init, f_next = model.build_sampler(tparams, model_options, use_noise,
                                         trng)

    print 'start test...'
    blue_t0 = time.time()
    scores, processes, queue, rqueue, shared_params = \
        metrics.compute_score(model_type='attention',
                              model_archive=params,
                              options=model_options,
                              engine=engine,
                              save_dir=from_dir,
                              beam=5, n_process=5,
                              whichset='both',
                              on_cpu=False,
                              processes=None, queue=None, rqueue=None,
                              shared_params=None,
                              metric=model_options['metric'],
                              one_time=False,
                              f_init=f_init, f_next=f_next, model=model)

    valid_B1 = scores['valid']['Bleu_1']
    valid_B2 = scores['valid']['Bleu_2']
    valid_B3 = scores['valid']['Bleu_3']
    valid_B4 = scores['valid']['Bleu_4']
    valid_Rouge = scores['valid']['ROUGE_L']
    valid_Cider = scores['valid']['CIDEr']
    valid_meteor = scores['valid']['METEOR']
    test_B1 = scores['test']['Bleu_1']
    test_B2 = scores['test']['Bleu_2']
    test_B3 = scores['test']['Bleu_3']
    test_B4 = scores['test']['Bleu_4']
    test_Rouge = scores['test']['ROUGE_L']
    test_Cider = scores['test']['CIDEr']
    test_meteor = scores['test']['METEOR']
    print 'computing meteor/bleu score used %.4f sec, ' \
          'B@1: %.3f, B@2: %.3f, B@3: %.3f, B@4: %.3f, M: %.3f' % (
              time.time() - blue_t0, test_B1, test_B2, test_B3, test_B4,
              test_meteor)
Esempio n. 29
0
def main(model_options):
  
  print 'Loading data'
  dp = data_provider()
  dp.load_data(model_options['batch_size'], model_options['word_count_threshold'])
  dp.build_word_vocab()
  dp.group_train_captions_by_length()
  model_options['vocab_size'] = dp.get_word_vocab_size()

  print 'Building model'  
  # This create the initial parameters as numpy ndarrays.
  generator = caption_generator()
  params = generator.init_params(model_options)
  save_n = {}
  save_n['checkpoint'] = 0
  save_n['prediction'] = 0
  
  # reload a saved checkpoint
  if model_options['reload_checkpoint_path']:
    _, save_n['checkpoint'] = utils.load_params(model_options['reload_checkpoint_path'], params)
    print 'Reloaded checkpoint from', model_options['reload_checkpoint_path']
  
  # This create Theano Shared Variable from the parameters.
  # Dict name (string) -> Theano Tensor Shared Variable
  # params and tparams have different copy of the weights.
  tparams = utils.init_tparams(params)
  
  # use_noise is for dropout
  sents, mask, imgs, gt_sents, use_noise, cost = generator.build_model(tparams)
  grads = tensor.grad(cost, wrt=tparams.values())
  
  lr = tensor.scalar(name='lr')
  f_grad_shared, f_update = optimizers[model_options['optimizer']](lr, tparams, grads, sents, mask, imgs, gt_sents, cost)
  
  imgs_to_predict, predicted_indices, predicted_prob = generator.predict(tparams)
  f_pred = theano.function([imgs_to_predict], predicted_indices, name='f_pred')
  f_pred_prob = theano.function([imgs_to_predict], predicted_prob, name='f_pred_prob')
    
  train_iter = dp.train_iterator
  kf_valid = KFold(len(dp.split['val']), n_folds=len(dp.split['val']) / model_options['batch_size'], shuffle=False)
  
  if model_options['use_dropout'] == 1:
    use_noise.set_value(1.)
  else:
    use_noise.set_value(0.)
     
  print 'Optimization'
  
  uidx = 0
  lrate = model_options['lrate']
  # display print time duration
  dp_start = time.time()
  for eidx in xrange(model_options['max_epochs']):
    print 'Epoch ', eidx
    
    for batch_data in train_iter:
      uidx += 1
      
      # preparing the mini batch data
      pd_start = time.time()
      sents, sents_mask, imgs, gt_sents = dp.prepare_train_batch_data(batch_data)
      pd_duration = time.time() - pd_start
      
      if sents is None:
        print 'Minibatch is empty'
        continue
      
      # training on the mini batch
      ud_start = time.time()
      cost = f_grad_shared(sents, sents_mask, imgs, gt_sents)
      f_update(lrate)
      ud_duration = time.time() - ud_start
      
      # Numerical stability check
      if numpy.isnan(cost) or numpy.isinf(cost):
        print 'NaN detected'
      
      if numpy.mod(uidx, model_options['disp_freq']) == 0:
        dp_duration = time.time() - dp_start
        print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, \
            'Prepare data ', pd_duration, 'Update data ', ud_duration, \
            '{0}_iter_time {1}'.format(model_options['disp_freq'], dp_duration)
        dp_start = time.time()

      # Log validation loss + checkpoint the model with the best validation log likelihood
      if numpy.mod(uidx, model_options['valid_freq']) == 0:
        scores = validate_and_save_checkpoint(model_options, dp, params, tparams, f_pred, f_pred_prob, kf_valid, save_n)
        print scores
  
  print 'Performing final validation'
  scores = validate_and_save_checkpoint(model_options, dp, params, tparams, f_pred, f_pred_prob, kf_valid, save_n)
  print scores
  print 'Done!!!'
Esempio n. 30
0
def train(
        experiment_id,
        data_base_path,
        output_base_path,
        model_options,
        data_options,
        validation_options,
        patience,  # early stopping patience
        max_epochs,
        finish_after,  # finish after this many updates
        clip_c,  # gradient clipping threshold
        lrate,  # learning rate
        optimizer,
        saveto,
        valid_freq,
        time_limit,
        save_freq,  # save the parameters after every saveFreq updates
        sample_freq,  # generate some samples after every sampleFreq
        verbose,
        reload_from=None,
        pretrained_word_emb=None):

    start_time = time.time()

    def join_data_base_path(data_base, options):
        for kk, vv in six.iteritems(options):
            if kk in [
                    'src', 'trg', 'input_vocab', 'label_vocab', 'valid_src',
                    'valid_trg'
            ]:
                options[kk] = os.path.join(data_base, options[kk])

        return options

    data_options = join_data_base_path(data_base_path, data_options)
    validation_options = join_data_base_path(data_base_path,
                                             validation_options)

    worddicts_r, train_stream, valid_stream = load_data(**data_options)

    model_options['n_input_tokens'] = len(worddicts_r[0])
    model_options['n_labels'] = len(worddicts_r[1])

    if model_options['label_type'] == 'binary':
        model_options['n_bins'] = len(worddicts_r[1])
        model_options['n_labels'] = 2
        max_sample_length = len(worddicts_r[1])
    else:
        max_sample_length = data_options['max_label_length']

    LOGGER.info('Building model')
    params = init_params(model_options)
    # reload parameters
    best_filename = '{}/{}.{}.best.npz'.format(output_base_path, experiment_id,
                                               saveto)

    if pretrained_word_emb and os.path.exists(pretrained_word_emb):
        assert model_options['input_token_level'] == 'word'
        LOGGER.info('Loading pretrained word embeddings from {}'.format(
            pretrained_word_emb))

        pretrained_emb = load_pretrained_embeddings(pretrained_word_emb)

        # TODO check if the size of the pretrained word embedding equals
        # the size of the initialized word embeddings
        # Also, check whether or not the vocabulary in the pretrained word
        # embeddings is identical to the vocabulary in the model.
        pvocab = pretrained_emb['vocab']  # (idx, word)

        # XXX if the assertions passed, then load the pretrained embeddings
        assert pretrained_emb['Wemb'].dtype == numpy.float32, \
            'The pretrained word embeddings should be float32\n'
        assert pretrained_emb['Wemb'].shape[1] == params['Wemb'].shape[1], \
            '{} does not match {}\n'.format(pretrained_emb['Wemb'].shape[1],
                                            params['Wemb'].shape[1])

        pretrained_word2id = {word: idx for (idx, word) in pvocab}

        param_indices, indices = [], []
        for ii in xrange(len(worddicts_r[0])):

            if ii >= data_options['n_input_tokens']:
                break

            word = worddicts_r[0][ii]
            if word in pretrained_word2id:
                word_idx = pretrained_word2id[word]
                indices.append(word_idx)
                param_indices.append(ii)

        assert len(indices) <= data_options['n_input_tokens']

        params['Wemb'][param_indices] = pretrained_emb['Wemb'][indices]
        # normalize word embeddings
        params['Wemb'] = params['Wemb'] / \
            numpy.sqrt((params['Wemb']**2).sum(axis=1)[:, None])
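        # Numeric sanity check (hypothetical values, not from this code): a row
        # [3., 4.] has L2 norm 5. and becomes [0.6, 0.8], so after this step
        # every row of Wemb has unit L2 norm.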

    if reload_from and os.path.exists(reload_from):
        LOGGER.info('Loading parameters from {}'.format(reload_from))
        params = load_params(reload_from, params)

    LOGGER.info('Initializing parameters')
    tparams = init_tparams(params)

    # use_noise is for dropout
    trng, use_noise, encoder_vars, decoder_vars, \
        opt_ret, costs = build_model(tparams, model_options)

    inps = encoder_vars + decoder_vars

    LOGGER.info('Building sampler')
    f_sample_inits, f_sample_nexts \
        = build_sampler(tparams, model_options, trng, use_noise)

    # before any regularizer
    LOGGER.info('Building functions to compute log prob')
    f_log_probs = [
        theano.function(inps,
                        cost_,
                        name='f_log_probs_%s' % cost_.name,
                        on_unused_input='ignore') for cost_ in costs
    ]

    assert len(costs) == 1

    cost = costs[0]
    '''
    for cost_ in costs[1:]:
        cost += cost_
    '''
    cost = cost.mean()

    LOGGER.info('Computing gradient')
    grads = tensor.grad(cost, wrt=itemlist(tparams))

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c,
                              g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')

    LOGGER.info('Building optimizers')
    f_grad_shared, f_update, optimizer_state = \
        getattr(optimizers, optimizer)(lr, tparams, grads, inps, cost)

    optimizer_state = name_dict(optimizer_state)

    # TODO set_value optimizer_state
    if reload_from and os.path.exists(reload_from):
        LOGGER.info('Loading optimizer state from {}'.format(reload_from))
        optimizer_state = load_params(reload_from,
                                      optimizer_state,
                                      theano_var=True)

    LOGGER.info('Optimization')

    log = Logger(
        filename='{}/{}.log.jsonl.gz'.format(output_base_path, experiment_id))

    best_valid_err = float('inf')
    best_model = None
    total_nsamples = 0
    uidx = 0
    uidx_restore = [0]
    estop = False
    if reload_from and os.path.exists(reload_from):
        rmodel = numpy.load(reload_from)
        if 'uidx' in rmodel:
            uidx_restore = rmodel['uidx']
        if 'best_valid_err' in rmodel:
            best_valid_err = rmodel['best_valid_err']
        if 'total_nsamples' in rmodel and rmodel['total_nsamples'] > 0:
            total_nsamples = rmodel['total_nsamples']

        best_model = [unzip(tparams), unzip(optimizer_state), uidx_restore]

    train_start = time.clock()
    max_updates_per_epoch = total_nsamples / data_options['batch_size']

    try:
        for epoch in xrange(0, max_epochs):
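            # when resuming, fast-forward over epochs that the restored update
            # counter has already covered instead of iterating their batches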
            if total_nsamples > 0 and \
                    uidx + max_updates_per_epoch < uidx_restore[0]:
                uidx += max_updates_per_epoch
                continue

            n_samples = 0
            for x, x_mask, \
                    y, y_mask in train_stream.get_epoch_iterator():

                n_samples += len(x)

                uidx += 1
                if uidx < uidx_restore[0]:
                    continue

                x_length = x_mask.sum(1).mean()

                if model_options['label_type'] == 'binary':
                    old_y = y
                    y, y_mask = mul2bin(y, y_mask, model_options['n_bins'])

                y, y_mask = y.T, y_mask.T

                if data_options['input_token_level'] == 'character':
                    x, x_mask = prepare_character_tensor(x)
                else:
                    x, x_mask = x.T, x_mask.T

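                # drop minibatch instances whose source side is more than half
                # UNK tokens (index 1 here)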
                unk_token_ratio = (x == 1).sum(0) / x_mask.sum(0)
                non_empty_insts = unk_token_ratio <= 0.5

                y = y[:, non_empty_insts]
                y_mask = y_mask[:, non_empty_insts]
                x = x[:, non_empty_insts]
                x_mask = x_mask[:, non_empty_insts]

                if x.shape[1] == 0:
                    continue

                encoder_inps = [x, x_mask]
                decoder_inps = [y, y_mask]

                inps = encoder_inps + decoder_inps

                use_noise.set_value(1.)

                log_entry = {'iteration': uidx, 'epoch': epoch}

                # compute cost, grads and copy grads to shared variables
                update_start = time.clock()
                cost = f_grad_shared(*inps)
                f_update(lrate)

                if verbose:
                    log_entry['cost'] = float(cost)
                    log_entry['average_source_length'] = \
                        float(x_length)
                    log_entry['average_target_length'] = \
                        float(y_mask.sum(0).mean())
                    log_entry['update_time'] = time.clock() - update_start
                    log_entry['train_time'] = time.clock() - train_start

                # check for bad numbers; usually we would remove non-finite
                # elements and continue training, but that is not done here
                if not numpy.isfinite(cost):
                    LOGGER.error('NaN detected')
                    return 1., 1., 1.

                # validate model on validation set and early stop if necessary
                if numpy.mod(uidx, valid_freq) == 0:
                    use_noise.set_value(0.)
                    valid_errs = [
                        numpy.mean(pred_probs(f_, model_options, valid_stream))
                        for f_ in f_log_probs
                    ]

                    for f_, err_ in zip(f_log_probs, valid_errs):
                        log_entry['validation_%s' % f_.name] = float(err_)

                    valid_scores = do_validation(f_sample_inits,
                                                 f_sample_nexts, tparams,
                                                 max_sample_length, trng,
                                                 model_options, valid_stream)

                    for eval_type, score in valid_scores.items():
                        log_entry['validation_%s' % eval_type] = score

                    for f_, err_ in zip(f_log_probs, valid_errs):
                        if not numpy.isfinite(err_):
                            raise RuntimeError(('NaN detected in validation '
                                                'error of %s') % f_.name)
                    valid_err = numpy.array(valid_errs).sum()

                    if valid_err < best_valid_err:
                        best_valid_err = valid_err
                        best_model = [
                            unzip(tparams),
                            unzip(optimizer_state), [uidx]
                        ]

                # save the best model so far
                if numpy.mod(uidx, save_freq) == 0 and \
                        uidx > uidx_restore[0]:
                    LOGGER.info('Saving best model so far')

                    if best_model is not None:
                        params, opt_state, save_at_uidx = best_model
                    else:
                        params = unzip(tparams)
                        opt_state = unzip(optimizer_state)
                        save_at_uidx = [uidx]

                    # save params to exp_id.npz and symlink model.npz to it
                    params_and_state = merge(
                        params, opt_state, {'uidx': save_at_uidx},
                        {'best_valid_err': best_valid_err},
                        {'total_nsamples': total_nsamples})
                    save_params(params_and_state, best_filename)

                # generate some samples with the model and display them
                if sample_freq > 0 and numpy.mod(uidx, sample_freq) == 0:
                    # FIXME: random selection?
                    log_entry['samples'] = []

                    if data_options['input_token_level'] == 'character':
                        batch_size = x.shape[2]
                    else:
                        batch_size = x.shape[1]
                    for jj in xrange(numpy.minimum(5, batch_size)):
                        stats = [('source', ''), ('truth', ''), ('sample', ''),
                                 ('align_sample', '')]
                        log_entry['samples'].append(OrderedDict(stats))

                        if data_options['input_token_level'] == 'character':
                            sample_encoder_inps = [
                                x[:, :, jj][:, :, None], x_mask[:, :, jj][:, :,
                                                                          None]
                            ]
                        else:
                            sample_encoder_inps = [
                                x[:, jj][:, None], x_mask[:, jj][:, None]
                            ]

                        solutions = gen_sample(tparams,
                                               f_sample_inits,
                                               f_sample_nexts,
                                               sample_encoder_inps,
                                               model_options,
                                               trng=trng,
                                               k=12,
                                               max_label_len=max_sample_length,
                                               argmax=False)

                        sample = solutions['samples']
                        alignment = solutions['alignments']
                        score = solutions['scores']

                        score = score / numpy.array([len(s) for s in sample])
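                        # keep the hypothesis with the lowest length-normalized
                        # score (and its alignment) for display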
                        ss = sample[score.argmin()]
                        alignment = alignment[score.argmin()]

                        if model_options['label_type'] == 'binary':
                            # print(y[0], y.shape, old_y.shape)
                            y = old_y.T
                            assert type(ss) == list
                            assert len(ss) == max_sample_length
                            new_ss = [
                                tidx for tidx, s in enumerate(ss) if s == 1
                            ]
                            # print(len(ss), numpy.sum(ss), new_ss)
                            ss = new_ss
                            assert type(ss) == list

                            if len(ss) == 0:
                                ss.append(0)

                            if ss[0] == 0:  # if the first token is <EOS>
                                ss = ss[1:] + ss[:1]

                        if data_options['input_token_level'] == 'character':
                            num_src_words = int(
                                (x_mask[:, :, jj].sum(0) > 0).sum())

                            num_chars, num_words, num_samples = x.shape
                            for widx in xrange(num_words):
                                if x_mask[:, widx, jj].sum() == 0:
                                    break
                                for cidx in xrange(num_chars):
                                    cc = x[cidx, widx, jj]
                                    if cc == 0:
                                        break
                                    if cc in worddicts_r[0]:
                                        token = worddicts_r[0][cc]
                                    else:
                                        token = UNK_TOKEN
                                    log_entry['samples'][-1]['source'] \
                                        += token
                                log_entry['samples'][-1]['source'] += \
                                    ' '
                        else:
                            num_src_words = int(x_mask[:, jj].sum())

                            num_words, num_samples = x.shape
                            for vv in x[:, jj]:
                                if vv == 0:
                                    break
                                if vv in worddicts_r[0]:
                                    token = worddicts_r[0][vv]
                                else:
                                    token = UNK_TOKEN
                                log_entry['samples'][-1]['source'] \
                                    += token + ' '

                        for vv in y[:, jj]:
                            if vv == 0:
                                break
                            if vv in worddicts_r[1]:
                                token = worddicts_r[1][vv]
                            else:
                                token = UNK_TOKEN
                            log_entry['samples'][-1]['truth'] += token + ' '

                        for tidx, vv in enumerate(ss):
                            if vv == 0:
                                break
                            if vv in worddicts_r[1]:
                                token = worddicts_r[1][vv]
                            else:
                                token = UNK_TOKEN

                            assert tidx >= 0 and tidx < len(alignment), \
                                '%d\t%d' % (tidx, len(alignment))

                            align_src_word_idx = \
                                (alignment[tidx][
                                    :num_src_words-1]).argmax()
                            aligned_token = '%s_<%d>' % \
                                (token, align_src_word_idx)

                            log_entry['samples'][-1]['sample'] += token + ' '
                            log_entry['samples'][-1]['align_sample'] \
                                += aligned_token + ' '

                # finish after this many updates
                if uidx >= finish_after:
                    LOGGER.info('Finishing after {} iterations'.format(uidx))
                    estop = True
                    break

                if time_limit > 0 and \
                   (time.time() - start_time > time_limit * 60):

                    LOGGER.info(
                        'Time limit {} mins is over'.format(time_limit))
                    estop = True

                    break

                if verbose and len(log_entry) > 2:
                    log.log(log_entry)

            LOGGER.info('Completed epoch, seen {} samples'.format(n_samples))
            if total_nsamples == 0:
                total_nsamples = n_samples

            if estop:
                log.log(log_entry)
                break

        if best_model is not None:
            assert len(best_model) == 3
            best_p, best_state, best_uidx = best_model
            zipp(best_p, tparams)
            zipp(best_state, optimizer_state)
        '''
        use_noise.set_value(0.)
        LOGGER.info('Calculating validation cost')
        valid_errs = do_validation(f_log_probs, model_options, valid_stream)
        '''

        if not best_model:
            best_p = unzip(tparams)
            best_state = unzip(optimizer_state)
            best_uidx = [uidx]

        best_p = copy.copy(best_p)
        best_state = copy.copy(best_state)
        params_and_state = merge(best_p, best_state, {'uidx': best_uidx},
                                 {'best_valid_err': best_valid_err},
                                 {'total_nsamples': total_nsamples})
        save_params(params_and_state, best_filename)

    except Exception:
        LOGGER.error(traceback.format_exc())
        best_valid_err = -1.
    else:
        # XXX add something needed
        print('Training Done')

    return best_valid_err
Example n. 31
0
def train(dim_word=100,  # word vector dimensionality
          dim=1000,  # the number of LSTM units
          encoder='gru',
          decoder='gru_cond',
          n_words_src=30000,
          n_words=30000,
          patience=10,  # early stopping patience
          max_epochs=5000,
          finish_after=10000000,  # finish after this many updates
          dispFreq=100,
          decay_c=0.,  # L2 regularization penalty
          alpha_c=0.,  # alignment regularization
          clip_c=-1.,  # gradient clipping threshold
          lrate=1.,  # learning rate
          maxlen=100,  # maximum length of the description
          optimizer='rmsprop',
          batch_size=16,
          saveto='model.npz',
          saveFreq=1000,  # save the parameters after every saveFreq updates
          datasets=[
              '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
              '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'],
          picked_train_idxes_file=r'',
          use_dropout=False,
          reload_=False,
          overwrite=False,
          preload='',
          sort_by_len=False,
          convert_embedding=True,
          dump_before_train=False,
    ):
    # Model options
    model_options = locals().copy()
    if reload_:
        lrate *= 0.5

    # load dictionaries and invert them

    # reload options
    if reload_ and os.path.exists(preload):
        print 'Reloading model options'
        with open(r'.\model\en2fr.iter160000.npz.pkl', 'rb') as f:
            model_options = pkl.load(f)

    print 'Configuration from fy'

    vocab_en_filename = './data/dic/en2fr_en_vocabs_top1M.pkl'
    vocab_fr_filename = './data/dic/en2fr_fr_vocabs_top1M.pkl'
    map_filename = './data/dic/mapFullVocab2Top1MVocab.pkl'
    lr_discount_freq = 80000

    print 'Done'

    print 'Loading data'

    text_iterator = TextIterator(
        datasets[0],
        datasets[1],
        vocab_en_filename,
        vocab_fr_filename,
        batch_size,
        maxlen,
        n_words_src,
        n_words,
    )

    # sys.stdout.flush()
    # train_data_x = pkl.load(open(datasets[0], 'rb'))
    # train_data_y = pkl.load(open(datasets[1], 'rb'))
    #
    # if len(picked_train_idxes_file) != 0:
    #     picked_idxes = pkl.load(open(picked_train_idxes_file, 'rb'))
    #     train_data_x = [train_data_x[id] for id in picked_idxes]
    #     train_data_y = [train_data_y[id] for id in picked_idxes]
    #
    # print 'Total train:', len(train_data_x)
    # print 'Max len:', max([len(x) for x in train_data_x])
    # sys.stdout.flush()
    #
    # if sort_by_len:
    #     slen = np.array([len(s) for s in train_data_x])
    #     sidx = slen.argsort()
    #
    #     _sbuf = [train_data_x[i] for i in sidx]
    #     _tbuf = [train_data_y[i] for i in sidx]
    #
    #     train_data_x = _sbuf
    #     train_data_y = _tbuf
    #     print len(train_data_x[0]), len(train_data_x[-1])
    #     sys.stdout.flush()
    #     train_batch_idx = get_minibatches_idx(len(train_data_x), batch_size, shuffle=False)
    # else:
    #     train_batch_idx = get_minibatches_idx(len(train_data_x), batch_size, shuffle=True)

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(preload):
        print 'Reloading model parameters'
        params = load_params(preload, params)

        # for k, v in params.iteritems():
        #     print '>', k, v.shape, v.dtype

        # Only convert parameters when reloading
        if convert_embedding:
            # =================
            # Convert input and output embedding parameters with an existing word embedding
            # =================
            print 'Convert input and output embedding'

            temp_Wemb = params['Wemb']
            orig_emb_mean = np.mean(temp_Wemb, axis=0)

            params['Wemb'] = np.tile(orig_emb_mean, [params['Wemb'].shape[0], 1])
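            # every row now holds the mean of the old embedding matrix; the loop
            # below copies back rows that have a full-vocab -> top-1M mapping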

            # Load vocabulary map dicts and do mapping
            with open(map_filename, 'rb') as map_file:
                map_en = pkl.load(map_file)
                map_fr = pkl.load(map_file)

            for full, top in map_en.iteritems():
                emb_size = temp_Wemb.shape[0]
                if full < emb_size and top < emb_size:
                    params['Wemb'][top] = temp_Wemb[full]

            print 'Convert input embedding done'

            temp_ff_logit_W = params['ff_logit_W']
            temp_Wemb_dec = params['Wemb_dec']
            temp_b = params['ff_logit_b']

            orig_ff_logit_W_mean = np.mean(temp_ff_logit_W, axis=1)
            orig_Wemb_dec_mean = np.mean(temp_Wemb_dec, axis=0)
            orig_b_mean = np.mean(temp_b)

            params['ff_logit_W'] = np.tile(orig_ff_logit_W_mean, [params['ff_logit_W'].shape[1], 1]).T
            params['ff_logit_b'].fill(orig_b_mean)
            params['Wemb_dec'] = np.tile(orig_Wemb_dec_mean, [params['Wemb_dec'].shape[0], 1])

            for full, top in map_en.iteritems():
                emb_size = temp_Wemb.shape[0]
                if full < emb_size and top < emb_size:
                    params['ff_logit_W'][:, top] = temp_ff_logit_W[:, full]
                    params['ff_logit_b'][top] = temp_b[full]
                    params['Wemb_dec'][top] = temp_Wemb[full]

            print 'Convert output embedding done'

            # for k, v in params.iteritems():
            #     print '>', k, v.shape, v.dtype

            # ================
            # End Convert
            # ================

    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost, x_emb = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, trng, use_noise)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    f_x_emb = theano.function([x, x_mask], x_emb, profile=profile)
    print 'Done'
    sys.stdout.flush()
    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(np.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
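    # (pushes the attention mass each source position receives, summed over
    #  decoder steps, towards the target/source length ratio)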
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(np.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0)) ** 2).sum(1).mean()
        cost += alpha_reg

    # after all regularizers - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'
    sys.stdout.flush()
    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g ** 2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (clip_c ** 2),
                                           g / tensor.sqrt(g2) * clip_c,
                                           g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
    print 'Done'

    print 'Optimization'

    best_p = None
    bad_counter = 0
    uidx = 0
    if reload_:
        m = re.search(r'.+iter(\d+?)\.npz', preload)
        if m:
            uidx = int(m.group(1))
    print 'uidx', uidx, 'l_rate', lrate

    estop = False
    history_errs = []
    # reload history

    if dump_before_train:
        print 'Dumping before train...',
        saveto_uidx = '{}.iter{}.npz'.format(
            os.path.splitext(saveto)[0], uidx)
        np.savez(saveto_uidx, history_errs=history_errs,
                 uidx=uidx, **unzip(tparams))
        print 'Done'

    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    for eidx in xrange(max_epochs):
        n_samples = 0

        # for i, batch_idx in train_batch_idx:
        #
        #     x = [train_data_x[id] for id in batch_idx]
        #     y = [train_data_y[id] for id in batch_idx]

        for i, (x, y) in enumerate(text_iterator):
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            x, x_mask, y, y_mask = prepare_data(x, y)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask, y, y_mask)

            # do the update on parameters
            f_update(lrate)

            ud = time.time() - ud_start

            # check for bad numbers; usually we would remove non-finite
            # elements and continue training, but that is not done here
            if np.isnan(cost) or np.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            # halve the learning rate every lr_discount_freq updates
            if lr_discount_freq > 0 and np.mod(uidx, lr_discount_freq) == 0:
                lrate *= 0.5
                print 'Discount learning rate to {} at iteration {}'.format(lrate, uidx)

            # verbose
            if np.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud
                sys.stdout.flush()

            if np.mod(uidx, saveFreq) == 0:
                # save with uidx
                if not overwrite:
                    # print 'Saving the model at iteration {}...'.format(uidx),
                    saveto_uidx = '{}.iter{}.npz'.format(
                            os.path.splitext(saveto)[0], uidx)
                    np.savez(saveto_uidx, history_errs=history_errs,
                             uidx=uidx, **unzip(tparams))
                    # print 'Done'
                    # sys.stdout.flush()
            # generate some samples with the model and display them

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)

    return 0.
Example n. 32
0
def train(random_seed=1234,
          dim_word=256, # word vector dimensionality
          ctx_dim=-1, # context vector dimensionality, auto set
          dim=1000, # the number of LSTM units
          n_layers_out=1,
          n_layers_init=1,
          encoder='none',
          encoder_dim=100,
          prev2out=False,
          ctx2out=False,
          patience=10,
          max_epochs=5000,
          dispFreq=100,
          decay_c=0.,
          alpha_c=0.,
          alpha_entropy_r=0.,
          lrate=0.01,
          selector=False,
          n_words=100000,
          maxlen=100, # maximum length of the description
          optimizer='adadelta',
          clip_c=2.,
          batch_size = 64,
          valid_batch_size = 64,
          save_model_dir='/data/lisatmp3/yaoli/exp/capgen_vid/attention/test/',
          validFreq=10,
          saveFreq=10, # save the parameters after every saveFreq updates
          sampleFreq=10, # generate some samples after every sampleFreq updates
          metric='blue',
          dataset='youtube2text',
          video_feature='googlenet',
          use_dropout=False,
          reload_=False,
          from_dir=None,
          K1=10,
          K2=10,
          OutOf=240,
          verbose=True,
          debug=True
          ):
    rng_numpy, rng_theano = utils.get_two_rngs()

    model_options = locals().copy()
    model_options_c = locals().copy()
    if 'self' in model_options:
        del model_options['self']
    with open('model_files/model_options.pkl', 'wb') as f:
        pkl.dump(model_options, f)
    with open('model_files/model_options_c3d.pkl', 'wb') as f:
        pkl.dump(model_options_c, f)

    # instance model
    layers = Layers()
    model = Model()
    model_c = Model()

    print 'Loading data'
    engine = data_engine.Movie2Caption('attention', dataset,
                                       video_feature,
                                       batch_size, valid_batch_size,
                                       maxlen, n_words,
                                       K1, K2, OutOf)
    model_options['ctx_dim'] = engine.ctx_dim
    model_options_c['ctx_dim'] = engine.ctx_dim_c
    model_options['n_words'] = engine.n_words
    model_options_c['n_words'] = engine.n_words
    print 'n_words:', model_options['n_words']
    print model_options_c['dim'],model_options_c['ctx_dim']

    # set test values, for debugging
    idx = engine.kf_train[0]
    [x_tv, mask_tv,
     ctx_tv, ctx_mask_tv,
     ctx_tv_c, ctx_mask_tv_c] = data_engine.prepare_data(
        engine, [engine.train[index] for index in idx])

    print 'init params'
    t0 = time.time()
    params = model.init_params(model_options)
    params_c = model_c.init_params(model_options_c)
    # reloading
    model_saved = 'model_files/model_resnet.npz'
    model_saved_c = 'model_files/model_c3d.npz'
    assert os.path.isfile(model_saved)
    print "Reloading model params..."
    params = utils.load_params(model_saved, params)
    params_c = utils.load_params(model_saved_c, params_c)

    tparams = utils.init_tparams(params)
    tparams_c = utils.init_tparams(params_c)

    trng, use_noise, \
          x, mask, ctx, mask_ctx, \
          cost, extra = \
          model.build_model(tparams, model_options)
    alphas = extra[1]
    betas = extra[2]

    trng_c, use_noise_c, \
    x_c, mask_c, ctx_c, mask_ctx_c, \
    cost_c, extra_c = \
        model_c.build_model(tparams_c, model_options_c)


    alphas_c = extra_c[1]
    betas_c = extra_c[2]

    print 'building sampler'
    f_init, f_next = model.build_sampler(tparams, model_options, use_noise, trng)
    f_init_c, f_next_c = model_c.build_sampler(tparams_c, model_options_c, use_noise_c, trng_c)
    # before any regularizer
    print 'building f_log_probs'
    f_log_probs = theano.function([x, mask, ctx, mask_ctx], -cost,
                                  profile=False, on_unused_input='ignore')
    f_log_probs_c = theano.function([x_c, mask_c, ctx_c, mask_ctx_c], -cost_c,
                                  profile=False, on_unused_input='ignore')

    bad_counter = 0

    processes = None
    queue = None
    rqueue = None
    shared_params = None

    uidx = 0
    uidx_best_blue = 0
    uidx_best_valid_err = 0
    estop = False
    best_p = utils.unzip(tparams)
    best_blue_valid = 0
    best_valid_err = 999
    alphas_ratio = []
    for eidx in xrange(max_epochs):
        n_samples = 0
        train_costs = []
        grads_record = []
        print 'Epoch ', eidx
        for idx in engine.kf_train:
            tags = [engine.train[index] for index in idx]
            n_samples += len(tags)
            use_noise.set_value(1.)

            pd_start = time.time()
            x, mask, ctx, ctx_mask, ctx_c, ctx_mask_c = data_engine.prepare_data(
                engine, tags)
            #print 'x:',x.shape,'ctx:',ctx.shape,'ctx_c:',ctx_c.shape
            pd_duration = time.time() - pd_start
            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                continue

            if numpy.mod(uidx, saveFreq) == 0:
                pass

            if numpy.mod(uidx, sampleFreq) == 0:
                use_noise.set_value(0.)
                print '------------- sampling from train ----------'
                x_s = x
                mask_s = mask
                ctx_s = ctx
                ctx_s_c = ctx_c
                ctx_mask_s = ctx_mask
                ctx_mask_s_c = ctx_mask_c
                model.sample_execute_ensemble(engine, model_options,model_options_c, tparams,tparams_c,
                                          f_init,f_init_c, f_next,f_next_c, x_s, ctx_s,
                                          ctx_mask_s, ctx_s_c, ctx_mask_s_c, trng)
                print '------------- sampling from valid ----------'
                idx = engine.kf_valid[numpy.random.randint(1, len(engine.kf_valid) - 1)]
                tags = [engine.valid[index] for index in idx]
                x_s, mask_s, ctx_s, mask_ctx_s, ctx_s_c,mask_ctx_s_c = data_engine.prepare_data(engine, tags)
                model.sample_execute_ensemble(engine, model_options,model_options_c, tparams,tparams_c,
                                          f_init, f_init_c, f_next, f_next_c, x_s, ctx_s,
                                     mask_ctx_s, ctx_s_c, mask_ctx_s_c, trng)

            if validFreq != -1 and numpy.mod(uidx, validFreq) == 0:
                current_params = utils.unzip(tparams)

                use_noise.set_value(0.)
                train_err = -1
                train_perp = -1
                valid_err = -1
                valid_perp = -1
                test_err = -1
                test_perp = -1

                mean_ranking = 0
                blue_t0 = time.time()
                scores, processes, queue, rqueue, shared_params = \
                    metrics.compute_score_ensemble(
                    model_type='attention',
                    model_archive=current_params,
                    options=model_options,
                    options_c=model_options_c,
                    engine=engine,
                    save_dir=save_model_dir,
                    beam=5, n_process=5,
                    whichset='both',
                    on_cpu=False,
                    processes=processes, queue=queue, rqueue=rqueue,
                    shared_params=shared_params, metric=metric,
                    one_time=False,
                    f_init=f_init, f_init_c=f_init_c, f_next=f_next, f_next_c= f_next_c, model=model
                    )
                '''
                 {'blue': {'test': [-1], 'valid': [77.7, 60.5, 48.7, 38.5, 38.3]},
                 'alternative_valid': {'Bleu_3': 0.40702270203174923,
                 'Bleu_4': 0.29276570520368456,
                 'CIDEr': 0.25247168210607884,
                 'Bleu_2': 0.529069629270047,
                 'Bleu_1': 0.6804308797115253,
                 'ROUGE_L': 0.51083584331688392},
                 'meteor': {'test': [-1], 'valid': [0.282787550236724]}}
                '''

                valid_B1 = scores['valid']['Bleu_1']
                valid_B2 = scores['valid']['Bleu_2']
                valid_B3 = scores['valid']['Bleu_3']
                valid_B4 = scores['valid']['Bleu_4']
                valid_Rouge = scores['valid']['ROUGE_L']
                valid_Cider = scores['valid']['CIDEr']
                valid_meteor = scores['valid']['METEOR']
                test_B1 = scores['test']['Bleu_1']
                test_B2 = scores['test']['Bleu_2']
                test_B3 = scores['test']['Bleu_3']
                test_B4 = scores['test']['Bleu_4']
                test_Rouge = scores['test']['ROUGE_L']
                test_Cider = scores['test']['CIDEr']
                test_meteor = scores['test']['METEOR']
                print 'computing METEOR/BLEU score used %.4f sec, '\
                  'BLEU score: %.1f, METEOR score: %.1f' % (
                time.time() - blue_t0, valid_B4, valid_meteor)


                if test_B4>0.52 and test_meteor>0.32:
                    print 'Saving to %s...'%save_model_dir,
                    numpy.savez(
                        save_model_dir+'model_'+str(uidx)+'.npz',
                         **current_params)

                print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err, \
                  'best valid err so far',best_valid_err
                print 'valid took %.2f sec'%(time.time() - t0_valid)
                # end of validation
                sys.exit()
            if debug:
                break
        if estop:
            break
        if debug:
            break

        # end for loop over minibatches
        print 'This epoch has seen %d samples, train cost %.2f'%(
            n_samples, numpy.mean(train_costs))
    # end for loop over epochs
    print 'Optimization ended.'
    if best_p is not None:
        utils.zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid_err = 0
    test_err = 0
    if not debug:
        #if valid:
        valid_err, valid_perp = model.pred_probs(
            engine, 'valid', f_log_probs,
            verbose=model_options['verbose'])
        #if test:
        #test_err, test_perp = self.pred_probs(
        #    'test', f_log_probs,
        #    verbose=model_options['verbose'])


    print 'stopped at epoch %d, minibatch %d, '\
      'current Train %.2f, current Valid %.2f, current Test %.2f '%(
          eidx,uidx,numpy.mean(train_err),numpy.mean(valid_err),numpy.mean(test_err))
    params = copy.copy(best_p)
    numpy.savez(save_model_dir+'model_best.npz',
                train_err=train_err,
                valid_err=valid_err, test_err=test_err, history_errs=history_errs,
                **params)

    if history_errs != []:
        history = numpy.asarray(history_errs)
        best_valid_idx = history[:,6].argmin()
        numpy.savetxt(save_model_dir+'train_valid_test.txt', history, fmt='%.4f')
        print 'final best exp ', history[best_valid_idx]

    return train_err, valid_err, test_err
Example n. 33
0
def train(options, data, load_params=False, start_epoc=0):
    print "OPTIONS: ", options
    print 'Setting up model with options:'
    options = set_defaults(options)
    for kk, vv in options.iteritems():
        print kk, vv
    print "model seed: ", options['model_seed']
    print "fold: ", options['fold']
    print 'seed: ', options['seed']
    rng = numpy.random.RandomState(options['model_seed'] +
                                   100 * options.get('fold', 99) +
                                   options.get('seed', 99))
    params, operators = init_params(options, rng)
    print 'done...'

    if load_params:
        loaded = load_par(options)
        start_epoc = resume_epoc(options)
        # Check that we've loaded the correct parameters...
        for kk, vv in loaded.iteritems():
            assert params[kk].shape == vv.shape
            assert type(params[kk]) == type(vv)
        params = loaded

    tparams = init_tparams(params)

    trng, use_noise, inps, out = build_model(tparams, options, rng)
    y = tensor.imatrix('y')
    cost = nll(out, y)

    f_eval = theano.function([inps, y],
                             cost,
                             givens={use_noise: numpy.float32(0.)},
                             on_unused_input='ignore')

    reg = 0.
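    # L1 + L2 (elastic-net style) penalty on the hidden-layer weight matrices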
    for k, v in tparams.iteritems():
        if k[:6] == 'hidden' or k[-3:] == 'W_h':
            reg += options['l1'] * tensor.sum(abs(v))
            reg += options['l2'] * tensor.sum((v)**2)

    cost += reg

    grads = tensor.grad(cost, wrt=itemlist(tparams))
    lr = tensor.scalar(name='lr', dtype=theano.config.floatX)
    opt = get_optim(options['opt'])
    print 'Compiling functions'
    f_grad_shared, f_update, gshared = opt(lr, tparams, grads, [inps, y], cost,
                                           use_noise)
    f_out = theano.function([inps],
                            out,
                            givens={use_noise: numpy.float32(0.)},
                            on_unused_input='ignore',
                            allow_input_downcast=True)

    best = numpy.inf
    print 'Starting training'

    train = list_update(data[0], f_eval, options['batch_size'], rng=rng)
    test = list_update(data[-1], f_eval, options['batch_size'], rng=rng)
    starting = (train, test)
    print 'Pre-training. test: %f, train: %f' % (test, train)
    print 'Training'
    lr = options['lr']
    max_itr = options['max_itr']
    grad_norm = 0.
    train_scores = 50 * [0.]
    try:
        for epoch in xrange(max_itr):
            start_time = time.time()
            for g in gshared:
                # manually set gradients to 0 because we accumulate in list update
                g.set_value(0.0 * g.get_value())
            use_noise.set_value(1.)
            train_cost, n_obs = list_update(data[0],
                                            f_grad_shared,
                                            batchsize=options['batch_size'],
                                            rng=rng,
                                            return_n_obs=True)
            use_noise.set_value(0.)
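            # average the gradients accumulated over the full pass before
            # taking a single parameter update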
            for g in gshared:
                g.set_value(floatx(g.get_value() / float(n_obs)))
            f_update(lr)
            apply_proximity(tparams, operators)
            train = list_update(data[0],
                                f_eval,
                                options['batch_size'],
                                rng=rng)
            elapsed_time = time.time() - start_time

            if train < best:
                # early stopping on training set
                test = list_update(data[-1], f_eval)
                best_par = unzip(tparams)
                best_perf = (train, test)
                best = train

            test = list_update(data[-1], f_eval)

            if (epoch % 50) == 0:
                # Save progress....
                save_progress(options, tparams, epoch, best_perf)
                print 'Epoch: %d, cost: %f, train: %f, test: %f, lr:%f, time: %f' % (
                    epoch, train_cost, train, test, lr, elapsed_time)

            # Check if we're diverging...
            train_ave = running_ave(train_scores, train, epoch)

            if epoch > 1000:
                # Only exit if we're diverging after 1000 iterations
                if train_ave > 1.03 * best_perf[0]:
                    print "Diverged..."
                    break
    except KeyboardInterrupt:
        print "Interrupted"
    # check that we're outputting prob distributions
    X = data[0][(3, 3)][0]
    assert abs(
        f_out(X.reshape(X.shape[0], 2, 3, 3)).sum() - float(X.shape[0])) < 1e-4
    print "Best performance:"
    print "train, test"
    print "%f,%f" % best_perf
    return best_perf, best_par
Example n. 34
0
def generate(model_options_file='model_options.pkl',
             model_file='model_best_so_far.npz'):
    from_dir = 'model_files/'
    print 'preparing reload'
    model_options = utils.load_pkl(from_dir + model_options_file)

    print 'Loading data'
    engine = data_engine.Movie2Caption(
        'attention', model_options['dataset'], model_options['video_feature'],
        model_options['batch_size'], model_options['valid_batch_size'],
        model_options['maxlen'], model_options['n_words'], model_options['K'],
        model_options['OutOf'])

    feat = numpy.load('datas/vid1715.npy')
    ctx = engine.get_sub_frames(feat)
    ctx_mask = engine.get_ctx_mask(ctx)

    print 'init params'
    t0 = time.time()
    model = Model()
    params = model.init_params(model_options)

    model_saved = from_dir + model_file
    assert os.path.isfile(model_saved)
    print "Reloading model params..."
    params = utils.load_params(model_saved, params)
    tparams = utils.init_tparams(params)
    print tparams.keys()

    print 'building sampler'
    use_noise = theano.shared(numpy.float32(0.))
    use_noise.set_value(0.)
    trng = RandomStreams(1234)
    f_init, f_next = model.build_sampler(tparams, model_options, use_noise,
                                         trng)

    print 'start generating...'
    g_t0 = time.time()
    sample, sample_score, _, _ = model.gen_sample(
        None,
        f_init,
        f_next,
        ctx,
        ctx_mask,
        model_options,
        None,
        5,
        maxlen=model_options['maxlen'])
    print sample
    # best_one = numpy.argmin(sample_score)
    # sample = sample[best_one]
    for s in sample:
        for kk, ss in enumerate([s]):
            for vv in ss:
                if vv == 0:
                    break
                if vv in engine.word_idict:
                    print engine.word_idict[vv],
                else:
                    print 'UNK',
        print
Example n. 35
0
def train_model(
    dim_proj=128,  # word embedding dimension and LSTM number of hidden units.
    patience=10,  # Number of epoch to wait before early stop if no progress
    max_epochs=100,  # The maximum number of epoch to run
    dispFreq=500,  # Display to stdout the training progress every N updates
    decay_c=0.,  # Weight decay for the classifier applied to the U weights.
    lrate=0.0001,  # Learning rate for sgd (not used for adadelta and rmsprop)
    n_words=10000,  # Vocabulary size
    optimizer='adadelta',
    encoder='lstm',
    rnnshare=True,
    bidir=False,
    saveto=None,  # The best model will be saved there
    validFreq=370,  # Compute the validation error after this number of update.
    saveFreq=1110,  # Save the parameters after every saveFreq updates
    maxlen=100,  # Sequences longer than this get ignored
    batch_size=16,  # The batch size during training.
    valid_batch_size=64,  # The batch size used for validation/test set.
    dataset='imdb',
    W=None,  # embeddings
    deep=0,  # number of layers above
    rnnlayer=0,  # number of rnn layers
    filter_hs=[3, 4, 5],  # filter widths
    feature_maps=100,  # number of filters
    pool_type='max',  # pooling type
    combine=False,
    init='uniform',
    salstm=False,
    noise_std=0.,
    dropout_penul=0.5,
    reload_model=None,  # Path to a saved model we want to start from.
    data_loader=None,
    fname='',
):

    # Model options
    optimizer = optims[optimizer]
    model_options = locals().copy()
    #print "model options", model_options

    # Load data
    (load_data, prepare_data) = data_loader

    print 'Loading', dataset, 'data'
    train, valid, test = load_data()

    ydim = numpy.max(train[1]) + 1
    model_options['ydim'] = ydim

    print 'Building model'
    # This creates the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    params = init_params(model_options)

    if reload_model:
        load_params(reload_model, params)

    # This creates Theano shared variables from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams hold different copies of the weights.
    tparams = init_tparams(params)

    # use_noise is for dropout
    (use_noise, x, mask, y, f_pred_prob, f_pred,
     cost) = build_model(tparams, model_options, SEED)

    if decay_c > 0.:
        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
        weight_decay = 0.

        weight_decay += (tparams['U']**2).sum()

        if model_options['encoder'] == 'lstm':
            for layer in range(model_options['deep']):
                weight_decay += (tparams['U' + str(layer + 1)]**2).sum()
        elif model_options['encoder'] == 'cnnlstm':
            for filter_h in model_options['filter_hs']:
                weight_decay += (tparams['cnn_f' + str(filter_h)]**2).sum()

        weight_decay *= decay_c
        cost += weight_decay

    f_cost = theano.function([x, mask, y], cost, name='f_cost')

    grads = tensor.grad(cost, wrt=tparams.values())
    f_grad = theano.function([x, mask, y], grads, name='f_grad')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams, grads, x, mask, y, cost)

    print 'Optimization'

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    print "%d train examples" % len(train[0])
    print "%d valid examples" % len(valid[0])
    print "%d test examples" % len(test[0])

    history_errs = []
    best_p = None
    bad_counter = 0

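    # a validFreq/saveFreq of -1 means "once per (approximate) epoch"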
    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    uidx = 0  # the number of update done
    estop = False  # early stop
    start_time = time.time()
    try:
        for eidx in xrange(max_epochs):
            n_samples = 0

            # Get new shuffled index for the training set.
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(1.)

                # Select the random examples for this minibatch
                y = [train[1][t] for t in train_index]
                x = [train[0][t] for t in train_index]

                # Get the data in numpy.ndarray format
                # This swaps the axes!
                # Return something of shape (minibatch maxlen, n samples)
                x, mask, y = prepare_data(x, y)
                n_samples += x.shape[1]

                cost = f_grad_shared(x, mask, y)
                f_update(lrate)

                if numpy.isnan(cost) or numpy.isinf(cost):
                    print 'NaN detected'
                    return 1., 1., 1.

                if numpy.mod(uidx, dispFreq) == 0:
                    print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost

                if saveto and numpy.mod(uidx, saveFreq) == 0:
                    print 'Saving...',

                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    numpy.savez(saveto, history_errs=history_errs, **params)
                    pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
                    print 'Done'

                if numpy.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
                    train_err = pred_error(f_pred, prepare_data, train, kf)
                    valid_err = pred_error(f_pred, prepare_data, valid,
                                           kf_valid)
                    test_err = pred_error(f_pred, prepare_data, test, kf_test)
                    history_errs.append([valid_err, test_err])

                    if (uidx == 0 or valid_err <=
                            numpy.array(history_errs)[:, 0].min()):

                        best_p = unzip(tparams)
                        bad_counter = 0

                    print('Train ', train_err, 'Valid ', valid_err, 'Test ',
                          test_err)

                    if ((len(history_errs) > patience and valid_err >=
                         numpy.array(history_errs)[:-patience, 0].min())):
                        bad_counter += 1

                        if bad_counter > patience:
                            print 'Early Stop!'
                            estop = True
                            break

            print 'Seen %d samples' % n_samples

            if estop:
                break

    except KeyboardInterrupt:
        print "Training interrupted"

    end_time = time.time()
    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.)
    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
    train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted)
    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
    test_err = pred_error(f_pred,
                          prepare_data,
                          test,
                          kf_test,
                          fname=fname,
                          verbose=True)

    print 'Train Error', train_err, 'Valid Error', valid_err, 'Test Error', test_err
    if saveto:
        numpy.savez(saveto,
                    train_err=train_err,
                    valid_err=valid_err,
                    test_err=test_err,
                    history_errs=history_errs,
                    **best_p)
    print 'The code run for %d epochs, with %f sec/epochs' % (
        (eidx + 1), (end_time - start_time) / (1. * (eidx + 1)))
    print >> sys.stderr, ('Training took %.1fs' % (end_time - start_time))
    return train_err, valid_err, test_err
Example n. 36
0
    def __init__(self, num_hidden, num_features, seq_length, mb_size,
                 tf_states, rf_states):

        tf_states = T.specify_shape(tf_states,
                                    (seq_length, mb_size, num_features))
        rf_states = T.specify_shape(rf_states,
                                    (seq_length, mb_size, num_features))

        hidden_state_features = T.specify_shape(
            T.concatenate([tf_states, rf_states], axis=1),
            (seq_length, mb_size * 2, num_features))
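        # teacher-forced (tf) and free-running (rf) state sequences are stacked
        # along the minibatch axis so a single pass scores both halves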

        gru_params_1 = init_tparams(
            param_init_gru(None, {},
                           prefix="gru1",
                           dim=num_hidden,
                           nin=num_features))
        #gru_params_2 = init_tparams(param_init_gru(None, {}, prefix = "gru2", dim = num_hidden, nin = num_hidden + num_features))
        #gru_params_3 = init_tparams(param_init_gru(None, {}, prefix = "gru3", dim = num_hidden, nin = num_hidden + num_features))

        gru_1_out = gru_layer(gru_params_1,
                              hidden_state_features,
                              None,
                              prefix='gru1')[0]
        #gru_2_out = gru_layer(gru_params_2, T.concatenate([gru_1_out, hidden_state_features], axis = 2), None, prefix = 'gru2', backwards = True)[0]
        #gru_3_out = gru_layer(gru_params_3, T.concatenate([gru_2_out, hidden_state_features], axis = 2), None, prefix = 'gru3')[0]

        final_out_recc = T.specify_shape(T.mean(gru_1_out, axis=0),
                                         (mb_size * 2, num_hidden))

        h_out_1 = DenseLayer((mb_size * 2, num_hidden),
                             num_units=num_hidden,
                             nonlinearity=lasagne.nonlinearities.rectify)
        #h_out_2 = DenseLayer((mb_size * 2, num_hidden), num_units = num_hidden, nonlinearity=lasagne.nonlinearities.rectify)
        #h_out_3 = DenseLayer((mb_size * 2, num_hidden), num_units = num_hidden, nonlinearity=lasagne.nonlinearities.rectify)
        h_out_4 = DenseLayer((mb_size * 2, num_hidden),
                             num_units=1,
                             nonlinearity=None)

        h_out_1_value = h_out_1.get_output_for(final_out_recc)
        h_out_4_value = h_out_4.get_output_for(h_out_1_value)

        raw_y = h_out_4_value
        #raw_y = T.clip(h_out_4_value, -10.0, 10.0)
        classification = T.nnet.sigmoid(raw_y)

        #tf comes before rf.
        p_real = classification[:mb_size]
        p_gen = classification[mb_size:]

        #bce = lambda r,t: t * T.nnet.softplus(-r) + (1 - t) * (r + T.nnet.softplus(-r))

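        # label-smoothed GAN targets: real sequences are pushed towards 0.9 and
        # generated ones towards 0.1 instead of hard 1/0 labels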
        self.d_cost_real = bce(p_real, 0.9 * T.ones(p_real.shape)).mean()
        self.d_cost_gen = bce(p_gen, 0.1 + T.zeros(p_gen.shape)).mean()
        self.g_cost_d = bce(p_gen, 0.9 * T.ones(p_gen.shape)).mean()
        self.d_cost = self.d_cost_real + self.d_cost_gen
        self.g_cost = self.g_cost_d

        self.classification = classification

        self.params = []
        self.params += lasagne.layers.get_all_params(h_out_4, trainable=True)
        #self.params += lasagne.layers.get_all_params(h_out_3,trainable=True)
        #self.params += lasagne.layers.get_all_params(h_out_2,trainable=True)
        self.params += lasagne.layers.get_all_params(h_out_1, trainable=True)

        self.params += gru_params_1.values()
        #self.params += gru_params_2.values()
        #self.params += gru_params_3.values()

        self.accuracy = T.mean(
            T.eq(T.ones(p_real.shape).flatten(),
                 T.gt(p_real, 0.5).flatten())) + T.mean(
                     T.eq(
                         T.ones(p_gen.shape).flatten(),
                         T.lt(p_gen, 0.5).flatten()))
Example n. 37
0
def train():
    if prm.optimizer.lower() == 'adam':
        optimizer = adam
    elif prm.optimizer.lower() == 'sgd':
        optimizer = sgd
    elif prm.optimizer.lower() == 'rmsprop':
        optimizer = rmsprop
    elif prm.optimizer.lower() == 'adadelta':
        optimizer = adadelta

    options = locals().copy()

    print 'parameters:', str(options)

    prm_k = vars(prm).keys()
    prm_d = vars(prm)
    prm_k.sort()

    for x in prm_k:
        if not x.startswith('__'):
            print x, '=', prm_d[x]

    print 'loading Vocabulary...'
    vocab = utils.load_vocab(prm.vocab_path, prm.n_words)
    options['vocab'] = vocab

    options['vocabinv'] = {}
    for k, v in vocab.items():
        options['vocabinv'][v] = k

    print options

    print 'Loading Environment...'
    if prm.engine.lower() == 'lucene':
        import lucene_search
        options['engine'] = lucene_search.LuceneSearch()
    elif prm.engine.lower() == 'elastic':
        import elastic_search
        options['engine'] = elastic_search.ElasticSearch()

    print 'Loading Dataset...'
    dh5 = dataset_hdf5.DatasetHDF5(prm.dataset_path)

    qi_train = dh5.get_queries(dset='train')
    dt_train = dh5.get_doc_ids(dset='train')
    qi_valid = dh5.get_queries(dset='valid')
    dt_valid = dh5.get_doc_ids(dset='valid')
    qi_test = dh5.get_queries(dset='test')
    dt_test = dh5.get_doc_ids(dset='test')

    if prm.train_size == -1:
        train_size = len(qi_train)
    else:
        train_size = min(prm.train_size, len(qi_train))

    if prm.valid_size == -1:
        valid_size = len(qi_valid)
    else:
        valid_size = min(prm.valid_size, len(qi_valid))

    if prm.test_size == -1:
        test_size = len(qi_test)
    else:
        test_size = min(prm.test_size, len(qi_test))

    print '%d train examples' % len(qi_train)
    print '%d valid examples' % len(qi_valid)
    print '%d test examples' % len(qi_test)

    # This creates the initial parameters as np ndarrays.
    # Dict name (string) -> np ndarray
    params, exclude_params = utils.init_params(options)

    if prm.wordemb_path:
        print 'loading pre-trained word embeddings'
        params = utils.load_wemb(params, vocab)
        options['W'] = params['W']

    if prm.reload_model:
        utils.load_params(prm.reload_model, params)

    print 'Building model'
    # This creates Theano shared variables from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams hold different copies of the weights.
    tparams = utils.init_tparams(params)
    for kk, value in tparams.iteritems():
        tparams[kk] = theano.shared(value, name=kk)

    iin, out, updates, f_pred, consider_constant \
        = build_model(tparams, options)

    # get only parameters that are not in the exclude_params list
    tparams_ = OrderedDict([(kk, vv) for kk, vv in tparams.iteritems()
                            if kk not in exclude_params])

    grads = tensor.grad(out[0],
                        wrt=utils.itemlist(tparams_),
                        consider_constant=consider_constant)

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams_, grads, iin, out, updates)

    history_errs = []
    best_p = None
    bad_counter = 0

    if prm.validFreq == -1:
        validFreq = len(qi_train) / prm.batch_size_train
    else:
        validFreq = prm.validFreq

    if prm.saveFreq == -1:
        saveFreq = len(qi_train) / prm.batch_size_train
    else:
        saveFreq = prm.saveFreq

    uidx = 0  # the number of update done
    estop = False  # early stop
    start_time = time.time()

    print 'Optimization'

    try:
        for eidx in xrange(prm.max_epochs):
            n_samples = 0

            # Get new shuffled index for the training set.
            kf = utils.get_minibatches_idx(len(qi_train),
                                           prm.batch_size_train,
                                           shuffle=True)

            for _, train_index in kf:
                st = time.time()

                uidx += 1
                qi, qi_i, qi_lst, D_gt_id, D_gt_url = get_samples(
                    qi_train, dt_train, train_index, options)

                # share the current queries with the search engine.
                options['current_queries'] = qi_lst

                n_samples += len(qi)

                is_train = 1.
                out = f_grad_shared(qi_i, D_gt_id, is_train)

                cost = out.pop(0)
                cost_ent = out.pop(0)

                lr_t = f_update(prm.lrate)

                if np.isnan(cost) or np.isinf(cost):
                    print 'NaN detected'
                    return 1., 1., 1.

                print "options['reformulated_queries']", options[
                    'reformulated_queries']

                if np.mod(uidx, prm.dispFreq) == 0:

                    print '\n================================================================================'
                    print 'Epoch', eidx, 'Update', uidx, 'Cost', cost, 'LR_t', lr_t
                    print 'Time Minibatch Update: ' + str(time.time() - st)
                    print 'Input Query:       ', qi[0].replace('\n', '\\n')
                    print
                    print 'Target Docs:       ', str(D_gt_url[0])
                    print
                    print 'Input Query Vocab: ', utils.idx2text(
                        qi_i[0], options['vocabinv'])
                    for ii in range(prm.n_iterations):
                        prob = out.pop(0)
                        ans = out.pop(0)
                        metrics = out.pop(0)
                        bl = out.pop(0)
                        cost_bl = out.pop(0)
                        D_id = out.pop(0)

                        print "prob", prob
                        print "ans", ans
                        print "bl", bl
                        print "cost_bl", cost_bl
                        print "D_id", D_id

                        print("current_queries",
                              len(options['current_queries']),
                              options['current_queries'])
                        print
                        print 'Iteration', ii
                        print 'Baseline Value', bl.mean(), 'Cost', cost_bl
                        print '  '.join(prm.metrics_map.keys())
                        print metrics.mean(0)
                        print
                        # index of the example in the batch to display
                        i = min(7, len(D_id) - 1)
                        print 'Retrieved Docs:    ', str([
                            options['engine'].id_title_map[d_id]
                            for d_id in D_id[i]
                        ])
                        print
                        print 'Reformulated Query:', options[
                            'reformulated_queries'][ii][i]
                        print 'Current queries:', options['current_queries'][i]
                        print
                        print 'Query ANS:         ',
                        for kk, word in enumerate(
                                options['current_queries'][i][:ans.shape[1]]):
                            print "kk, word", kk, word
                            if word not in options['vocab'] and word != '':
                                word += '<unk>'
                            if ans[0, kk] == 1:
                                word = word.upper()
                            print str(word),
                        print
                        print
                        print 'prob[:,:,0] max/mean/min:', \
                            prob[:, :, 0].max(1).mean(), \
                            prob[:, :, 0].mean(), \
                            prob[:, :, 0].min(1).mean()
                        print 'prob[:,:,1] max/mean/min:', \
                            prob[:, :, 1].max(1).mean(), \
                            prob[:, :, 1].mean(), \
                            prob[:, :, 1].min(1).mean()
                    print '==================================================================================\n'

                if np.mod(uidx, validFreq) == 0 or uidx == 1:

                    kf_train = utils.get_minibatches_idx(
                        len(qi_train),
                        prm.batch_size_pred,
                        shuffle=True,
                        max_samples=train_size)
                    kf_valid = utils.get_minibatches_idx(
                        len(qi_valid),
                        prm.batch_size_pred,
                        shuffle=True,
                        max_samples=valid_size)
                    kf_test = utils.get_minibatches_idx(len(qi_test),
                                                        prm.batch_size_pred,
                                                        shuffle=True,
                                                        max_samples=test_size)

                    print '\nEvaluating - Training Set'
                    train_metrics = pred_error(f_pred, qi_train, dt_train,
                                               options, kf_train)

                    print '\nEvaluating - Validation Set'
                    valid_metrics = pred_error(f_pred, qi_valid, dt_valid,
                                               options, kf_valid)

                    print '\nEvaluating - Test Set'
                    test_metrics = pred_error(f_pred, qi_test, dt_test,
                                              options, kf_test)

                    his = [train_metrics, valid_metrics, test_metrics]
                    history_errs.append(his)
                    metric_idx = prm.metrics_map[prm.reward.upper()]
                    if (uidx == 0 or valid_metrics[-1, metric_idx] >=
                            np.array(history_errs)[:, 1, -1,
                                                   metric_idx].max()):
                        best_p = utils.unzip(tparams)
                        bad_counter = 0

                    print '====================================================================================================='
                    print '  '.join(prm.metrics_map.keys())
                    print
                    print 'Train:'
                    print train_metrics
                    print
                    print 'Valid:'
                    print valid_metrics
                    print
                    print 'Test:'
                    print test_metrics
                    print
                    print '====================================================================================================='
                    if (len(history_errs) > prm.patience
                            and valid_metrics[-1, metric_idx] <=
                            np.array(history_errs)[:-prm.patience, 1, -1,
                                                   metric_idx].max()):
                        bad_counter += 1
                        if bad_counter > prm.patience:
                            print 'Early Stop!'
                            estop = True
                            break

                if prm.saveto and np.mod(uidx, saveFreq) == 0:
                    print 'Saving...',

                    if best_p is not None:
                        params = best_p
                    else:
                        params = utils.unzip(tparams)
                    np.savez(prm.saveto, history_errs=history_errs, **params)

                    print 'Done'

            print 'Seen %d samples' % n_samples

            if estop:
                break

    except KeyboardInterrupt:
        print "Training interupted"
    return
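
The training loop above depends on utils.get_minibatches_idx to shuffle the
training examples and yield (minibatch_index, example_indices) pairs each
epoch, optionally capped by max_samples for the evaluation passes. The helper
itself is not shown in this snippet; the version below is only a plausible,
self-contained sketch of such a function, not the project's implementation:

import numpy as np

def get_minibatches_idx(n, minibatch_size, shuffle=False, max_samples=None):
    """Split range(n) into minibatches, optionally shuffled and capped."""
    idx_list = np.arange(n, dtype='int64')
    if shuffle:
        np.random.shuffle(idx_list)
    if max_samples is not None:
        idx_list = idx_list[:max_samples]

    minibatches = []
    for start in range(0, len(idx_list), minibatch_size):
        minibatches.append(idx_list[start:start + minibatch_size])

    # Pair each minibatch with its position, matching `for _, idx in kf`.
    return zip(range(len(minibatches)), minibatches)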
Esempio n. 38
0
def translate_model(exit_event, queue, rqueue, pid, i2w_trg, model, options, k,
                    normalize, unk_replace):

    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
    trng = RandomStreams(1234)
    # use_noise = theano.shared(numpy.float32(0.))

    # allocate model parameters
    params = init_params(options)

    # load model parameters and set theano shared variables
    params = load_params(model, params)
    tparams = init_tparams(params)

    # word index
    f_init, f_next = build_sampler(tparams, options, trng)

    def _seq2words(seq, word_idict_trg):
        words = []
        for w in seq:
            if w == 0:
                break
            words.append(word_idict_trg[w])
        return words

    def _translate(seq):
        seq = numpy.array(seq).reshape([len(seq), 1])

        # sample given an input sequence and obtain scores
        samples, alignments, scores = gen_sample(tparams,
                                                 f_init,
                                                 f_next,
                                                 seq,
                                                 options,
                                                 trng=trng,
                                                 k=k,
                                                 maxlen=100,
                                                 stochastic=False,
                                                 argmax=False)

        # normalize scores according to sequence lengths
        if normalize:
            lengths = numpy.array([len(s) for s in samples])
            scores = scores / lengths
        sidx = numpy.argmin(scores)
        return samples[sidx], alignments[sidx]

    def _replace_unk(trans_words, src_words, alignment):
        for idx, word in enumerate(trans_words):
            if word == UNK_TOKEN:
                # pick a source word
                # with which the target word is strongly aligned
                # except for the EOS token
                trans_words[idx] = src_words[alignment[idx][:-1].argmax()]

        return trans_words

    while (not exit_event.is_set()) and (not queue.empty()):
        req = queue.get()
        if req is None:
            break

        # idx: sentence index
        # x: a sequence of word indices
        # src_words: original source sentence
        idx, x, src_words = req[0], req[1], req[2]
        seq, alignment = _translate(x)

        assert len(seq) == len(alignment)

        # indices to tokens
        trans_words = _seq2words(seq, i2w_trg)

        if unk_replace:
            # unknown word replacement
            trans_words = _replace_unk(trans_words, src_words, alignment)

        # list of words to a single string
        trans_words = ' '.join(trans_words)

        # put the result
        rqueue.put((idx, trans_words))

    # write to release resources if needed

    return
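
translate_model picks a beam-search hypothesis by dividing each score by the
hypothesis length (when normalize is set) and taking the argmin, since the
scores are negative log-probabilities, and then optionally rewrites UNK
tokens using the attention alignments. A toy numpy sketch of those two steps
(UNK_TOKEN and the alignment layout are assumptions made for illustration):

import numpy as np

UNK_TOKEN = 'UNK'

def pick_hypothesis(samples, scores, normalize=True):
    """Return the hypothesis with the lowest (length-normalized) score."""
    scores = np.asarray(scores, dtype='float64')
    if normalize:
        lengths = np.array([len(s) for s in samples], dtype='float64')
        scores = scores / lengths
    return samples[np.argmin(scores)]

def replace_unk(trans_words, src_words, alignment):
    """Replace each UNK with its most-attended source word (EOS excluded)."""
    out = list(trans_words)
    for idx, word in enumerate(out):
        if word == UNK_TOKEN:
            out[idx] = src_words[np.argmax(alignment[idx][:-1])]
    return out

# Length normalization can change the winner: 4.8/4 < 4.0/3.
print pick_hypothesis([[12, 5, 0], [12, 7, 9, 0]], [4.0, 4.8])

# The second target word is UNK and attends mostly to 'Berlin'.
alignment = np.array([[0.7, 0.2, 0.1], [0.1, 0.8, 0.1]])
print replace_unk(['visit', 'UNK'], ['visit', 'Berlin', '</s>'], alignment)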
def train(dim_word_desc=400,# word vector dimensionality
          dim_word_q=400,
          dim_word_ans=600,
          dim_proj=300,
          dim=400,# the number of LSTM units
          encoder_desc='lstm',
          encoder_desc_word='lstm',
          encoder_desc_sent='lstm',
          use_dq_sims=False,
          eyem=None,
          learn_h0=False,
          use_desc_skip_c_g=False,
          debug=False,
          encoder_q='lstm',
          patience=10,
          max_epochs=5000,
          dispFreq=100,
          decay_c=0.,
          alpha_c=0.,
          clip_c=-1.,
          lrate=0.01,
          n_words_q=49145,
          n_words_desc=115425,
          n_words_ans=409,
          pkl_train_files=None,
          pkl_valid_files=None,
          maxlen=2000, # maximum length of the description
          optimizer='rmsprop',
          batch_size=2,
          vocab=None,
          valid_batch_size=16,
          use_elu_g=False,
          saveto='model.npz',
          model_dir=None,
          ms_nlayers=3,
          validFreq=1000,
          saveFreq=1000, # save the parameters after every saveFreq updates
          datasets=[None],
          truncate=400,
          momentum=0.9,
          use_bidir=False,
          cost_mask=None,
          valid_datasets=['/u/yyu/stor/caglar/rc-data/cnn/cnn_test_data.h5',
                          '/u/yyu/stor/caglar/rc-data/cnn/cnn_valid_data.h5'],
          dropout_rate=0.5,
          use_dropout=True,
          reload_=True,
          **opt_ds):

    ensure_dir_exists(model_dir)
    mpath = os.path.join(model_dir, saveto)
    mpath_best = os.path.join(model_dir, prfx("best", saveto))
    mpath_last = os.path.join(model_dir, prfx("last", saveto))
    mpath_stats = os.path.join(model_dir, prfx("stats", saveto))

    # Model options
    model_options = locals().copy()
    model_options['use_sent_reps'] = opt_ds['use_sent_reps']
    stats = defaultdict(list)

    del model_options['eyem']
    del model_options['cost_mask']

    if cost_mask is not None:
        cost_mask = sharedX(cost_mask)

    # reload options and parameters
    if reload_:
        print "Reloading the model."
        if os.path.exists(mpath_best):
            print "Reloading the best model from %s." % mpath_best
            with open('%s.pkl' % mpath_best, 'rb') as f:
                model_options = pkl.load(f)
            params = init_params(model_options)
            params = load_params(mpath_best, params)
        elif os.path.exists(mpath):
            print "Reloading the model from %s." % mpath
            with open('%s.pkl' % mpath, 'rb') as f:
                model_options = pkl.load(f)
            params = init_params(model_options)
            params = load_params(mpath, params)
        else:
            raise IOError("Could not find a saved model at %s or %s." %
                          (mpath_best, mpath))
    else:
        print "Couldn't reload the models initializing from scratch."
        params = init_params(model_options)

    if datasets[0]:
        print "Short dataset", datasets[0]

    print 'Loading data'
    print 'Building model'
    if pkl_train_files is None or pkl_valid_files is None:
        train, valid, test = load_data(path=datasets[0],
                                       valid_path=valid_datasets[0],
                                       test_path=valid_datasets[1],
                                       batch_size=batch_size,
                                       **opt_ds)
    else:
        train, valid, test = load_pkl_data(train_file_paths=pkl_train_files,
                                           valid_file_paths=pkl_valid_files,
                                           batch_size=batch_size,
                                           vocab=vocab,
                                           eyem=eyem,
                                           **opt_ds)

    tparams = init_tparams(params)
    trng, use_noise, inps_d, \
                     opt_ret, \
                     cost, errors, ent_errors, ent_derrors, probs = \
                        build_model(tparams,
                                    model_options,
                                    prepare_data if not opt_ds['use_sent_reps'] \
                                            else prepare_data_sents,
                                    valid,
                                    cost_mask=cost_mask)

    alphas = opt_ret['dec_alphas']

    if opt_ds['use_sent_reps']:
        inps = [inps_d["desc"], \
                inps_d["word_mask"], \
                inps_d["q"], \
                inps_d['q_mask'], \
                inps_d['ans'], \
                inps_d['wlen'],
                inps_d['slen'], inps_d['qlen'],\
                inps_d['ent_mask']
                ]
    else:
        inps = [inps_d["desc"], \
                inps_d["word_mask"], \
                inps_d["q"], \
                inps_d['q_mask'], \
                inps_d['ans'], \
                inps_d['wlen'], \
                inps_d['qlen'], \
                inps_d['ent_mask']]

    outs = [cost, errors, probs, alphas]
    if ent_errors:
        outs += [ent_errors]

    if ent_derrors:
        outs += [ent_derrors]

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, outs, profile=profile)
    print 'Done'

    # Apply weight decay on the feed-forward connections
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.

        for kk, vv in tparams.iteritems():
            if "logit" in kk or "ff" in kk:
                weight_decay += (vv ** 2).sum()

        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer
    print 'Computing gradient...',
    grads = safe_grad(cost, itemlist(tparams))
    print 'Done'

    # Gradient clipping:
    if clip_c > 0.:
        g2 = get_norms(grads)
        for p, g in grads.iteritems():
            grads[p] = tensor.switch(g2 > (clip_c**2),
                                     (g / tensor.sqrt(g2 + 1e-8)) * clip_c,
                                     g)
    inps.pop()
    if optimizer.lower() == "adasecant":
        learning_rule = Adasecant(delta_clip=25.0,
                                  use_adagrad=True,
                                  grad_clip=0.25,
                                  gamma_clip=0.)
    elif optimizer.lower() == "rmsprop":
        learning_rule = RMSPropMomentum(init_momentum=momentum)
    elif optimizer.lower() == "adam":
        learning_rule = Adam()
    elif optimizer.lower() == "adadelta":
        learning_rule = AdaDelta()

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    # NOTE: learning_rule is reset to None here, so the learning-rule objects
    # selected above are ignored and the plain optimizer functions (adam,
    # rmsprop, ...) are always used through eval(optimizer) below.
    learning_rule = None
    if learning_rule:
        f_grad_shared, f_update = learning_rule.get_funcs(learning_rate=lr,
                                                          grads=grads,
                                                          inp=inps,
                                                          cost=cost,
                                                          errors=errors)
    else:
        f_grad_shared, f_update = eval(optimizer)(lr,
                                                  tparams,
                                                  grads,
                                                  inps,
                                                  cost,
                                                  errors)

    print 'Done'
    print 'Optimization'
    history_errs = []
    # reload history
    if reload_ and os.path.exists(mpath):
        history_errs = list(numpy.load(mpath)['history_errs'])

    best_p = None
    bad_count = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size

    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    best_found = False
    uidx = 0
    estop = False

    train_cost_ave, train_err_ave, \
            train_gnorm_ave = reset_train_vals()

    for eidx in xrange(max_epochs):
        n_samples = 0

        if train.done:
            train.reset()

        for d_, q_, a, em in train:
            n_samples += len(a)
            uidx += 1
            use_noise.set_value(1.)

            if opt_ds['use_sent_reps']:
                # To mask the description and the question.
                d, d_mask, q, q_mask, dlen, slen, qlen = prepare_data_sents(d_,
                                                                            q_)

                if d is None:
                    print 'Minibatch with zero samples under length ', maxlen
                    uidx -= 1
                    continue

                ud_start = time.time()
                cost, errors, gnorm, pnorm = f_grad_shared(d,
                                                           d_mask,
                                                           q,
                                                           q_mask,
                                                           a,
                                                           dlen,
                                                           slen,
                                                           qlen)
            else:
                d, d_mask, q, q_mask, dlen, qlen = prepare_data(d_, q_)

                if d is None:
                    print 'Minibatch with zero samples under length ', maxlen
                    uidx -= 1
                    continue

                ud_start = time.time()
                cost, errors, gnorm, pnorm = f_grad_shared(d, d_mask,
                                                           q, q_mask,
                                                           a,
                                                           dlen,
                                                           qlen)

            upnorm = f_update(lrate)
            ud = time.time() - ud_start

            # Collect the running ave train stats.
            train_cost_ave = running_ave(train_cost_ave,
                                         cost)
            train_err_ave = running_ave(train_err_ave,
                                        errors)
            train_gnorm_ave = running_ave(train_gnorm_ave,
                                          gnorm)

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                import ipdb; ipdb.set_trace()

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, ' Update ', uidx, \
                        ' Cost ', cost, ' UD ', ud, \
                        ' UpNorm ', upnorm[0].tolist(), \
                        ' GNorm ', gnorm, \
                        ' Pnorm ', pnorm, 'Terrors ', errors

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                # Always snapshot the current parameters; additionally write
                # the best checkpoint when a new best has been found.
                params = unzip(tparams)
                if best_p is not None and best_found:
                    numpy.savez(mpath_best, history_errs=history_errs, **best_p)
                    pkl.dump(model_options, open('%s.pkl' % mpath_best, 'wb'))

                numpy.savez(mpath, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % mpath, 'wb'))
                pkl.dump(stats, open("%s.pkl" % mpath_stats, 'wb'))

                print 'Done'
                print_param_norms(tparams)

            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                if valid.done:
                    valid.reset()

                valid_costs, valid_errs, valid_probs, \
                        valid_alphas, error_ent, error_dent = eval_model(f_log_probs,
                                                  prepare_data if not opt_ds['use_sent_reps'] \
                                                    else prepare_data_sents,
                                                  model_options,
                                                  valid,
                                                  use_sent_rep=opt_ds['use_sent_reps'])

                valid_alphas_ = numpy.concatenate([va.argmax(0) for va  in valid_alphas.tolist()], axis=0)
                valid_err = valid_errs.mean()
                valid_cost = valid_costs.mean()
                valid_alpha_ent = -negentropy(valid_alphas)

                mean_valid_alphas = valid_alphas_.mean()
                std_valid_alphas = valid_alphas_.std()

                mean_valid_probs = valid_probs.argmax(1).mean()
                std_valid_probs = valid_probs.argmax(1).std()

                history_errs.append([valid_cost, valid_err])

                stats['train_err_ave'].append(train_err_ave)
                stats['train_cost_ave'].append(train_cost_ave)
                stats['train_gnorm_ave'].append(train_gnorm_ave)

                stats['valid_errs'].append(valid_err)
                stats['valid_costs'].append(valid_cost)
                stats['valid_err_ent'].append(error_ent)
                stats['valid_err_desc_ent'].append(error_dent)

                stats['valid_alphas_mean'].append(mean_valid_alphas)
                stats['valid_alphas_std'].append(std_valid_alphas)
                stats['valid_alphas_ent'].append(valid_alpha_ent)

                stats['valid_probs_mean'].append(mean_valid_probs)
                stats['valid_probs_std'].append(std_valid_probs)

                if uidx == 0 or valid_err <= numpy.array(history_errs)[:, 1].min():
                    best_p = unzip(tparams)
                    bad_count = 0
                    best_found = True
                else:
                    best_found = False

                if numpy.isnan(valid_err):
                    import ipdb; ipdb.set_trace()


                print "============================"
                print '\t>>>Valid error: ', valid_err, \
                        ' Valid cost: ', valid_cost
                print '\t>>>Valid pred mean: ', mean_valid_probs, \
                        ' Valid pred std: ', std_valid_probs
                print '\t>>>Valid alphas mean: ', mean_valid_alphas, \
                        ' Valid alphas std: ', std_valid_alphas, \
                        ' Valid alpha negent: ', valid_alpha_ent, \
                        ' Valid error ent: ', error_ent, \
                        ' Valid error desc ent: ', error_dent

                print "============================"
                print "Running average train stats "
                print '\t>>>Train error: ', train_err_ave, \
                        ' Train cost: ', train_cost_ave, \
                        ' Train grad norm: ', train_gnorm_ave
                print "============================"


                train_cost_ave, train_err_ave, \
                    train_gnorm_ave = reset_train_vals()


        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid.reset()
    valid_cost, valid_error, valid_probs, \
            valid_alphas, error_ent, error_dent = eval_model(f_log_probs,
                                      prepare_data if not opt_ds['use_sent_reps'] \
                                           else prepare_data_sents,
                                      model_options, valid,
                                      use_sent_rep=opt_ds['use_sent_reps'])

    print " Final eval resuts: "
    print 'Valid error: ', valid_error.mean()
    print 'Valid cost: ', valid_cost.mean()
    print '\t>>>Valid pred mean: ', valid_probs.mean(), \
            ' Valid pred std: ', valid_probs.std(), \
            ' Valid error ent: ', error_ent

    # Fall back to the current parameters if no best checkpoint was recorded.
    params = copy.copy(best_p) if best_p is not None else unzip(tparams)

    numpy.savez(mpath_last,
                zipped_params=best_p,
                history_errs=history_errs,
                **params)

    return valid_error.mean(), valid_cost.mean()
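
The clip_c block in train() above rescales every gradient by
clip_c / sqrt(g2) whenever the squared global gradient norm g2 exceeds
clip_c**2, i.e. the usual clip-by-global-norm rule. Since get_norms and the
gradient container used there are not shown in this snippet, the numpy
sketch below is only an illustration of the same rule:

import numpy as np

def clip_by_global_norm(grads, clip_c, eps=1e-8):
    """Rescale a list of gradient arrays so their global norm is <= clip_c."""
    g2 = sum(float((g ** 2).sum()) for g in grads)  # squared global norm
    if clip_c > 0. and g2 > clip_c ** 2:
        scale = clip_c / np.sqrt(g2 + eps)
        grads = [g * scale for g in grads]
    return grads

# A gradient with global norm 5 is clipped down to norm 1.
grads = [np.array([3.0, 4.0])]
print clip_by_global_norm(grads, clip_c=1.0)  # approximately [0.6, 0.8]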