Example #1
def adadelta(lr, tparams, grads, inp, cost):
    zipped_grads = [theano.shared(p.get_value() * np.float32(0.),
                                  name='%s_grad' % k)
                    for k, p in six.iteritems(tparams)]
    running_up2 = [theano.shared(p.get_value() * np.float32(0.),
                                 name='%s_rup2' % k)
                   for k, p in six.iteritems(tparams)]
    running_grads2 = [theano.shared(p.get_value() * np.float32(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in six.iteritems(tparams)]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp,
                                    cost,
                                    updates=zgup + rg2up,
                                    name='adadelta')

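    # AdaDelta step: rescale the negated gradient by
    # RMS[past updates] / RMS[past gradients] (decay 0.95, epsilon 1e-6).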
    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)
             ]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]

    f_update = theano.function([lr],
                               [],
                               updates=ru2up + param_up,
                               on_unused_input='ignore')

    return f_grad_shared, f_update
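
A minimal usage sketch (illustrative only; it assumes the adadelta above, together with its theano/np/six/tensor imports, is in the same module, and defines a stand-in itemlist helper): f_grad_shared runs the forward/backward pass and caches the gradients in shared variables, while f_update applies the AdaDelta step and ignores the learning-rate input (hence on_unused_input='ignore').

from collections import OrderedDict

import numpy as np
import six
import theano
import theano.tensor as tensor


def itemlist(tparams):
    # stand-in for the project's itemlist(): the shared parameter variables
    return [vv for kk, vv in six.iteritems(tparams)]


# tiny linear model: predict y from x with parameters W and b
tparams = OrderedDict()
tparams['W'] = theano.shared(np.zeros((3, 1), dtype='float32'), name='W')
tparams['b'] = theano.shared(np.zeros((1,), dtype='float32'), name='b')

x = tensor.fmatrix('x')
y = tensor.fmatrix('y')
cost = ((x.dot(tparams['W']) + tparams['b'] - y) ** 2).mean()

lr = tensor.scalar(name='lr')
grads = tensor.grad(cost, wrt=itemlist(tparams))
f_grad_shared, f_update = adadelta(lr, tparams, grads, [x, y], cost)

x_val = np.random.randn(8, 3).astype('float32')
y_val = np.random.randn(8, 1).astype('float32')
for _ in range(10):
    cost_val = f_grad_shared(x_val, y_val)  # compute cost, cache grads
    f_update(0.)                            # lr is unused by AdaDelta
print(cost_val)
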
Example #2
def adadelta(lr, tparams, grads, inp, cost):
    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_up2 = [theano.shared(p.get_value() * numpy.float32(0.),
                                 name='%s_rup2' % k)
                   for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp, cost, updates=zgup+rg2up)

    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads,
                                     running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(itemlist(tparams), updir)]

    f_update = theano.function([lr], [], updates=ru2up+param_up,
                               on_unused_input='ignore')

    return f_grad_shared, f_update
Example #3
def rmsprop(lr, tparams, grads, inp, cost):
    zipped_grads = [theano.shared(p.get_value() * numpy.float32(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.iteritems()]
    running_grads = [theano.shared(p.get_value() * numpy.float32(0.),
                                   name='%s_rgrad' % k)
                     for k, p in tparams.iteritems()]
    running_grads2 = [theano.shared(p.get_value() * numpy.float32(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp, cost, updates=zgup + rgup + rg2up,
                                    profile=profile)

    updir = [theano.shared(p.get_value() * numpy.float32(0.),
                           name='%s_updir' % k)
             for k, p in tparams.iteritems()]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
                                            running_grads2)]
    param_up = [(p, p + udn[1])
                for p, udn in zip(itemlist(tparams), updir_new)]
    f_update = theano.function([lr], [], updates=updir_new + param_up,
                               on_unused_input='ignore', profile=profile)

    return f_grad_shared, f_update
Example #4
def rmsprop(lr, tparams, grads, inp, cost):
    zipped_grads = [theano.shared(p.get_value() * np.float32(0.),
                                  name='%s_grad' % k)
                    for k, p in six.iteritems(tparams)]
    running_grads = [theano.shared(p.get_value() * np.float32(0.),
                                   name='%s_rgrad' % k)
                     for k, p in six.iteritems(tparams)]
    running_grads2 = [theano.shared(p.get_value() * np.float32(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in six.iteritems(tparams)]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp,
                                    cost,
                                    updates=zgup + rgup + rg2up,
                                    name='rmsprop')

    updir = [theano.shared(p.get_value() * np.float32(0.),
                           name='%s_updir' % k)
             for k, p in six.iteritems(tparams)]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
                                            running_grads2)]
    param_up = [(p, p + udn[1]) for p, udn in zip(
        itemlist(tparams), updir_new)]
    f_update = theano.function([lr],
                               [],
                               updates=updir_new + param_up,
                               on_unused_input='ignore')

    return f_grad_shared, f_update
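
The denominator above combines rg (a running mean of g) and rg2 (a running mean of g ** 2), so rg2 - rg ** 2 is a running estimate of the per-parameter gradient variance; the update is therefore roughly a 0.9-momentum-smoothed step of -1e-4 * g / std(g). A plain-NumPy sketch of that moment bookkeeping (illustrative names only):

import numpy as np

rng = np.random.RandomState(0)
rg = np.zeros(5, dtype='float32')    # running mean of g
rg2 = np.zeros(5, dtype='float32')   # running mean of g ** 2

for _ in range(200):
    g = rng.randn(5).astype('float32')   # stand-in for a fresh gradient
    rg = 0.95 * rg + 0.05 * g
    rg2 = 0.95 * rg2 + 0.05 * g ** 2

# rg2 - rg ** 2 approximates the gradient variance, so this prints values
# close to 1 for the unit-variance "gradients" drawn above.
print(np.sqrt(rg2 - rg ** 2 + 1e-4))
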
Example #5
def sgd(lr, tparams, grads, x, mask, y, cost):
    gshared = [theano.shared(p.get_value() * 0.0, name="%s_grad" % k) for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    f_grad_shared = theano.function([x, mask, y], cost, updates=gsup, profile=profile)

    pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
    f_update = theano.function([lr], [], updates=pup, profile=profile)

    return f_grad_shared, f_update
Example #6
def sgd(lr, tparams, grads, x, mask, y, cost):
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad'%k) for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    f_grad_shared = theano.function([x, mask, y], cost, updates=gsup, profile=profile)

    pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
    f_update = theano.function([lr], [], updates=pup, profile=profile)

    return f_grad_shared, f_update
Example #7
def sgd(lr, tparams, grads, inp, cost):
    gshared = [
        theano.shared(p.get_value() * numpy.float32(0.), name='%s_grad' % k)
        for k, p in tparams.iteritems()
    ]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    f_grad_shared = theano.function(inp, cost, updates=gsup, profile=False)

    pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
    f_update = theano.function([lr], [], updates=pup, profile=False)

    return f_grad_shared, f_update
Example #8
def sgd(lr, tparams, grads, inp, cost):
    gshared = [
        theano.shared(p.get_value() * 0., name='%s_grad' % k)
        for k, p in six.iteritems(tparams)
    ]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    f_grad_shared = theano.function(inp, cost, updates=gsup, name='sgd')

    pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
    f_update = theano.function([lr], [], updates=pup)

    return f_grad_shared, f_update, []
Example #9
def sgd(lr, tparams, grads, inp, cost, use_noise, **kwargs):
    print 'Using SGD'
    gshared = [theano.shared(p.get_value() * floatx(0.), name='%s_grad' % k)
               for k, p in tparams.iteritems()]
    gsup = [(gs, gs + g) for gs, g in zip(gshared, grads)]

    f_grad_shared = theano.function(inp, cost, givens={use_noise: numpy.float32(1.)},
                                    on_unused_input='warn', updates=gsup, allow_input_downcast=True)

    pup = [(p, p - lr * (g))
           for p, g in zip(itemlist(tparams), gshared)]
    f_update = theano.function([lr], [], updates=pup, allow_input_downcast=True)

    return f_grad_shared, f_update, gshared
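
Unlike the earlier sgd variants, this one updates with (gs, gs + g), so repeated calls to f_grad_shared accumulate gradients into the returned gshared list instead of overwriting them. A tiny self-contained Theano sketch of that accumulate-versus-overwrite distinction (toy names, not from this code):

import numpy
import theano
import theano.tensor as tensor

acc = theano.shared(numpy.float32(0.), name='acc')
g = tensor.fscalar('g')

f_overwrite = theano.function([g], [], updates=[(acc, g)])
f_accumulate = theano.function([g], [], updates=[(acc, acc + g)])

f_overwrite(numpy.float32(2.))
f_overwrite(numpy.float32(3.))
print(acc.get_value())   # 3.0 -- only the latest value is kept

acc.set_value(numpy.float32(0.))
f_accumulate(numpy.float32(2.))
f_accumulate(numpy.float32(3.))
print(acc.get_value())   # 5.0 -- values are summed across calls
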
Example #10
def adadelta(lr, tparams, grads, inp, cost, errors):
    gnorm = get_norms(grads)
    pnorm = get_norms(tparams.values())

    zipped_grads = [
        sharedX(p.get_value() * numpy.float32(0.), name='%s_grad' % k)
        for k, p in tparams.iteritems()
    ]

    running_up2 = [
        sharedX(p.get_value() * numpy.float32(0.), name='%s_rup2' % k)
        for k, p in tparams.iteritems()
    ]

    running_grads2 = [
        sharedX(p.get_value() * numpy.float32(0.), name='%s_rgrad2' % k)
        for k, p in tparams.iteritems()
    ]

    zgup = [(zg, g) for zg, g in \
            zip(zipped_grads, grads)]

    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) \
                for rg2, g in \
                zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp,
                                    [cost, errors, gnorm, pnorm], \
                                    updates=zgup + rg2up)

    updir = [-tensor.sqrt(ru2 + 1e-6) / \
                tensor.sqrt(rg2 + 1e-6) * zg for zg, ru2, rg2 \
                in zip(zipped_grads, running_up2, running_grads2)]

    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) \
               for ru2, ud in zip(running_up2, updir)]

    param_up = [(p, p + ud) for p, ud in \
                   zip(itemlist(tparams), updir)]

    upnorm = get_norms(updir)
    f_update = theano.function([lr], [upnorm],
                               updates=ru2up + param_up,
                               on_unused_input='ignore',
                               profile=profile)

    return f_grad_shared, f_update
Example #11
def sgd(lr, tparams, grads, x, mask, y, cost, errors):
    gshared = [sharedX(p.get_value() * 0.,
                       name='%s_grad'%k) for k, p \
                               in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    pnorm = get_norms(tparams.values())
    gnorm = get_norms(grads)

    f_grad_shared = theano.function([x, mask, y],
                                    [cost, errors, gnorm, pnorm],
                                    updates=gsup, profile=profile)

    pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
    upnorm = lr * gnorm
    f_update = theano.function([lr], [upnorm], updates=pup, profile=profile)

    return f_grad_shared, f_update
Example #12
def sgd(lr, tparams, grads, x, mask, y, cost, errors):
    gshared = [sharedX(p.get_value() * 0.,
                       name='%s_grad'%k) for k, p \
                               in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    pnorm = get_norms(tparams.values())
    gnorm = get_norms(grads)

    f_grad_shared = theano.function([x, mask, y], [cost, errors, gnorm, pnorm],
                                    updates=gsup,
                                    profile=profile)

    pup = [(p, p - lr * g) for p, g in zip(itemlist(tparams), gshared)]
    upnorm = lr * gnorm
    f_update = theano.function([lr], [upnorm], updates=pup, profile=profile)

    return f_grad_shared, f_update
Example #13
def adadelta(lr, tparams, grads, inp, cost, errors):
    gnorm = get_norms(grads)
    pnorm = get_norms(tparams.values())

    zipped_grads = [sharedX(p.get_value() * numpy.float32(0.),
                                  name='%s_grad'%k)
                    for k, p in tparams.iteritems()]

    running_up2 = [sharedX(p.get_value() * numpy.float32(0.),
                                 name='%s_rup2'%k)
                   for k, p in tparams.iteritems()]

    running_grads2 = [sharedX(p.get_value() * numpy.float32(0.),
                                    name='%s_rgrad2'%k)
                      for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in \
            zip(zipped_grads, grads)]

    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) \
                for rg2, g in \
                zip(running_grads2, grads)]

    f_grad_shared = theano.function(inp,
                                    [cost, errors, gnorm, pnorm], \
                                    updates=zgup + rg2up)

    updir = [-tensor.sqrt(ru2 + 1e-6) / \
                tensor.sqrt(rg2 + 1e-6) * zg for zg, ru2, rg2 \
                in zip(zipped_grads, running_up2, running_grads2)]

    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2)) \
               for ru2, ud in zip(running_up2, updir)]

    param_up = [(p, p + ud) for p, ud in \
                   zip(itemlist(tparams), updir)]

    upnorm = get_norms(updir)
    f_update = theano.function([lr], [upnorm],
                               updates=ru2up+param_up,
                               on_unused_input='ignore',
                               profile=profile)

    return f_grad_shared, f_update
Example #14
def rmsprop(lr, tparams, grads, inp, cost, errors):
    zipped_grads = [sharedX(p.get_value() * numpy.float32(0.), \
            name='%s_grad'%k) for k, p in tparams.iteritems()]

    running_grads = [sharedX(p.get_value() * numpy.float32(0.), \
            name='%s_rgrad'%k) for k, p in tparams.iteritems()]

    running_grads2 = [sharedX(p.get_value() * numpy.float32(0.), \
            name='%s_rgrad2'%k) for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) for rg2, g \
            in zip(running_grads2, grads)]

    pnorm = get_norms(tparams.values())
    gnorm = get_norms(grads)

    f_grad_shared = theano.function(inp,
                                    [cost, errors, gnorm, pnorm],
                                    updates=zgup+rgup+rg2up,
                                    profile=profile)

    updir = [sharedX(p.get_value() * numpy.float32(0.),
                     name='%s_updir'%k) \
                     for k, p in tparams.iteritems()]

    # clamp the denominator at 1e-8 so the division stays well-defined
    updir_new = [(ud, 0.9 * ud - lr * zg /
                  tensor.maximum(tensor.sqrt(rg2 - rg ** 2 + 1e-8), 1e-8))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads,
                                            running_grads, running_grads2)]

    param_up = [(p, p + udn[1]) for p, udn in \
                zip(itemlist(tparams), updir_new)]

    upnorm = get_norms(updir_new)
    f_update = theano.function([lr],
                               [upnorm],
                               updates=updir_new+param_up,
                               on_unused_input='ignore',
                               profile=profile)

    return f_grad_shared, f_update
Example #15
def rmsprop(lr, tparams, grads, inp, cost, errors):
    zipped_grads = [sharedX(p.get_value() * numpy.float32(0.), \
            name='%s_grad'%k) for k, p in tparams.iteritems()]

    running_grads = [sharedX(p.get_value() * numpy.float32(0.), \
            name='%s_rgrad'%k) for k, p in tparams.iteritems()]

    running_grads2 = [sharedX(p.get_value() * numpy.float32(0.), \
            name='%s_rgrad2'%k) for k, p in tparams.iteritems()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) for rg2, g \
            in zip(running_grads2, grads)]

    pnorm = get_norms(tparams.values())
    gnorm = get_norms(grads)

    f_grad_shared = theano.function(inp, [cost, errors, gnorm, pnorm],
                                    updates=zgup + rgup + rg2up,
                                    profile=profile)

    updir = [sharedX(p.get_value() * numpy.float32(0.),
                     name='%s_updir'%k) \
                     for k, p in tparams.iteritems()]

    # clamp the denominator at 1e-8 so the division stays well-defined
    updir_new = [(ud, 0.9 * ud - lr * zg /
                  tensor.maximum(tensor.sqrt(rg2 - rg ** 2 + 1e-8), 1e-8))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads,
                                            running_grads, running_grads2)]

    param_up = [(p, p + udn[1]) for p, udn in \
                zip(itemlist(tparams), updir_new)]

    upnorm = get_norms(updir_new)
    f_update = theano.function([lr], [upnorm],
                               updates=updir_new + param_up,
                               on_unused_input='ignore',
                               profile=profile)

    return f_grad_shared, f_update
Example #16
def train(dim_word=100,  # word vector dimensionality
          dim=1000,  # the number of GRU units
          encoder='gru',
          patience=10,  # early stopping patience
          max_epochs=5000,
          finish_after=10000000,  # finish after this many updates
          dispFreq=100,
          decay_c=0.,  # L2 weight decay penalty
          lrate=0.01,
          n_words=100000,  # vocabulary size
          vocab_dim=100000,  # Size of M, C
          memory_dim=1000,  # Dimension of memory
          memory_size=15,  # n_back to attend
          maxlen=100,  # maximum length of the description
          optimizer='rmsprop',
          batch_size=16,
          valid_batch_size=16,
          saveto='model.npz',
          validFreq=1000,
          saveFreq=1000,  # save the parameters after every saveFreq updates
          sampleFreq=100,  # generate some samples after every sampleFreq
          dataset='/data/lisatmp3/chokyun/wikipedia/extracted/wiki.tok.txt.gz',
          valid_dataset='../data/dev/newstest2011.en.tok',
          dictionary='/data/lisatmp3/chokyun/wikipedia/extracted/'
          'wiki.tok.txt.gz.pkl',
          use_dropout=False,
          reload_=False):

    # Model options
    model_options = locals().copy()

    # Theano random stream
    trng = RandomStreams(1234)

    # load dictionary
    with open(dictionary, 'rb') as f:
        worddicts = pkl.load(f)

    # invert dictionary
    worddicts_r = dict()
    for kk, vv in worddicts.iteritems():
        worddicts_r[vv] = kk

    # reload options
    if reload_ and os.path.exists(saveto):
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print 'Loading data'
    train = TextIterator(dataset,
                         dictionary,
                         n_words_source=n_words,
                         batch_size=batch_size,
                         maxlen=maxlen)
    valid = TextIterator(valid_dataset,
                         dictionary,
                         n_words_source=n_words,
                         batch_size=valid_batch_size,
                         maxlen=maxlen)

    # initialize RMN
    rmn_ = RMN(model_options)

    print 'Building model'
    rmn_.init_params()

    # reload parameters
    if reload_ and os.path.exists(saveto):
        rmn_.load_params(saveto)

    # create shared variables for parameters
    tparams = rmn_.tparams

    # build the symbolic computational graph
    use_noise, x, x_mask, opt_ret, cost = rmn_.build_model()
    inps = [x, x_mask]

    print 'Building sampler'
    f_next = rmn_.build_sampler(trng)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    optimizer = getattr(importlib.import_module('optimizer'), optimizer)
    f_grad_shared, f_update = optimizer(lr, tparams, grads, inps, cost)
    print 'Done'

    print 'Optimization'

    history_errs = []
    uidx = 0
    estop = False
    bad_counter = 0

    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = list(numpy.load(saveto)['history_errs'])
        uidx = numpy.load(saveto)['uidx']
    best_p = None

    if validFreq == -1:
        validFreq = len(train[0])/batch_size
    if saveFreq == -1:
        saveFreq = len(train[0])/batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0])/batch_size

    # Training loop
    for eidx in xrange(max_epochs):
        n_samples = 0

        for x in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            # pad batch and create mask
            x, x_mask = prepare_data(x, maxlen=maxlen, n_words=n_words)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask)

            # do the update on parameters
            f_update(lrate)

            ud = time.time() - ud_start

            # check for bad numbers
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            # save the best model so far
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',

                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, uidx=uidx,
                            **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

            # generate some samples with the model and display them
            if numpy.mod(uidx, sampleFreq) == 0:
                # FIXME: random selection?
                for jj in xrange(5):
                    sample, score = rmn_.gen_sample(tparams, f_next,
                                                    trng=trng, maxlen=30,
                                                    argmax=False)
                    print 'Sample ', jj, ': ',
                    ss = sample
                    for vv in ss:
                        if vv == 0:
                            break
                        if vv in worddicts_r:
                            print worddicts_r[vv],
                        else:
                            print 'UNK',
                    print

            # validate model on validation set and early stop if necessary
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs = rmn_.pred_probs(valid, f_log_probs, prepare_data)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= \
                        numpy.array(history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if numpy.isnan(valid_err):
                    ipdb.set_trace()

                print 'Valid ', valid_err

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid_err = rmn_.pred_probs(f_log_probs, prepare_data,
                                model_options, valid).mean()

    print 'Valid ', valid_err

    params = copy.copy(best_p)
    numpy.savez(saveto, zipped_params=best_p,
                history_errs=history_errs,
                uidx=uidx,
                **params)

    return valid_err
Example #17
def train(
        random_seed=1234,
        dim_word=256,  # word vector dimensionality
        ctx_dim=-1,  # context vector dimensionality, auto set
        dim=1000,  # the number of LSTM units
        n_layers_out=1,
        n_layers_init=1,
        encoder='none',
        encoder_dim=100,
        prev2out=False,
        ctx2out=False,
        patience=10,
        max_epochs=5000,
        dispFreq=100,
        decay_c=0.,
        alpha_c=0.,
        alpha_entropy_r=0.,
        lrate=0.01,
        selector=False,
        n_words=100000,
        maxlen=100,  # maximum length of the description
        optimizer='adadelta',
        clip_c=2.,
        batch_size=64,
        valid_batch_size=64,
        save_model_dir='/data/lisatmp3/yaoli/exp/capgen_vid/attention/test/',
        validFreq=10,
        saveFreq=10,  # save the parameters after every saveFreq updates
        sampleFreq=10,  # generate some samples after every sampleFreq updates
        metric='blue',
        dataset='youtube2text',
        video_feature='googlenet',
        use_dropout=False,
        reload_=False,
        from_dir=None,
        K=10,
        OutOf=240,
        verbose=True,
        debug=True):
    rng_numpy, rng_theano = utils.get_two_rngs()

    model_options = locals().copy()
    if 'self' in model_options:
        del model_options['self']
    with open('%smodel_options.pkl' % save_model_dir, 'wb') as f:
        pkl.dump(model_options, f)

    # instance model
    layers = Layers()
    model = Model()

    print 'Loading data'
    engine = data_engine.Movie2Caption('attention', dataset, video_feature,
                                       batch_size, valid_batch_size, maxlen,
                                       n_words, K, OutOf)
    model_options['ctx_dim'] = engine.ctx_dim
    model_options['n_words'] = engine.n_words
    print 'n_words:', model_options['n_words']

    # set test values, for debugging
    idx = engine.kf_train[0]
    [x_tv, mask_tv, ctx_tv, ctx_mask_tv
     ] = data_engine.prepare_data(engine,
                                  [engine.train[index] for index in idx])

    print 'init params'
    t0 = time.time()
    params = model.init_params(model_options)

    # reloading
    if reload_:
        model_saved = from_dir + '/model_best_so_far.npz'
        assert os.path.isfile(model_saved)
        print "Reloading model params..."
        params = utils.load_params(model_saved, params)

    tparams = utils.init_tparams(params)

    trng, use_noise, \
          x, mask, ctx, mask_ctx, \
          cost, extra = \
          model.build_model(tparams, model_options)
    alphas = extra[1]
    betas = extra[2]
    print 'building sampler'
    f_init, f_next = model.build_sampler(tparams, model_options, use_noise,
                                         trng)
    # before any regularizer
    print 'building f_log_probs'
    f_log_probs = theano.function([x, mask, ctx, mask_ctx],
                                  -cost,
                                  profile=False,
                                  on_unused_input='ignore')

    cost = cost.mean()
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    if alpha_c > 0.:
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * ((1. - alphas.sum(0))**2).sum(-1).mean()
        cost += alpha_reg

    if alpha_entropy_r > 0:
        alpha_entropy_r = theano.shared(numpy.float32(alpha_entropy_r),
                                        name='alpha_entropy_r')
        alpha_reg_2 = alpha_entropy_r * (-tensor.sum(
            alphas * tensor.log(alphas + 1e-8), axis=-1)).sum(-1).mean()
        cost += alpha_reg_2
    else:
        alpha_reg_2 = tensor.zeros_like(cost)
    print 'building f_alpha'
    f_alpha = theano.function([x, mask, ctx, mask_ctx], [alphas, betas],
                              name='f_alpha',
                              on_unused_input='ignore')

    print 'compute grad'
    grads = tensor.grad(cost, wrt=utils.itemlist(tparams))
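    # Global-norm gradient clipping: g2 is the squared L2 norm over all
    # parameters; whenever sqrt(g2) exceeds clip_c, every gradient is rescaled
    # by clip_c / sqrt(g2), capping the overall gradient norm at clip_c.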
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c,
                              g))
        grads = new_grads

    lr = tensor.scalar(name='lr')
    print 'build train fns'
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads,
                                              [x, mask, ctx, mask_ctx], cost,
                                              extra + grads)

    print 'compilation took %.4f sec' % (time.time() - t0)
    print 'Optimization'

    history_errs = []
    # reload history
    if reload_:
        print 'loading history error...'
        history_errs = numpy.load(
            from_dir + 'model_best_so_far.npz')['history_errs'].tolist()

    bad_counter = 0

    processes = None
    queue = None
    rqueue = None
    shared_params = None

    uidx = 0
    uidx_best_blue = 0
    uidx_best_valid_err = 0
    estop = False
    best_p = utils.unzip(tparams)
    best_blue_valid = 0
    best_valid_err = 999
    alphas_ratio = []
    for eidx in xrange(max_epochs):
        n_samples = 0
        train_costs = []
        grads_record = []
        print 'Epoch ', eidx
        for idx in engine.kf_train:
            tags = [engine.train[index] for index in idx]
            n_samples += len(tags)
            uidx += 1
            use_noise.set_value(1.)

            pd_start = time.time()
            x, mask, ctx, ctx_mask = data_engine.prepare_data(engine, tags)
            pd_duration = time.time() - pd_start
            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                continue

            ud_start = time.time()
            rvals = f_grad_shared(x, mask, ctx, ctx_mask)
            cost = rvals[0]
            probs = rvals[1]
            alphas = rvals[2]
            betas = rvals[3]
            grads = rvals[4:]
            grads, NaN_keys = utils.grad_nan_report(grads, tparams)
            if len(grads_record) >= 5:
                del grads_record[0]
            grads_record.append(grads)
            if NaN_keys != []:
                print 'grads contain NaN'
                import pdb
                pdb.set_trace()
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected in cost'
                import pdb
                pdb.set_trace()
            # update params
            f_update(lrate)
            ud_duration = time.time() - ud_start

            if eidx == 0:
                train_error = cost
            else:
                train_error = train_error * 0.95 + cost * 0.05
            train_costs.append(cost)

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Train cost mean so far', \
                  train_error, 'fetching data time spent (sec)', pd_duration, \
                  'update time spent (sec)', ud_duration, 'save_dir', save_model_dir
                alphas, betas = f_alpha(x, mask, ctx, ctx_mask)
                counts = mask.sum(0)
                betas_mean = (betas * mask).sum(0) / counts
                betas_mean = betas_mean.mean()
                print 'alpha ratio %.3f, betas mean %.3f' % (
                    alphas.min(-1).mean() /
                    (alphas.max(-1)).mean(), betas_mean)
                l = 0
                for vv in x[:, 0]:
                    if vv == 0:
                        break
                    if vv in engine.word_idict:
                        print '(', numpy.round(betas[l, 0],
                                               3), ')', engine.word_idict[vv],
                    else:
                        print '(', numpy.round(betas[l, 0], 3), ')', 'UNK',
                    l += 1
                print '(', numpy.round(betas[l, 0], 3), ')'

            if numpy.mod(uidx, saveFreq) == 0:
                pass

            if numpy.mod(uidx, sampleFreq) == 0:
                use_noise.set_value(0.)
                print '------------- sampling from train ----------'
                x_s = x
                mask_s = mask
                ctx_s = ctx
                ctx_mask_s = ctx_mask
                model.sample_execute(engine, model_options, tparams, f_init,
                                     f_next, x_s, ctx_s, ctx_mask_s, trng)
                print '------------- sampling from valid ----------'
                idx = engine.kf_valid[numpy.random.randint(
                    1,
                    len(engine.kf_valid) - 1)]
                tags = [engine.valid[index] for index in idx]
                x_s, mask_s, ctx_s, mask_ctx_s = data_engine.prepare_data(
                    engine, tags)
                model.sample_execute(engine, model_options, tparams, f_init,
                                     f_next, x_s, ctx_s, mask_ctx_s, trng)

            if validFreq != -1 and numpy.mod(uidx, validFreq) == 0:
                t0_valid = time.time()
                alphas, _ = f_alpha(x, mask, ctx, ctx_mask)
                ratio = alphas.min(-1).mean() / (alphas.max(-1)).mean()
                alphas_ratio.append(ratio)
                numpy.savetxt(save_model_dir + 'alpha_ratio.txt', alphas_ratio)

                current_params = utils.unzip(tparams)
                numpy.savez(save_model_dir + 'model_current.npz',
                            history_errs=history_errs,
                            **current_params)

                use_noise.set_value(0.)
                train_err = -1
                train_perp = -1
                valid_err = -1
                valid_perp = -1
                test_err = -1
                test_perp = -1
                if not debug:
                    # first compute train cost
                    if 0:
                        print 'computing cost on trainset'
                        train_err, train_perp = model.pred_probs(
                            engine,
                            'train',
                            f_log_probs,
                            verbose=model_options['verbose'])
                    else:
                        train_err = 0.
                        train_perp = 0.
                    if 1:
                        print 'validating...'
                        valid_err, valid_perp = model.pred_probs(
                            engine,
                            'valid',
                            f_log_probs,
                            verbose=model_options['verbose'],
                        )
                    else:
                        valid_err = 0.
                        valid_perp = 0.
                    if 1:
                        print 'testing...'
                        test_err, test_perp = model.pred_probs(
                            engine,
                            'test',
                            f_log_probs,
                            verbose=model_options['verbose'])
                    else:
                        test_err = 0.
                        test_perp = 0.

                mean_ranking = 0
                blue_t0 = time.time()
                scores, processes, queue, rqueue, shared_params = \
                    metrics.compute_score(
                    model_type='attention',
                    model_archive=current_params,
                    options=model_options,
                    engine=engine,
                    save_dir=save_model_dir,
                    beam=5, n_process=5,
                    whichset='both',
                    on_cpu=False,
                    processes=processes, queue=queue, rqueue=rqueue,
                    shared_params=shared_params, metric=metric,
                    one_time=False,
                    f_init=f_init, f_next=f_next, model=model
                    )
                '''
                 {'blue': {'test': [-1], 'valid': [77.7, 60.5, 48.7, 38.5, 38.3]},
                 'alternative_valid': {'Bleu_3': 0.40702270203174923,
                 'Bleu_4': 0.29276570520368456,
                 'CIDEr': 0.25247168210607884,
                 'Bleu_2': 0.529069629270047,
                 'Bleu_1': 0.6804308797115253,
                 'ROUGE_L': 0.51083584331688392},
                 'meteor': {'test': [-1], 'valid': [0.282787550236724]}}
                '''

                valid_B1 = scores['valid']['Bleu_1']
                valid_B2 = scores['valid']['Bleu_2']
                valid_B3 = scores['valid']['Bleu_3']
                valid_B4 = scores['valid']['Bleu_4']
                valid_Rouge = scores['valid']['ROUGE_L']
                valid_Cider = scores['valid']['CIDEr']
                valid_meteor = scores['valid']['METEOR']
                test_B1 = scores['test']['Bleu_1']
                test_B2 = scores['test']['Bleu_2']
                test_B3 = scores['test']['Bleu_3']
                test_B4 = scores['test']['Bleu_4']
                test_Rouge = scores['test']['ROUGE_L']
                test_Cider = scores['test']['CIDEr']
                test_meteor = scores['test']['METEOR']
                print 'computing meteor/blue score used %.4f sec, '\
                  'blue score: %.1f, meteor score: %.1f'%(
                time.time()-blue_t0, valid_B4, valid_meteor)
                history_errs.append([
                    eidx, uidx, train_err, train_perp, valid_perp, test_perp,
                    valid_err, test_err, valid_B1, valid_B2, valid_B3,
                    valid_B4, valid_meteor, valid_Rouge, valid_Cider, test_B1,
                    test_B2, test_B3, test_B4, test_meteor, test_Rouge,
                    test_Cider
                ])
                numpy.savetxt(save_model_dir + 'train_valid_test.txt',
                              history_errs,
                              fmt='%.3f')
                print 'save validation results to %s' % save_model_dir
                # save best model according to the best blue or meteor
                if len(history_errs) > 1 and \
                  valid_B4 > numpy.array(history_errs)[:-1,11].max():
                    print 'Saving to %s...' % save_model_dir,
                    numpy.savez(save_model_dir +
                                'model_best_blue_or_meteor.npz',
                                history_errs=history_errs,
                                **best_p)
                if len(history_errs) > 1 and \
                  valid_err < numpy.array(history_errs)[:-1,6].min():
                    best_p = utils.unzip(tparams)
                    bad_counter = 0
                    best_valid_err = valid_err
                    uidx_best_valid_err = uidx

                    print 'Saving to %s...' % save_model_dir,
                    numpy.savez(save_model_dir + 'model_best_so_far.npz',
                                history_errs=history_errs,
                                **best_p)
                    with open('%smodel_options.pkl' % save_model_dir,
                              'wb') as f:
                        pkl.dump(model_options, f)
                    print 'Done'
                elif len(history_errs) > 1 and \
                    valid_err >= numpy.array(history_errs)[:-1,6].min():
                    bad_counter += 1
                    print 'history best ', numpy.array(history_errs)[:,
                                                                     6].min()
                    print 'bad_counter ', bad_counter
                    print 'patience ', patience
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if test_B4 > 0.52 and test_meteor > 0.32:
                    print 'Saving to %s...' % save_model_dir,
                    numpy.savez(save_model_dir + 'model_' + str(uidx) + '.npz',
                                history_errs=history_errs,
                                **current_params)

                print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err, \
                  'best valid err so far', best_valid_err
                print 'valid took %.2f sec' % (time.time() - t0_valid)
                # end of validation
            if debug:
                break
        if estop:
            break
        if debug:
            break

        # end for loop over minibatches
        print 'This epoch has seen %d samples, train cost %.2f' % (
            n_samples, numpy.mean(train_costs))
    # end for loop over epochs
    print 'Optimization ended.'
    if best_p is not None:
        utils.zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid_err = 0
    test_err = 0
    if not debug:
        #if valid:
        valid_err, valid_perp = model.pred_probs(
            engine, 'valid', f_log_probs, verbose=model_options['verbose'])
        #if test:
        #test_err, test_perp = self.pred_probs(
        #    'test', f_log_probs,
        #    verbose=model_options['verbose'])


    print 'stopped at epoch %d, minibatch %d, '\
      'current Train %.2f, current Valid %.2f, current Test %.2f ' % (
          eidx, uidx, numpy.mean(train_err), numpy.mean(valid_err), numpy.mean(test_err))
    params = copy.copy(best_p)
    numpy.savez(save_model_dir + 'model_best.npz',
                train_err=train_err,
                valid_err=valid_err,
                test_err=test_err,
                history_errs=history_errs,
                **params)

    if history_errs != []:
        history = numpy.asarray(history_errs)
        best_valid_idx = history[:, 6].argmin()
        numpy.savetxt(save_model_dir + 'train_valid_test.txt',
                      history,
                      fmt='%.4f')
        print 'final best exp ', history[best_valid_idx]

    return train_err, valid_err, test_err
Example #18
def train(
        experiment_id,
        data_base_path,
        output_base_path,
        model_options,
        data_options,
        validation_options,
        patience,  # early stopping patience
        max_epochs,
        finish_after,  # finish after this many updates
        clip_c,  # gradient clipping threshold
        lrate,  # learning rate
        optimizer,
        saveto,
        valid_freq,
        time_limit,
        save_freq,  # save the parameters after every saveFreq updates
        sample_freq,  # generate some samples after every sampleFreq
        verbose,
        reload_from=None,
        pretrained_word_emb=None):

    start_time = time.time()

    def join_data_base_path(data_base, options):
        for kk, vv in six.iteritems(options):
            if kk in [
                    'src', 'trg', 'input_vocab', 'label_vocab', 'valid_src',
                    'valid_trg'
            ]:
                options[kk] = os.path.join(data_base, options[kk])

        return options

    data_options = join_data_base_path(data_base_path, data_options)
    validation_options = join_data_base_path(data_base_path,
                                             validation_options)

    worddicts_r, train_stream, valid_stream = load_data(**data_options)

    model_options['n_input_tokens'] = len(worddicts_r[0])
    model_options['n_labels'] = len(worddicts_r[1])

    if model_options['label_type'] == 'binary':
        model_options['n_bins'] = len(worddicts_r[1])
        model_options['n_labels'] = 2
        max_sample_length = len(worddicts_r[1])
    else:
        max_sample_length = data_options['max_label_length']

    LOGGER.info('Building model')
    params = init_params(model_options)
    # reload parameters
    best_filename = '{}/{}.{}.best.npz'.format(output_base_path, experiment_id,
                                               saveto)

    if pretrained_word_emb and os.path.exists(pretrained_word_emb):
        assert model_options['input_token_level'] == 'word'
        LOGGER.info('Loading pretrained word embeddings from {}'.format(
            pretrained_word_emb))

        pretrained_emb = load_pretrained_embeddings(pretrained_word_emb)

        # TODO check if the size of the pretrained word embedding equals
        # the size of the initialized word embeddings
        # Also, check whether or not the vocabulary in the pretrained word
        # embeddings is identical to the vocabulary in the model.
        pvocab = pretrained_emb['vocab']  # (idx, word)

        # XXX if the assertions passed, then load the pretrained embeddings
        assert pretrained_emb['Wemb'].dtype == numpy.float32, \
            'The pretrained word embeddings should be float32\n'
        assert pretrained_emb['Wemb'].shape[1] == params['Wemb'].shape[1], \
            '{} does not match {}\n'.format(pretrained_emb['Wemb'].shape[1],
                                            params['Wemb'].shape[1])

        pretrained_word2id = {word: idx for (idx, word) in pvocab}

        param_indices, indices = [], []
        for ii in xrange(len(worddicts_r[0])):

            if ii >= data_options['n_input_tokens']:
                break

            word = worddicts_r[0][ii]
            if word in pretrained_word2id:
                word_idx = pretrained_word2id[word]
                indices.append(word_idx)
                param_indices.append(ii)

        assert len(indices) <= data_options['n_input_tokens']

        params['Wemb'][param_indices] = pretrained_emb['Wemb'][indices]
        # normalize word embeddings
        params['Wemb'] = params['Wemb'] / \
            numpy.sqrt((params['Wemb']**2).sum(axis=1)[:, None])

    if reload_from and os.path.exists(reload_from):
        LOGGER.info('Loading parameters from {}'.format(reload_from))
        params = load_params(reload_from, params)

    LOGGER.info('Initializing parameters')
    tparams = init_tparams(params)

    # use_noise is for dropout
    trng, use_noise, encoder_vars, decoder_vars, \
        opt_ret, costs = build_model(tparams, model_options)

    inps = encoder_vars + decoder_vars

    LOGGER.info('Building sampler')
    f_sample_inits, f_sample_nexts \
        = build_sampler(tparams, model_options, trng, use_noise)

    # before any regularizer
    LOGGER.info('Building functions to compute log prob')
    f_log_probs = [
        theano.function(inps,
                        cost_,
                        name='f_log_probs_%s' % cost_.name,
                        on_unused_input='ignore') for cost_ in costs
    ]

    assert len(costs) == 1

    cost = costs[0]
    '''
    for cost_ in costs[1:]:
        cost += cost_
    '''
    cost = cost.mean()

    LOGGER.info('Computing gradient')
    grads = tensor.grad(cost, wrt=itemlist(tparams))

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c,
                              g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')

    LOGGER.info('Building optimizers')
    f_grad_shared, f_update, optimizer_state = \
        getattr(optimizers, optimizer)(lr, tparams, grads, inps, cost)

    optimizer_state = name_dict(optimizer_state)

    # TODO set_value optimizer_state
    if reload_from and os.path.exists(reload_from):
        LOGGER.info('Loading optimizer state from {}'.format(reload_from))
        optimizer_state = load_params(reload_from,
                                      optimizer_state,
                                      theano_var=True)

    LOGGER.info('Optimization')

    log = Logger(
        filename='{}/{}.log.jsonl.gz'.format(output_base_path, experiment_id))

    best_valid_err = float('inf')
    best_model = None
    total_nsamples = 0
    uidx = 0
    uidx_restore = [0]
    estop = False
    if reload_from and os.path.exists(reload_from):
        rmodel = numpy.load(reload_from)
        if 'uidx' in rmodel:
            uidx_restore = rmodel['uidx']
        if 'best_valid_err' in rmodel:
            best_valid_err = rmodel['best_valid_err']
        if 'total_nsamples' in rmodel and rmodel['total_nsamples'] > 0:
            total_nsamples = rmodel['total_nsamples']

        best_model = [unzip(tparams), unzip(optimizer_state), uidx_restore]

    train_start = time.clock()
    max_updates_per_epoch = total_nsamples / data_options['batch_size']

    try:
        for epoch in xrange(0, max_epochs):
            if total_nsamples > 0 and \
                    uidx + max_updates_per_epoch < uidx_restore[0]:
                uidx += max_updates_per_epoch
                continue

            n_samples = 0
            for x, x_mask, \
                    y, y_mask in train_stream.get_epoch_iterator():

                n_samples += len(x)

                uidx += 1
                if uidx < uidx_restore[0]:
                    continue

                x_length = x_mask.sum(1).mean()

                if model_options['label_type'] == 'binary':
                    old_y = y
                    y, y_mask = mul2bin(y, y_mask, model_options['n_bins'])

                y, y_mask = y.T, y_mask.T

                if data_options['input_token_level'] == 'character':
                    x, x_mask = prepare_character_tensor(x)
                else:
                    x, x_mask = x.T, x_mask.T

                unk_token_ratio = (x == 1).sum(0) / x_mask.sum(0)
                non_empty_insts = unk_token_ratio <= 0.5

                y = y[:, non_empty_insts]
                y_mask = y_mask[:, non_empty_insts]
                x = x[:, non_empty_insts]
                x_mask = x_mask[:, non_empty_insts]

                if x.shape[1] == 0:
                    continue

                encoder_inps = [x, x_mask]
                decoder_inps = [y, y_mask]

                inps = encoder_inps + decoder_inps

                use_noise.set_value(1.)

                log_entry = {'iteration': uidx, 'epoch': epoch}

                # compute cost, grads and copy grads to shared variables
                update_start = time.clock()
                cost = f_grad_shared(*inps)
                f_update(lrate)

                if verbose:
                    log_entry['cost'] = float(cost)
                    log_entry['average_source_length'] = \
                        float(x_length)
                    log_entry['average_target_length'] = \
                        float(y_mask.sum(0).mean())
                    log_entry['update_time'] = time.clock() - update_start
                    log_entry['train_time'] = time.clock() - train_start

                # check for bad numbers, usually we remove non-finite elements
                # and continue training - but not done here
                if not numpy.isfinite(cost):
                    LOGGER.error('NaN detected')
                    return 1., 1., 1.

                # validate model on validation set and early stop if necessary
                if numpy.mod(uidx, valid_freq) == 0:
                    use_noise.set_value(0.)
                    valid_errs = [
                        numpy.mean(pred_probs(f_, model_options, valid_stream))
                        for f_ in f_log_probs
                    ]

                    for f_, err_ in zip(f_log_probs, valid_errs):
                        log_entry['validation_%s' % f_.name] = float(err_)

                    valid_scores = do_validation(f_sample_inits,
                                                 f_sample_nexts, tparams,
                                                 max_sample_length, trng,
                                                 model_options, valid_stream)

                    for eval_type, score in valid_scores.items():
                        log_entry['validation_%s' % eval_type] = score

                    for f_, err_ in zip(f_log_probs, valid_errs):
                        if not numpy.isfinite(err_):
                            raise RuntimeError(('NaN detected in validation '
                                                'error of %s') % f_.name)
                    valid_err = numpy.array(valid_errs).sum()

                    if valid_err < best_valid_err:
                        best_valid_err = valid_err
                        best_model = [
                            unzip(tparams),
                            unzip(optimizer_state), [uidx]
                        ]

                # save the best model so far
                if numpy.mod(uidx, save_freq) == 0 and \
                        uidx > uidx_restore[0]:
                    LOGGER.info('Saving best model so far')

                    if best_model is not None:
                        params, opt_state, save_at_uidx = best_model
                    else:
                        params = unzip(tparams)
                        opt_state = unzip(optimizer_state)
                        save_at_uidx = [uidx]

                    # save params to exp_id.npz and symlink model.npz to it
                    params_and_state = merge(
                        params, opt_state, {'uidx': save_at_uidx},
                        {'best_valid_err': best_valid_err},
                        {'total_nsamples': total_nsamples})
                    save_params(params_and_state, best_filename)

                # generate some samples with the model and display them
                if sample_freq > 0 and numpy.mod(uidx, sample_freq) == 0:
                    # FIXME: random selection?
                    log_entry['samples'] = []

                    if data_options['input_token_level'] == 'character':
                        batch_size = x.shape[2]
                    else:
                        batch_size = x.shape[1]
                    for jj in xrange(numpy.minimum(5, batch_size)):
                        stats = [('source', ''), ('truth', ''), ('sample', ''),
                                 ('align_sample', '')]
                        log_entry['samples'].append(OrderedDict(stats))

                        if data_options['input_token_level'] == 'character':
                            sample_encoder_inps = [
                                x[:, :, jj][:, :, None], x_mask[:, :, jj][:, :,
                                                                          None]
                            ]
                        else:
                            sample_encoder_inps = [
                                x[:, jj][:, None], x_mask[:, jj][:, None]
                            ]

                        solutions = gen_sample(tparams,
                                               f_sample_inits,
                                               f_sample_nexts,
                                               sample_encoder_inps,
                                               model_options,
                                               trng=trng,
                                               k=12,
                                               max_label_len=max_sample_length,
                                               argmax=False)

                        sample = solutions['samples']
                        alignment = solutions['alignments']
                        score = solutions['scores']

                        score = score / numpy.array([len(s) for s in sample])
                        ss = sample[score.argmin()]
                        alignment = alignment[score.argmin()]

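                        # for binary labels, turn the 0/1 indicator sequence
                        # into a list of selected positions; if <EOS>
                        # (index 0) is selected first, rotate it to the end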
                        if model_options['label_type'] == 'binary':
                            # print(y[0], y.shape, old_y.shape)
                            y = old_y.T
                            assert type(ss) == list
                            assert len(ss) == max_sample_length
                            new_ss = [
                                tidx for tidx, s in enumerate(ss) if s == 1
                            ]
                            # print(len(ss), numpy.sum(ss), new_ss)
                            ss = new_ss
                            assert type(ss) == list

                            if len(ss) == 0:
                                ss.append(0)

                            if ss[0] == 0:  # if the first token is <EOS>
                                ss = ss[1:] + ss[:1]

                        if data_options['input_token_level'] == 'character':
                            num_src_words = int(
                                (x_mask[:, :, jj].sum(0) > 0).sum())

                            num_chars, num_words, num_samples = x.shape
                            for widx in xrange(num_words):
                                if x_mask[:, widx, jj].sum() == 0:
                                    break
                                for cidx in xrange(num_chars):
                                    cc = x[cidx, widx, jj]
                                    if cc == 0:
                                        break
                                    if cc in worddicts_r[0]:
                                        token = worddicts_r[0][cc]
                                    else:
                                        token = UNK_TOKEN
                                    log_entry['samples'][-1]['source'] \
                                        += token
                                log_entry['samples'][-1]['source'] += \
                                    ' '
                        else:
                            num_src_words = int(x_mask[:, jj].sum())

                            num_words, num_samples = x.shape
                            for vv in x[:, jj]:
                                if vv == 0:
                                    break
                                if vv in worddicts_r[0]:
                                    token = worddicts_r[0][vv]
                                else:
                                    token = UNK_TOKEN
                                log_entry['samples'][-1]['source'] \
                                    += token + ' '

                        for vv in y[:, jj]:
                            if vv == 0:
                                break
                            if vv in worddicts_r[1]:
                                token = worddicts_r[1][vv]
                            else:
                                token = UNK_TOKEN
                            log_entry['samples'][-1]['truth'] += token + ' '

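                        # decode the sampled tokens and, for each one, record
                        # the source word it attends to most strongly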
                        for tidx, vv in enumerate(ss):
                            if vv == 0:
                                break
                            if vv in worddicts_r[1]:
                                token = worddicts_r[1][vv]
                            else:
                                token = UNK_TOKEN

                            assert tidx >= 0 and tidx < len(alignment), \
                                '%d\t%d' % (tidx, len(alignment))

                            align_src_word_idx = \
                                (alignment[tidx][
                                    :num_src_words-1]).argmax()
                            aligned_token = '%s_<%d>' % \
                                (token, align_src_word_idx)

                            log_entry['samples'][-1]['sample'] += token + ' '
                            log_entry['samples'][-1]['align_sample'] \
                                += aligned_token + ' '

                # finish after this many updates
                if uidx >= finish_after:
                    LOGGER.info('Finishing after {} iterations'.format(uidx))
                    estop = True
                    break

                if time_limit > 0 and \
                   (time.time() - start_time > time_limit * 60):

                    LOGGER.info(
                        'Time limit {} mins is over'.format(time_limit))
                    estop = True

                    break

                if verbose and len(log_entry) > 2:
                    log.log(log_entry)

            LOGGER.info('Completed epoch, seen {} samples'.format(n_samples))
            if total_nsamples == 0:
                total_nsamples = n_samples

            if estop:
                log.log(log_entry)
                break

        if best_model is not None:
            assert len(best_model) == 3
            best_p, best_state, best_uidx = best_model
            zipp(best_p, tparams)
            zipp(best_state, optimizer_state)
        '''
        use_noise.set_value(0.)
        LOGGER.info('Calculating validation cost')
        valid_errs = do_validation(f_log_probs, model_options, valid_stream)
        '''

        if not best_model:
            best_p = unzip(tparams)
            best_state = unzip(optimizer_state)
            best_uidx = [uidx]

        best_p = copy.copy(best_p)
        best_state = copy.copy(best_state)
        params_and_state = merge(best_p, best_state, {'uidx': best_uidx},
                                 {'best_valid_err': best_valid_err},
                                 {'total_nsamples': total_nsamples})
        save_params(params_and_state, best_filename)

    except Exception:
        LOGGER.error(traceback.format_exc())
        best_valid_err = -1.
    else:
        # XXX add something needed
        print('Training Done')

    return best_valid_err
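
The unzip and zipp helpers used above to checkpoint and restore parameters are
not part of this listing. As a point of reference, here is a minimal sketch of
the usual convention, round-tripping Theano shared variables through plain
dicts of numpy arrays (the bodies below are an illustrative assumption, not
this project's own implementation):

from collections import OrderedDict

def unzip(zipped):
    # pull the current values out of the shared variables into a plain dict
    new_params = OrderedDict()
    for k, v in zipped.items():
        new_params[k] = v.get_value()
    return new_params

def zipp(params, zipped):
    # push saved numpy arrays back into the matching shared variables
    for k, v in params.items():
        zipped[k].set_value(v)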
Example #19
File: nmt.py Project: caglar/nmt
def train(dim_word=100, # word vector dimensionality
          dim=1000, # the number of LSTM units
          encoder='gru',
          decoder='gru_cond',
          patience=10,
          max_epochs=5000,
          dispFreq=100,
          decay_c=0.,
          alpha_c=0.,
          diag_c=0.,
          clip_c=-1.,
          lrate=0.01,
          n_words_src=100000,
          n_words=100000,
          maxlen=100, # maximum length of the description
          optimizer='rmsprop',
          batch_size = 16,
          valid_batch_size = 16,
          saveto='model.npz',
          validFreq=1000,
          saveFreq=1000, # save the parameters after every saveFreq updates
          sampleFreq=100, # generate some samples after every sampleFreq updates
          datasets=['/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
                    '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'],
          valid_datasets=['../data/dev/newstest2011.en.tok', '../data/dev/newstest2011.fr.tok'],
          dictionaries=['/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
                        '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'],
          use_dropout=False,
          reload_=False):

    # Model options
    model_options = locals().copy()

    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    for ii, dd in enumerate(dictionaries):
        with open(dd, 'rb') as f:
            worddicts[ii] = pkl.load(f)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    # reload options
    if reload_ and os.path.exists(saveto):
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print 'Loading data'
    train = TextIterator(datasets[0], datasets[1],
                         dictionaries[0], dictionaries[1],
                         n_words_source=n_words_src, n_words_target=n_words,
                         batch_size=batch_size,
                         maxlen=maxlen)
    valid = TextIterator(valid_datasets[0], valid_datasets[1],
                         dictionaries[0], dictionaries[1],
                         n_words_source=n_words_src, n_words_target=n_words,
                         batch_size=valid_batch_size,
                         maxlen=maxlen)

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, use_noise, \
          x, x_mask, y, y_mask, \
          opt_ret, \
          cost = \
          build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, trng)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'

    cost = cost.mean()

    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * ((tensor.cast(y_mask.sum(0)//x_mask.sum(0), 'float32')[:,None]-
                                opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
        cost += alpha_reg

    # after any regularizer
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'
    print 'Building f_grad...',
    f_grad = theano.function(inps, grads, profile=profile)
    print 'Done'

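    # gradient clipping: if the global L2 norm of the gradients exceeds
    # clip_c, rescale all gradients jointly so that the norm equals clip_c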
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (clip_c**2),
                                           g / tensor.sqrt(g2) * clip_c,
                                           g))
        grads = new_grads

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
    print 'Done'

    print 'Optimization'

    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = list(numpy.load(saveto)['history_errs'])
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0])/batch_size
    if saveFreq == -1:
        saveFreq = len(train[0])/batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0])/batch_size

    uidx = 0
    estop = False
    for eidx in xrange(max_epochs):
        n_samples = 0

        for x, y in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen,
                                                n_words_src=n_words_src,
                                                n_words=n_words)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()
            cost = f_grad_shared(x, x_mask, y, y_mask)
            f_update(lrate)
            ud = time.time() - ud_start

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',

                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)

                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl'%saveto, 'wb'))
                print 'Done'

            if numpy.mod(uidx, sampleFreq) == 0:
                # FIXME: random selection?
                for jj in xrange(numpy.minimum(5,x.shape[1])):
                    stochastic = True
                    sample, score = gen_sample(tparams, f_init, f_next, x[:,jj][:,None],
                                               model_options, trng=trng, k=1, maxlen=30,
                                               stochastic=stochastic, argmax=False)
                    print 'Source ', jj, ': ',
                    for vv in x[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[0]:
                            print worddicts_r[0][vv],
                        else:
                            print 'UNK',
                    print
                    print 'Truth ', jj, ' : ',
                    for vv in y[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[1]:
                            print worddicts_r[1][vv],
                        else:
                            print 'UNK',
                    print
                    print 'Sample ', jj, ': ',
                    if stochastic:
                        ss = sample
                    else:
                        score = score / numpy.array([len(s) for s in sample])
                        ss = sample[score.argmin()]
                    for vv in ss:
                        if vv == 0:
                            break
                        if vv in worddicts_r[1]:
                            print worddicts_r[1][vv],
                        else:
                            print 'UNK',
                    print

            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs = pred_probs(f_log_probs, prepare_data, model_options, valid)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

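                # early stopping: remember the best parameters and count
                # validations without improvement; stop once bad_counter
                # exceeds patience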
                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= numpy.array(history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if numpy.isnan(valid_err):
                    import ipdb; ipdb.set_trace()

                print 'Valid ', valid_err

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid_err = pred_probs(f_log_probs, prepare_data, model_options, valid).mean()
    print 'Valid ', valid_err
    params = copy.copy(best_p)
    numpy.savez(saveto, zipped_params=best_p,
                history_errs=history_errs,
                **params)

    return valid_err
Example #20
def train(dim_word=100,  # word vector dimensionality
          dim=1000,  # the number of GRU units
          encoder='gru',
          patience=10,  # early stopping patience
          max_epochs=5000,
          finish_after=10000000,  # finish after this many updates
          dispFreq=100,
          decay_c=0.,  # L2 weight decay penalty
          lrate=0.01,
          n_words=100000,  # vocabulary size
          maxlen=100,  # maximum length of the description
          optimizer='rmsprop',
          batch_size=16,
          valid_batch_size=16,
          saveto='model.npz',
          validFreq=1000,
          saveFreq=1000,  # save the parameters after every saveFreq updates
          sampleFreq=100,  # generate some samples after every sampleFreq
          dataset='/data/lisatmp4/anirudhg/wiki.tok.txt.gz',
          valid_dataset='/data/lisatmp4/anirudhg/newstest2011.en.tok',
          dictionary='/data/lisatmp4/anirudhg/wiki.tok.txt.gz.pkl',
          use_dropout=False,
          reload_=False):

    # Model options
    model_options = locals().copy()

    # load dictionary
    with open(dictionary, 'rb') as f:
        worddicts = pkl.load(f)

    # invert dictionary
    worddicts_r = dict()
    for kk, vv in worddicts.iteritems():
        worddicts_r[vv] = kk

    # reload options
    if reload_ and os.path.exists(saveto):
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print 'Loading data'
    train = TextIterator(dataset,
                         dictionary,
                         n_words_source=n_words,
                         batch_size=batch_size,
                         maxlen=maxlen)
    valid = TextIterator(valid_dataset,
                         dictionary,
                         n_words_source=n_words,
                         batch_size=valid_batch_size,
                         maxlen=maxlen)

    print 'Building model'
    params = init_params(model_options)

    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    # create shared variables for parameters
    tparams = init_tparams(params)

    # build the symbolic computational graph
    trng, use_noise, \
        x, x_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask]

    print 'Building sampler'
    f_next = build_sampler(tparams, model_options, trng)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = getattr(optimizers, optimizer)(lr, tparams,
                                                             grads, inps, cost)

    print 'Done'

    print 'Optimization'

    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = list(numpy.load(saveto)['history_errs'])
    best_p = None
    bad_count = 0

    if validFreq == -1:
        validFreq = len(train[0])/batch_size
    if saveFreq == -1:
        saveFreq = len(train[0])/batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0])/batch_size

    # Training loop
    uidx = 0
    estop = False
    bad_counter = 0
    for eidx in xrange(max_epochs):
        n_samples = 0

        for x in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            # pad batch and create mask
            x, x_mask = prepare_data(x, maxlen=maxlen, n_words=n_words)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask)

            # do the update on parameters
            f_update(lrate)

            ud = time.time() - ud_start

            # check for bad numbers
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1.

            # verbose
            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            # save the best model so far
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',

                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

            # generate some samples with the model and display them
            if numpy.mod(uidx, sampleFreq) == 0:
                # FIXME: random selection?
                for jj in xrange(5):
                    sample, score = gen_sample(tparams, f_next,
                                               model_options, trng=trng,
                                               maxlen=30, argmax=False)
                    print 'Sample ', jj, ': ',
                    ss = sample
                    for vv in ss:
                        if vv == 0:
                            break
                        if vv in worddicts_r:
                            print worddicts_r[vv],
                        else:
                            print 'UNK',
                    print

            # validate model on validation set and early stop if necessary
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs = pred_probs(f_log_probs, prepare_data,
                                        model_options, valid)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= \
                        numpy.array(history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if numpy.isnan(valid_err):
                    ipdb.set_trace()

                print 'Valid ', valid_err

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid_err = pred_probs(f_log_probs, prepare_data,
                           model_options, valid).mean()

    print 'Valid ', valid_err

    params = copy.copy(best_p)
    numpy.savez(saveto, zipped_params=best_p,
                history_errs=history_errs,
                **params)

    return valid_err
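
prepare_data above is also external to this listing; it pads a list of
token-id sequences into a time-major matrix plus a float mask. A minimal
NumPy sketch of that padding/masking convention (an illustrative assumption,
not the original function):

import numpy

def pad_batch(seqs):
    # seqs: list of lists of token ids; 0 is reserved for <eos>/padding
    lengths = [len(s) for s in seqs]
    n_samples = len(seqs)
    maxlen = max(lengths) + 1  # one extra slot for the terminating 0
    x = numpy.zeros((maxlen, n_samples), dtype='int64')
    x_mask = numpy.zeros((maxlen, n_samples), dtype='float32')
    for i, s in enumerate(seqs):
        x[:lengths[i], i] = s
        x_mask[:lengths[i] + 1, i] = 1.  # mask covers the tokens and the 0
    return x, x_mask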
Example #21
def train(model_options,
        dataset_name = 'MSVD',
        cnn_name = 'ResNet50',
        train_data_ids_path = config.MSVD_DATA_IDS_TRAIN_PATH,
        val_data_ids_path = config.MSVD_DATA_IDS_VAL_PATH,
        test_data_ids_path = config.MSVD_DATA_IDS_TEST_PATH,
        vocab_path = config.MSVD_VOCAB_PATH,
        reverse_vocab_path = config.MSVD_REVERSE_VOCAB_PATH,
        mb_size_train = 64,
        mb_size_test = 128,
        train_caps_path = config.MSVD_VID_CAPS_TRAIN_PATH,
        val_caps_path = config.MSVD_VID_CAPS_VAL_PATH,
        test_caps_path = config.MSVD_VID_CAPS_TEST_PATH,
        feats_dir = config.MSVD_FEATS_DIR,
        save_dir = config.SAVE_DIR_PATH,
        word_dim = 512,   # word embeddings size
        ctx_dim = 2048,   # video cnn feature dimension
        lstm_dim = 512,   # lstm unit size
        patience = 20,
        max_epochs = 500,
        decay_c = 1e-4,
        alpha_entropy_r = 0.,
        alpha_c = 0.70602,
        clip_c = 10.,
        lrate = 0.0001,
        vocab_size = 20000, # n_words
        maxlen_caption = 30,  # max length of the descprition
        optimizer = 'adadelta',
        batch_size = 64,  # for trees use 25
        metric = 'everything',    # set to perplexity on DVS # blue, meteor, or both
        use_dropout = True,
        selector = True,
        ctx2out = True,
        prev2out = True,
        dispFreq = 10,
        validFreq = 2000,
        saveFreq = -1, # save the parameters after every saveFreq updates
        sampleFreq = 100, # generate some samples after every sampleFreq updates
        verbose = True,
        debug = False,
        reload_model = False,
        from_dir = '',
        ctx_frames = 28, # 26 when compare
        random_seed = 1234,
        beam_search = True
        ):

    tf.set_random_seed(random_seed)

    model = Model()

    print 'loading data'
    engine = data_engine.Movie2Caption(dataset_name,cnn_name,train_data_ids_path, val_data_ids_path, test_data_ids_path,
                vocab_path, reverse_vocab_path, mb_size_train, mb_size_test, maxlen_caption,
                train_caps_path, val_caps_path, test_caps_path, feats_dir)

    model_options['ctx_dim'] = engine.ctx_dim
    ctx_dim = engine.ctx_dim
    model_options['vocab_size'] = engine.vocab_size
    vocab_size = engine.vocab_size
    print 'n_words:', model_options['vocab_size']
    print 'ctx_dim:', model_options['ctx_dim']

    utils.write_to_json(model_options, '%smodel_options.json'%save_dir)

    # set test values, for debugging
    idx = engine.kf_train[0]
    x_tv, mask_tv, ctx_tv, ctx_mask_tv, ctx_pca_tv = data_engine.prepare_data(engine, [engine.train_data_ids[index] for index in idx], mode="train")

    print 'init params'
    t0 = time.time()
    params = model.init_params(model_options)

    k_centers = 3

    # description string: #words x #samples
    X = tf.placeholder(tf.int32, shape=(None, None), name='word_seq_x')  # word seq input (t,m)
    MASK = tf.placeholder(tf.float32, shape=(None, None), name='word_seq_mask')   # (t,m)
    # context: #samples x #annotations x dim
    CTX = tf.placeholder(tf.float32, shape=(None, ctx_frames, ctx_dim), name='ctx')
    CTX_MASK = tf.placeholder(tf.float32, shape=(None, ctx_frames), name='ctx_mask')
    CTX_PCA = tf.placeholder(tf.float32, shape=(None, k_centers, ctx_dim), name='ctx_pca')

    CTX_SAMPLER = tf.placeholder(tf.float32, shape=(ctx_frames, ctx_dim), name='ctx_sampler')
    CTX_MASK_SAMPLER = tf.placeholder(tf.float32, shape=(ctx_frames), name='ctx_mask_sampler')
    CTX_PCA_SAMPLER = tf.placeholder(tf.float32, shape=(k_centers, ctx_dim), name='ctx_pca_sampler')
    X_SAMPLER = tf.placeholder(tf.int32, shape=(None,), name='x_sampler')   # DOUBT 1 or None ?
    BO_INIT_STATE_SAMPLER = tf.placeholder(tf.float32, shape=(None,lstm_dim), name='bo_init_state_sampler')
    TO_INIT_STATE_SAMPLER = tf.placeholder(tf.float32, shape=(None,lstm_dim), name='to_init_state_sampler')
    BO_INIT_MEMORY_SAMPLER = tf.placeholder(tf.float32, shape=(None,lstm_dim), name='bo_init_memory_sampler')
    TO_INIT_MEMORY_SAMPLER = tf.placeholder(tf.float32, shape=(None,lstm_dim), name='to_init_memory_sampler')

    # create tensorflow variables
    print 'building model'
    tfparams = utils.init_tfparams(params)

    use_noise, COST, extra = model.build_model(tfparams, model_options, X, MASK, CTX, CTX_MASK, CTX_PCA)
    ALPHAS = extra[1]   # (t,64,28)
    BETAS = extra[2]    # (t,64)

    print 'building sampler'
    f_init, f_next = model.build_sampler(tfparams, model_options, use_noise,
                                CTX_SAMPLER, CTX_MASK_SAMPLER, CTX_PCA_SAMPLER, X_SAMPLER, BO_INIT_STATE_SAMPLER,
                                TO_INIT_STATE_SAMPLER, BO_INIT_MEMORY_SAMPLER, TO_INIT_MEMORY_SAMPLER)

    print 'building f_log_probs'
    f_log_probs = -COST

    print 'check trainables'
    wrt = utils.itemlist(tfparams, model_options)
    trainables = tf.trainable_variables()
    print len(wrt),len(trainables)
    # assert len(wrt)==len(trainables)

    COST = tf.reduce_mean(COST, name="LOSS")
    if decay_c > 0.:
        decay_c = tf.Variable(np.float32(decay_c), trainable=False, name='decay_c')
        weight_decay = 0.
        for vv in wrt:
            weight_decay += tf.reduce_sum(vv ** 2)
        weight_decay *= decay_c
        COST += weight_decay

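    # coverage-style attention regularizer: encourage the attention each frame
    # receives, summed over all time steps, to be close to 1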
    if alpha_c > 0.:
        alpha_c = tf.Variable(np.float32(alpha_c), trainable=False, name='alpha_c')
        alpha_reg = alpha_c * tf.reduce_mean(tf.reduce_sum(((1.-tf.reduce_sum(ALPHAS, axis=0))**2), axis=-1))
        COST += alpha_reg

    if alpha_entropy_r > 0:
        alpha_entropy_r = tf.Variable(np.float32(alpha_entropy_r),
                                        name='alpha_entropy_r')
        alpha_reg_2 = alpha_entropy_r * tf.reduce_mean(
            tf.reduce_sum(-tf.reduce_sum(ALPHAS * tf.log(ALPHAS + 1e-8),
                                         axis=-1), axis=-1))
        COST += alpha_reg_2
    else:
        alpha_reg_2 = tf.zeros_like(COST)

    print 'building f_alpha'
    f_alpha = [ALPHAS, BETAS]

    print 'build train fns'
    UPDATE_OPS = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(UPDATE_OPS):
        # optimizer = tf.train.AdadeltaOptimizer(learning_rate=1.0, rho=0.95, epsilon=1e-06).minimize(loss=COST, var_list=wrt)
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=1.0, rho=0.95, epsilon=1e-06)
        # optimizer = tf.train.AdamOptimizer()
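        # compute gradients w.r.t. the trainable variables, clip them jointly
        # by their global L2 norm (clip_c), then apply the Adadelta update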
        gradients, variables = zip(*optimizer.compute_gradients(loss=COST, var_list=wrt))
        gradients, _ = tf.clip_by_global_norm(gradients, clip_c)
        capped_grads_and_vars = zip(gradients, variables)
        TRAIN_OP = optimizer.apply_gradients(capped_grads_and_vars)


    # Initialize all variables
    var_init = tf.global_variables_initializer()
    # Ops to save and restore all the variables.
    saver = tf.train.Saver()

    print 'compilation took %.4f sec'%(time.time()-t0)
    print 'Optimization'

    history_errs = []
    # reload history
    if reload_model:
        print 'loading history error...'
        history_errs = np.load(from_dir+'model_best_so_far.npz')['history_errs'].tolist()

    bad_counter = 0

    processes = None
    queue = None
    rqueue = None
    shared_params = None

    uidx = 0
    uidx_best_blue = 0
    uidx_best_valid_err = 0
    estop = False
    # best_p = utils.unzip(tparams)
    best_blue_valid = 0
    best_valid_err = 999
    alphas_ratio = []

    train_err = -1
    train_perp = -1
    valid_err = -1
    valid_perp = -1
    test_err = -1
    test_perp = -1

    # Launch the graph
    with tf.Session() as sess:
        sess.run(var_init)
        if reload_model:
            print 'restoring model...'
            saver.restore(sess, from_dir+"model_best_so_far.ckpt")
        for eidx in xrange(max_epochs):
            n_samples = 0
            train_costs = []
            grads_record = []
            for idx in engine.kf_train:
                tags = [engine.train_data_ids[index] for index in idx]
                n_samples += len(tags)
                uidx += 1
                
                sess.run(tf.assign(use_noise, True))

                pd_start = time.time()
                x, mask, ctx, ctx_mask, ctx_pca = data_engine.prepare_data(engine, tags, mode="train")
                pd_duration = time.time() - pd_start
                if x is None:
                    print 'Minibatch with zero sample under length ', maxlen_caption
                    continue

                # writer = tf.summary.FileWriter("graph_cost", sess.graph)
                cost, alphas, betas = sess.run([COST,ALPHAS,BETAS], feed_dict={
                                        X: x,
                                        MASK: mask,
                                        CTX: ctx,
                                        CTX_PCA: ctx_pca,
                                        CTX_MASK: ctx_mask})

                ud_start = time.time()
                sess.run(TRAIN_OP, feed_dict={
                                        X: x,
                                        MASK: mask,
                                        CTX: ctx,
                                        CTX_PCA: ctx_pca,
                                        CTX_MASK: ctx_mask})
                ud_duration = time.time() - ud_start

                # writer.close()
                if np.isnan(cost) or np.isinf(cost):
                    print 'NaN detected in cost'
                    import pdb; pdb.set_trace()
                
                if eidx == 0:
                    train_error = cost
                else:
                    train_error = train_error * 0.95 + cost * 0.05
                train_costs.append(cost)
                
                if np.mod(uidx, dispFreq) == 0:
                    print 'Epoch: ', eidx, \
                        ', Update: ', uidx, \
                        ', train cost mean so far: ', train_error, \
                        ', fetching data time spent (sec): ', pd_duration, \
                        ', update time spent (sec): ', ud_duration, \
                        ', save_dir: ', save_dir, '\n'
                    
                    alphas, betas = sess.run(f_alpha, feed_dict={
                                            X: x,
                                            MASK: mask,
                                            CTX: ctx,
                                            CTX_PCA: ctx_pca,
                                            CTX_MASK: ctx_mask})
                    counts = mask.sum(0)
                    betas_mean = (betas * mask).sum(0) / counts
                    betas_mean = betas_mean.mean()
                    print 'alpha ratio %.3f, betas mean %.3f\n'%(
                        alphas.min(-1).mean() / (alphas.max(-1)).mean(), betas_mean)
                    l = 0
                    for vv in x[:, 0]:
                        if vv == 0: # eos
                            break
                        if vv in engine.reverse_vocab:
                            print '(', np.round(betas[l, 0], 3), ')', engine.reverse_vocab[vv],
                        else:
                            print '(', np.round(betas[l, 0], 3), ')', 'UNK',
                        print ",",
                        l += 1
                    print '(', np.round(betas[l, 0], 3), ')\n'

                if np.mod(uidx, saveFreq) == 0:
                    pass

                if np.mod(uidx, sampleFreq) == 0:
                    sess.run(tf.assign(use_noise, False))
                    print '------------- sampling from train ----------'
                    x_s = x     # (t,m)
                    mask_s = mask   # (t,m)
                    ctx_s = ctx     # (m,28,2048)
                    ctx_mask_s = ctx_mask   # (m,28)
                    ctx_pca_s = ctx_pca
                    model.sample_execute(sess, engine, model_options, tfparams, f_init, f_next, x_s, ctx_s, ctx_mask_s, ctx_pca_s)
                    # print '------------- sampling from valid ----------'
                    # idx = engine.kf_val[np.random.randint(1, len(engine.kf_val) - 1)]
                    # tags = [engine.val_data_ids[index] for index in idx]
                    # x_s, mask_s, ctx_s, mask_ctx_s, ctx_pca_s = data_engine.prepare_data(engine, tags,"val")
                    # model.sample_execute(sess, engine, model_options, tfparams, f_init, f_next, x_s, ctx_s, ctx_mask_s, ctx_pca_s)
                    # print ""

                if validFreq != -1 and np.mod(uidx, validFreq) == 0:
                    t0_valid = time.time()
                    alphas, _ = sess.run(f_alpha, feed_dict={
                                            X: x,
                                            MASK: mask,
                                            CTX: ctx,
                                            CTX_PCA: ctx_pca,
                                            CTX_MASK: ctx_mask})
                    ratio = alphas.min(-1).mean()/(alphas.max(-1)).mean()
                    alphas_ratio.append(ratio)
                    np.savetxt(save_dir+'alpha_ratio.txt',alphas_ratio)

                    np.savez(save_dir+'model_current.npz', history_errs=history_errs)
                    saver.save(sess, save_dir+'model_current.ckpt')

                    sess.run(tf.assign(use_noise, False))

                    train_err = -1
                    train_perp = -1
                    valid_err = -1
                    valid_perp = -1
                    test_err = -1
                    test_perp = -1
                    if not debug:
                        # first compute train cost
                        if 0:
                            print 'computing cost on trainset'
                            train_err, train_perp = model.pred_probs(sess, engine, 'train', 
                                    f_log_probs, verbose=model_options['verbose'])
                        else:
                            train_err = 0.
                            train_perp = 0.
                        if 1:
                            print 'validating...'
                            valid_err, valid_perp = model.pred_probs(sess, engine, 'val',
                                    f_log_probs, verbose=model_options['verbose'])
                        else:
                            valid_err = 0.
                            valid_perp = 0.
                        if 0:
                            print 'testing...'
                            test_err, test_perp = model.pred_probs(sess, engine, 'test',
                                    f_log_probs, verbose=model_options['verbose'])
                        else:
                            test_err = 0.
                            test_perp = 0.
                    
                    mean_ranking = 0
                    blue_t0 = time.time()
                    scores, processes, queue, rqueue, shared_params = \
                        metrics.compute_score(sess=sess,
                        model_type='attention',
                        model_archive=None,
                        options=model_options,
                        engine=engine,
                        save_dir=save_dir,
                        beam=5, n_process=5,
                        whichset='both',
                        on_cpu=False,   
                        processes=processes, queue=queue, rqueue=rqueue,
                        shared_params=shared_params, metric=metric,
                        one_time=False,
                        f_init=f_init, f_next=f_next, model=model
                        )
                    '''
                     {'blue': {'test': [-1], 'valid': [77.7, 60.5, 48.7, 38.5, 38.3]},
                     'alternative_valid': {'Bleu_3': 0.40702270203174923,
                     'Bleu_4': 0.29276570520368456,
                     'CIDEr': 0.25247168210607884,
                     'Bleu_2': 0.529069629270047,
                     'Bleu_1': 0.6804308797115253,
                     'ROUGE_L': 0.51083584331688392},
                     'meteor': {'test': [-1], 'valid': [0.282787550236724]}}
                    '''
                    valid_B1 = scores['valid']['Bleu_1']
                    valid_B2 = scores['valid']['Bleu_2']
                    valid_B3 = scores['valid']['Bleu_3']
                    valid_B4 = scores['valid']['Bleu_4']
                    valid_Rouge = scores['valid']['ROUGE_L']
                    valid_Cider = scores['valid']['CIDEr']
                    valid_meteor = scores['valid']['METEOR']
                    test_B1 = scores['test']['Bleu_1']
                    test_B2 = scores['test']['Bleu_2']
                    test_B3 = scores['test']['Bleu_3']
                    test_B4 = scores['test']['Bleu_4']
                    test_Rouge = scores['test']['ROUGE_L']
                    test_Cider = scores['test']['CIDEr']
                    test_meteor = scores['test']['METEOR']
                    print 'computing meteor/blue score used %.4f sec, '\
                      'blue score: %.1f, meteor score: %.1f'%(
                    time.time()-blue_t0, valid_B4, valid_meteor)
                    history_errs.append([eidx, uidx, train_err, train_perp,
                                         valid_perp, test_perp,
                                         valid_err, test_err,
                                         valid_B1, valid_B2, valid_B3,
                                         valid_B4, valid_meteor, valid_Rouge, valid_Cider,
                                         test_B1, test_B2, test_B3,
                                         test_B4, test_meteor, test_Rouge, test_Cider])
                    np.savetxt(save_dir+'train_valid_test.txt',
                                  history_errs, fmt='%.3f')
                    print 'save validation results to %s'%save_dir
                    # save best model according to the best blue or meteor
                    if len(history_errs) > 1 and \
                      valid_B4 > np.array(history_errs)[:-1,11].max():
                        print 'Saving to %s...'%save_dir,
                        np.savez(
                            save_dir+'model_best_blue_or_meteor.npz',
                            history_errs=history_errs)
                        saver.save(sess, save_dir+'model_best_blue_or_meteor.ckpt') # DOUBT
                    if len(history_errs) > 1 and \
                      valid_err < np.array(history_errs)[:-1,6].min():
                        # best_p = utils.unzip(tparams) # DOUBT
                        bad_counter = 0
                        best_valid_err = valid_err
                        uidx_best_valid_err = uidx

                        print 'Saving to %s...'%save_dir,
                        np.savez(save_dir+'model_best_so_far.npz',
                                history_errs=history_errs)
                        saver.save(sess, save_dir+'model_best_so_far.ckpt')
                        utils.write_to_json(model_options, '%smodel_options.json'%save_dir)
                        print 'Done'
                    elif len(history_errs) > 1 and \
                        valid_err >= np.array(history_errs)[:-1,6].min():
                        bad_counter += 1
                        print 'history best ',np.array(history_errs)[:,6].min()
                        print 'bad_counter ',bad_counter
                        print 'patience ',patience
                        if bad_counter > patience:
                            print 'Early Stop!'
                            estop = True
                            break

                    if test_B4>0.52 and test_meteor>0.32:
                        print 'Saving to %s...'%save_dir,
                        np.savez(
                            save_dir+'model_'+str(uidx)+'.npz',
                            history_errs=history_errs)
                        saver.save(sess, save_dir+'model_'+str(uidx)+'.ckpt')

                    print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err, \
                      'best valid err so far',best_valid_err
                    print 'valid took %.2f sec'%(time.time() - t0_valid)
                    # end of validation
                if debug:
                    break

            if estop:
                break
            if debug:
                break

            # end for loop over minibatches
            print 'This epoch has seen %d samples, train cost %.2f'%(
                n_samples, np.mean(train_costs))

        # end for loop over epochs
        print 'Optimization ended.'
        
        print 'stopped at epoch %d, minibatch %d, '\
          'current Train %.2f, current Valid %.2f, current Test %.2f '%(
              eidx,uidx,np.mean(train_err),np.mean(valid_err),np.mean(test_err))
        
        if history_errs != []:
            history = np.asarray(history_errs)
            best_valid_idx = history[:,6].argmin()
            np.savetxt(save_dir+'train_valid_test.txt', history, fmt='%.4f')
            print 'final best exp ', history[best_valid_idx]

        np.savez(
            save_dir+'model_train_end.npz',
            history_errs=history_errs)
        saver.save(sess, save_dir+'model_train_end.ckpt')
    return
Example #22
def train(
        dim_word=100,  # word vector dimensionality
        dim=1000,  # the number of GRU units
        encoder='gru',
        patience=10,  # early stopping patience
        max_epochs=5000,
        finish_after=10000000,  # finish after this many updates
        dispFreq=100,
        decay_c=0.,  # L2 weight decay penalty
        lrate=0.01,
        n_words=100000,  # vocabulary size
        maxlen=100,  # maximum length of the description
        optimizer='rmsprop',
        batch_size=16,
        valid_batch_size=16,
        saveto='model.npz',
        validFreq=1000,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        sampleFreq=100,  # generate some samples after every sampleFreq
        dataset='/data/lisatmp4/anirudhg/wiki.tok.txt.gz',
        valid_dataset='/data/lisatmp4/anirudhg/newstest2011.en.tok',
        dictionary='/data/lisatmp4/anirudhg/wiki.tok.txt.gz.pkl',
        use_dropout=False,
        reload_=False):

    # Model options
    model_options = locals().copy()

    # load dictionary
    with open(dictionary, 'rb') as f:
        worddicts = pkl.load(f)

    # invert dictionary
    worddicts_r = dict()
    for kk, vv in worddicts.iteritems():
        worddicts_r[vv] = kk

    # reload options
    if reload_ and os.path.exists(saveto):
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print 'Loading data'
    train = TextIterator(dataset,
                         dictionary,
                         n_words_source=n_words,
                         batch_size=batch_size,
                         maxlen=maxlen)
    valid = TextIterator(valid_dataset,
                         dictionary,
                         n_words_source=n_words,
                         batch_size=valid_batch_size,
                         maxlen=maxlen)

    print 'Building model'
    params = init_params(model_options)

    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    # create shared variables for parameters
    tparams = init_tparams(params)

    # build the symbolic computational graph
    trng, use_noise, \
        x, x_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask]

    print 'Building sampler'
    f_next = build_sampler(tparams, model_options, trng)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = getattr(optimizers, optimizer)(lr, tparams,
                                                             grads, inps, cost)

    print 'Done'

    print 'Optimization'

    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = list(numpy.load(saveto)['history_errs'])
    best_p = None
    bad_count = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0]) / batch_size

    f = open('workfile', 'w')
    # Training loop
    # generate some samples with the model and display them
    for jj in xrange(5000):
        sample, score = gen_sample(tparams,
                                   f_next,
                                   model_options,
                                   trng=trng,
                                   maxlen=30,
                                   argmax=False)
        print 'Sample ', jj, ': ',
        ss = sample
        for vv in ss:
            if vv == 0:
                break
            if vv in worddicts_r:
                print worddicts_r[vv],
                f.write(worddicts_r[vv])
                f.write(' ')
            else:
                print 'UNK',
                f.write('UNK')
                f.write(' ')

        print '\n'
        f.write('\n')

    f.close()
Example #23
def train(dim_word=100,  # word vector dimensionality
          dim=1000,  # the number of LSTM units
          encoder='gru',
          decoder='gru_cond',
          n_words_src=30000,
          n_words=30000,
          patience=10,  # early stopping patience
          max_epochs=5000,
          finish_after=10000000,  # finish after this many updates
          dispFreq=100,
          decay_c=0.,  # L2 regularization penalty
          alpha_c=0.,  # alignment regularization
          clip_c=-1.,  # gradient clipping threshold
          lrate=1.,  # learning rate
          maxlen=100,  # maximum length of the description
          optimizer='rmsprop',
          batch_size=16,
          saveto='model.npz',
          saveFreq=1000,  # save the parameters after every saveFreq updates
          datasets=[
              '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
              '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'],
          picked_train_idxes_file=r'',
          use_dropout=False,
          reload_=False,
          overwrite=False,
          preload='',
          sort_by_len=False,
          convert_embedding=True,
          dump_before_train=False,
    ):
    # Model options
    model_options = locals().copy()
    if reload_:
        lrate *= 0.5

    # load dictionaries and invert them

    # reload options
    if reload_ and os.path.exists(preload):
        print 'Reloading model options'
        with open(r'.\model\en2fr.iter160000.npz.pkl', 'rb') as f:
            model_options = pkl.load(f)

    print 'Configuration from fy'

    vocab_en_filename = './data/dic/en2fr_en_vocabs_top1M.pkl'
    vocab_fr_filename = './data/dic/en2fr_fr_vocabs_top1M.pkl'
    map_filename = './data/dic/mapFullVocab2Top1MVocab.pkl'
    lr_discount_freq = 80000

    print 'Done'

    print 'Loading data'

    text_iterator = TextIterator(
        datasets[0],
        datasets[1],
        vocab_en_filename,
        vocab_fr_filename,
        batch_size,
        maxlen,
        n_words_src,
        n_words,
    )

    # sys.stdout.flush()
    # train_data_x = pkl.load(open(datasets[0], 'rb'))
    # train_data_y = pkl.load(open(datasets[1], 'rb'))
    #
    # if len(picked_train_idxes_file) != 0:
    #     picked_idxes = pkl.load(open(picked_train_idxes_file, 'rb'))
    #     train_data_x = [train_data_x[id] for id in picked_idxes]
    #     train_data_y = [train_data_y[id] for id in picked_idxes]
    #
    # print 'Total train:', len(train_data_x)
    # print 'Max len:', max([len(x) for x in train_data_x])
    # sys.stdout.flush()
    #
    # if sort_by_len:
    #     slen = np.array([len(s) for s in train_data_x])
    #     sidx = slen.argsort()
    #
    #     _sbuf = [train_data_x[i] for i in sidx]
    #     _tbuf = [train_data_y[i] for i in sidx]
    #
    #     train_data_x = _sbuf
    #     train_data_y = _tbuf
    #     print len(train_data_x[0]), len(train_data_x[-1])
    #     sys.stdout.flush()
    #     train_batch_idx = get_minibatches_idx(len(train_data_x), batch_size, shuffle=False)
    # else:
    #     train_batch_idx = get_minibatches_idx(len(train_data_x), batch_size, shuffle=True)

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(preload):
        print 'Reloading model parameters'
        params = load_params(preload, params)

        # for k, v in params.iteritems():
        #     print '>', k, v.shape, v.dtype

        # Only convert parameters when reloading
        if convert_embedding:
            # =================
            # Convert input and output embedding parameters with an existing word embedding
            # =================
            print 'Convert input and output embedding'

            temp_Wemb = params['Wemb']
            orig_emb_mean = np.mean(temp_Wemb, axis=0)

            params['Wemb'] = np.tile(orig_emb_mean, [params['Wemb'].shape[0], 1])

            # Load vocabulary map dicts and do mapping
            with open(map_filename, 'rb') as map_file:
                map_en = pkl.load(map_file)
                map_fr = pkl.load(map_file)

            for full, top in map_en.iteritems():
                emb_size = temp_Wemb.shape[0]
                if full < emb_size and top < emb_size:
                    params['Wemb'][top] = temp_Wemb[full]

            print 'Convert input embedding done'

            temp_ff_logit_W = params['ff_logit_W']
            temp_Wemb_dec = params['Wemb_dec']
            temp_b = params['ff_logit_b']

            orig_ff_logit_W_mean = np.mean(temp_ff_logit_W, axis=1)
            orig_Wemb_dec_mean = np.mean(temp_Wemb_dec, axis=0)
            orig_b_mean = np.mean(temp_b)

            params['ff_logit_W'] = np.tile(orig_ff_logit_W_mean, [params['ff_logit_W'].shape[1], 1]).T
            params['ff_logit_b'].fill(orig_b_mean)
            params['Wemb_dec'] = np.tile(orig_Wemb_dec_mean, [params['Wemb_dec'].shape[0], 1])

            for full, top in map_en.iteritems():
                emb_size = temp_Wemb.shape[0]
                if full < emb_size and top < emb_size:
                    params['ff_logit_W'][:, top] = temp_ff_logit_W[:, full]
                    params['ff_logit_b'][top] = temp_b[full]
                    params['Wemb_dec'][top] = temp_Wemb[full]

            print 'Convert output embedding done'

            # for k, v in params.iteritems():
            #     print '>', k, v.shape, v.dtype

            # ================
            # End Convert
            # ================

    tparams = init_tparams(params)

    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost, x_emb = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, trng, use_noise)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    f_x_emb = theano.function([x, x_mask], x_emb, profile=profile)
    print 'Done'
    sys.stdout.flush()
    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(np.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(np.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0)) ** 2).sum(1).mean()
        cost += alpha_reg

    # after all regularizers - compile the computational graph for cost
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'
    sys.stdout.flush()
    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g ** 2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (clip_c ** 2),
                                           g / tensor.sqrt(g2) * clip_c,
                                           g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
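    # 'optimizer' is the optimizer's function name given as a string (e.g. 'rmsprop');
    # eval() resolves it to the function that compiles and returns f_grad_shared and f_update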
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
    print 'Done'

    print 'Optimization'

    best_p = None
    bad_counter = 0
    uidx = 0
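    # when resuming, recover the update counter from the checkpoint filename
    # (checkpoints are saved as '<name>.iter<uidx>.npz')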
    if reload_:
        m = re.search(r'.+iter(\d+?)\.npz', preload)
        if m:
            uidx = int(m.group(1))
    print 'uidx', uidx, 'l_rate', lrate

    estop = False
    history_errs = []
    # reload history

    if dump_before_train:
        print 'Dumping before train...',
        saveto_uidx = '{}.iter{}.npz'.format(
            os.path.splitext(saveto)[0], uidx)
        np.savez(saveto_uidx, history_errs=history_errs,
                 uidx=uidx, **unzip(tparams))
        print 'Done'

    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    for eidx in xrange(max_epochs):
        n_samples = 0

        # for i, batch_idx in train_batch_idx:
        #
        #     x = [train_data_x[id] for id in batch_idx]
        #     y = [train_data_y[id] for id in batch_idx]

        for i, (x, y) in enumerate(text_iterator):
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            x, x_mask, y, y_mask = prepare_data(x, y)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()

            # compute cost, grads and copy grads to shared variables
            cost = f_grad_shared(x, x_mask, y, y_mask)

            # do the update on parameters
            f_update(lrate)

            ud = time.time() - ud_start

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if np.isnan(cost) or np.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            # halve the learning rate every lr_discount_freq updates
            if lr_discount_freq > 0 and np.mod(uidx, lr_discount_freq) == 0:
                lrate *= 0.5
                print 'Discount learning rate to {} at iteration {}'.format(lrate, uidx)

            # verbose
            if np.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud
                sys.stdout.flush()

            if np.mod(uidx, saveFreq) == 0:
                # save with uidx
                if not overwrite:
                    # print 'Saving the model at iteration {}...'.format(uidx),
                    saveto_uidx = '{}.iter{}.npz'.format(
                            os.path.splitext(saveto)[0], uidx)
                    np.savez(saveto_uidx, history_errs=history_errs,
                             uidx=uidx, **unzip(tparams))
                    # print 'Done'
                    # sys.stdout.flush()
            # generate some samples with the model and display them

            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after %d iterations!' % uidx
                estop = True
                break

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)

    return 0.
Example #24
0
def train(worker, model_options, data_options,
          patience,  # early stopping patience
          max_epochs,
          finish_after,  # finish after this many updates
          decay_c,  # L2 regularization penalty
          alpha_c,  # alignment regularization
          clip_c,  # gradient clipping threshold
          lrate,  # learning rate
          optimizer,
          saveto,
          valid_freq,
          train_len,
          valid_sync,
          save_freq,   # save the parameters after every saveFreq updates
          sample_freq,   # generate some samples after every sampleFreq
          control_port,
          batch_port,
          log_port,
          reload_):

    LOGGER.info('Connecting to data socket ({}) and loading validation data'
                .format(batch_port))
    worker.init_mb_sock(batch_port)
    _, _, valid_stream = load_data(**data_options)

    LOGGER.info('Building model')
    params = init_params(model_options)
    # reload parameters
    experiment_id = worker.send_req('experiment_id')
    model_filename = '{}.model.npz'.format(experiment_id)
    saveto_filename = '{}.npz'.format(saveto)
    if reload_ and os.path.exists(saveto_filename):
        LOGGER.info('Loading parameters from {}'.format(saveto_filename))
        params = load_params(saveto_filename, params)

    LOGGER.info('Initializing parameters')
    tparams = init_tparams(params)
    alpha = worker.send_req('alpha')
    worker.init_shared_params(tparams.values(), param_sync_rule=EASGD(alpha))

    # use_noise is for dropout
    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    LOGGER.info('Building sampler')
    f_init, f_next = build_sampler(tparams, model_options, trng)

    # before any regularizer
    LOGGER.info('Building f_log_probs')
    f_log_probs = theano.function(inps, cost, profile=False)

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in six.iteritems(tparams):
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * ((tensor.cast(
            y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
            opt_ret['dec_alphas'].sum(0)) ** 2).sum(1).mean()
        cost += alpha_reg

    # Not used?
    # after all regularizers - compile the computational graph for cost
    # LOGGER.info('Building f_cost')
    # f_cost = theano.function(inps, cost, profile=False)

    LOGGER.info('Computing gradient')
    grads = tensor.grad(cost, wrt=itemlist(tparams))

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g ** 2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(tensor.switch(g2 > (clip_c ** 2), g / tensor.sqrt(
                g2) * clip_c, g))
        grads = new_grads

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    LOGGER.info('Building optimizers')
    f_grad_shared, f_update = getattr(optimizers, optimizer)(lr, tparams,
                                                             grads, inps, cost)

    LOGGER.info('Optimization')

    log = RemoteLogger(port=log_port)
    train_start = time.clock()
    best_p = None

    # Make sure that the worker starts training with the most recent params
    worker.copy_to_local()

    uidx = 0
    while True:
        step = worker.send_req('next')
        LOGGER.debug('Received command: {}'.format(step))
        if step == 'train':
            use_noise.set_value(1.)
            for i in xrange(train_len):
                x, x_mask, y, y_mask = worker.recv_mb()

                uidx += 1
                log_entry = {'iteration': uidx}

                # compute cost, grads and copy grads to shared variables
                update_start = time.clock()
                cost = f_grad_shared(x, x_mask, y, y_mask)
                f_update(lrate)

                log_entry['cost'] = float(cost)
                log_entry['average_source_length'] = \
                    float(x_mask.sum(0).mean())
                log_entry['average_target_length'] = \
                    float(y_mask.sum(0).mean())
                log_entry['update_time'] = time.clock() - update_start
                log_entry['train_time'] = time.clock() - train_start
                log_entry['time'] = time.time()
                log.log(log_entry)

            step = worker.send_req({'done': train_len})
            LOGGER.debug("Syncing with global params")
            worker.sync_params(synchronous=True)

        if step == 'valid':
            if valid_sync:
                worker.copy_to_local()
            use_noise.set_value(0.)
            valid_errs = pred_probs(f_log_probs, model_options, valid_stream)
            valid_err = float(valid_errs.mean())
            res = worker.send_req({'valid_err': valid_err})
            log.log({'validation_cost': valid_err,
                     'train_time': time.clock() - train_start,
                     'time': time.time()})

            if res == 'best' and saveto:
                best_p = unzip(tparams)
                save_params(best_p, model_filename, saveto_filename)

            if valid_sync:
                worker.copy_to_local()

        if step == 'stop':
            break

    # Release all shared resources.
    worker.close()
def train(dim_word_desc=400,# word vector dimensionality
          dim_word_q=400,
          dim_word_ans=600,
          dim_proj=300,
          dim=400,# the number of LSTM units
          encoder_desc='lstm',
          encoder_desc_word='lstm',
          encoder_desc_sent='lstm',
          use_dq_sims=False,
          eyem=None,
          learn_h0=False,
          use_desc_skip_c_g=False,
          debug=False,
          encoder_q='lstm',
          patience=10,
          max_epochs=5000,
          dispFreq=100,
          decay_c=0.,
          alpha_c=0.,
          clip_c=-1.,
          lrate=0.01,
          n_words_q=49145,
          n_words_desc=115425,
          n_words_ans=409,
          pkl_train_files=None,
          pkl_valid_files=None,
          maxlen=2000, # maximum length of the description
          optimizer='rmsprop',
          batch_size=2,
          vocab=None,
          valid_batch_size=16,
          use_elu_g=False,
          saveto='model.npz',
          model_dir=None,
          ms_nlayers=3,
          validFreq=1000,
          saveFreq=1000, # save the parameters after every saveFreq updates
          datasets=[None],
          truncate=400,
          momentum=0.9,
          use_bidir=False,
          cost_mask=None,
          valid_datasets=['/u/yyu/stor/caglar/rc-data/cnn/cnn_test_data.h5',
                          '/u/yyu/stor/caglar/rc-data/cnn/cnn_valid_data.h5'],
          dropout_rate=0.5,
          use_dropout=True,
          reload_=True,
          **opt_ds):

    ensure_dir_exists(model_dir)
    mpath = os.path.join(model_dir, saveto)
    mpath_best = os.path.join(model_dir, prfx("best", saveto))
    mpath_last = os.path.join(model_dir, prfx("last", saveto))
    mpath_stats = os.path.join(model_dir, prfx("stats", saveto))

    # Model options
    model_options = locals().copy()
    model_options['use_sent_reps'] = opt_ds['use_sent_reps']
    stats = defaultdict(list)

    del model_options['eyem']
    del model_options['cost_mask']

    if cost_mask is not None:
        cost_mask = sharedX(cost_mask)

    # reload options and parameters
    if reload_:
        print "Reloading the model."
        if os.path.exists(mpath_best):
            print "Reloading the best model from %s." % mpath_best
            with open('%s.pkl' % mpath_best, 'rb') as f:
                model_options = pkl.load(f)
            params = init_params(model_options)
            params = load_params(mpath_best, params)
        elif os.path.exists(mpath):
            print "Reloading the model from %s." % mpath
            with open('%s.pkl' % mpath, 'rb') as f:
                model_options = pkl.load(f)
            params = init_params(model_options)
            params = load_params(mpath, params)
        else:
            raise IOError("Couldn't open the file.")
    else:
        print "Couldn't reload the models initializing from scratch."
        params = init_params(model_options)

    if datasets[0]:
        print "Short dataset", datasets[0]

    print 'Loading data'
    print 'Building model'
    if pkl_train_files is None or pkl_valid_files is None:
        train, valid, test = load_data(path=datasets[0],
                                       valid_path=valid_datasets[0],
                                       test_path=valid_datasets[1],
                                       batch_size=batch_size,
                                       **opt_ds)
    else:
        train, valid, test = load_pkl_data(train_file_paths=pkl_train_files,
                                           valid_file_paths=pkl_valid_files,
                                           batch_size=batch_size,
                                           vocab=vocab,
                                           eyem=eyem,
                                           **opt_ds)

    tparams = init_tparams(params)
    trng, use_noise, inps_d, \
                     opt_ret, \
                     cost, errors, ent_errors, ent_derrors, probs = \
                        build_model(tparams,
                                    model_options,
                                    prepare_data if not opt_ds['use_sent_reps'] \
                                            else prepare_data_sents,
                                    valid,
                                    cost_mask=cost_mask)

    alphas = opt_ret['dec_alphas']

    if opt_ds['use_sent_reps']:
        inps = [inps_d["desc"], \
                inps_d["word_mask"], \
                inps_d["q"], \
                inps_d['q_mask'], \
                inps_d['ans'], \
                inps_d['wlen'],
                inps_d['slen'], inps_d['qlen'],\
                inps_d['ent_mask']
                ]
    else:
        inps = [inps_d["desc"], \
                inps_d["word_mask"], \
                inps_d["q"], \
                inps_d['q_mask'], \
                inps_d['ans'], \
                inps_d['wlen'], \
                inps_d['qlen'], \
                inps_d['ent_mask']]

    outs = [cost, errors, probs, alphas]
    if ent_errors:
        outs += [ent_errors]

    if ent_derrors:
        outs += [ent_derrors]

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, outs, profile=profile)
    print 'Done'

    # Apply weight decay on the feed-forward connections
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.

        for kk, vv in tparams.iteritems():
            if "logit" in kk or "ff" in kk:
                weight_decay += (vv ** 2).sum()

        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer
    print 'Computing gradient...',
    grads = safe_grad(cost, itemlist(tparams))
    print 'Done'

    # Gradient clipping:
    if clip_c > 0.:
        g2 = get_norms(grads)
        for p, g in grads.iteritems():
            grads[p] = tensor.switch(g2 > (clip_c**2),
                                     (g / tensor.sqrt(g2 + 1e-8)) * clip_c,
                                     g)
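    # drop the last input (ent_mask) before compiling the optimizer functions below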
    inps.pop()
    if optimizer.lower() == "adasecant":
        learning_rule = Adasecant(delta_clip=25.0,
                                  use_adagrad=True,
                                  grad_clip=0.25,
                                  gamma_clip=0.)
    elif optimizer.lower() == "rmsprop":
        learning_rule = RMSPropMomentum(init_momentum=momentum)
    elif optimizer.lower() == "adam":
        learning_rule = Adam()
    elif optimizer.lower() == "adadelta":
        learning_rule = AdaDelta()

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    learning_rule = None
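    # NOTE: learning_rule is reset to None just above, so the eval(optimizer) branch below
    # is always taken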

    if learning_rule:
        f_grad_shared, f_update = learning_rule.get_funcs(learning_rate=lr,
                                                          grads=grads,
                                                          inp=inps,
                                                          cost=cost,
                                                          errors=errors)
    else:
        f_grad_shared, f_update = eval(optimizer)(lr,
                                                  tparams,
                                                  grads,
                                                  inps,
                                                  cost,
                                                  errors)

    print 'Done'
    print 'Optimization'
    history_errs = []
    # reload history
    if reload_ and os.path.exists(mpath):
        history_errs = list(numpy.load(mpath)['history_errs'])

    best_p = None
    bad_count = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size

    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    best_found = False
    uidx = 0
    estop = False

    train_cost_ave, train_err_ave, \
            train_gnorm_ave = reset_train_vals()

    for eidx in xrange(max_epochs):
        n_samples = 0

        if train.done:
            train.reset()

        for d_, q_, a, em in train:
            n_samples += len(a)
            uidx += 1
            use_noise.set_value(1.)

            if opt_ds['use_sent_reps']:
                # To mask the description and the question.
                d, d_mask, q, q_mask, dlen, slen, qlen = prepare_data_sents(d_,
                                                                            q_)

                if d is None:
                    print 'Minibatch with zero sample under length ', maxlen
                    uidx -= 1
                    continue

                ud_start = time.time()
                cost, errors, gnorm, pnorm = f_grad_shared(d,
                                                           d_mask,
                                                           q,
                                                           q_mask,
                                                           a,
                                                           dlen,
                                                           slen,
                                                           qlen)
            else:
                d, d_mask, q, q_mask, dlen, qlen = prepare_data(d_, q_)

                if d is None:
                    print 'Minibatch with zero sample under length ', maxlen
                    uidx -= 1
                    continue

                ud_start = time.time()
                cost, errors, gnorm, pnorm = f_grad_shared(d, d_mask,
                                                           q, q_mask,
                                                           a,
                                                           dlen,
                                                           qlen)

            upnorm = f_update(lrate)
            ud = time.time() - ud_start

            # Collect the running ave train stats.
            train_cost_ave = running_ave(train_cost_ave,
                                         cost)
            train_err_ave = running_ave(train_err_ave,
                                        errors)
            train_gnorm_ave = running_ave(train_gnorm_ave,
                                          gnorm)

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                import ipdb; ipdb.set_trace()

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, ' Update ', uidx, \
                        ' Cost ', cost, ' UD ', ud, \
                        ' UpNorm ', upnorm[0].tolist(), \
                        ' GNorm ', gnorm, \
                        ' Pnorm ', pnorm, 'Terrors ', errors

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                if best_p is not None and best_found:
                    numpy.savez(mpath_best, history_errs=history_errs, **best_p)
                    pkl.dump(model_options, open('%s.pkl' % mpath_best, 'wb'))
                else:
                    params = unzip(tparams)

                numpy.savez(mpath, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % mpath, 'wb'))
                pkl.dump(stats, open("%s.pkl" % mpath_stats, 'wb'))

                print 'Done'
                print_param_norms(tparams)

            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                if valid.done:
                    valid.reset()

                valid_costs, valid_errs, valid_probs, \
                        valid_alphas, error_ent, error_dent = eval_model(f_log_probs,
                                                  prepare_data if not opt_ds['use_sent_reps'] \
                                                    else prepare_data_sents,
                                                  model_options,
                                                  valid,
                                                  use_sent_rep=opt_ds['use_sent_reps'])

                valid_alphas_ = numpy.concatenate([va.argmax(0) for va  in valid_alphas.tolist()], axis=0)
                valid_err = valid_errs.mean()
                valid_cost = valid_costs.mean()
                valid_alpha_ent = -negentropy(valid_alphas)

                mean_valid_alphas = valid_alphas_.mean()
                std_valid_alphas = valid_alphas_.std()

                mean_valid_probs = valid_probs.argmax(1).mean()
                std_valid_probs = valid_probs.argmax(1).std()

                history_errs.append([valid_cost, valid_err])

                stats['train_err_ave'].append(train_err_ave)
                stats['train_cost_ave'].append(train_cost_ave)
                stats['train_gnorm_ave'].append(train_gnorm_ave)

                stats['valid_errs'].append(valid_err)
                stats['valid_costs'].append(valid_cost)
                stats['valid_err_ent'].append(error_ent)
                stats['valid_err_desc_ent'].append(error_dent)

                stats['valid_alphas_mean'].append(mean_valid_alphas)
                stats['valid_alphas_std'].append(std_valid_alphas)
                stats['valid_alphas_ent'].append(valid_alpha_ent)

                stats['valid_probs_mean'].append(mean_valid_probs)
                stats['valid_probs_std'].append(std_valid_probs)

                if uidx == 0 or valid_err <= numpy.array(history_errs)[:, 1].min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                    best_found = True
                else:
                    best_found = False

                if numpy.isnan(valid_err):
                    import ipdb; ipdb.set_trace()


                print "============================"
                print '\t>>>Valid error: ', valid_err, \
                        ' Valid cost: ', valid_cost
                print '\t>>>Valid pred mean: ', mean_valid_probs, \
                        ' Valid pred std: ', std_valid_probs
                print '\t>>>Valid alphas mean: ', mean_valid_alphas, \
                        ' Valid alphas std: ', std_valid_alphas, \
                        ' Valid alpha negent: ', valid_alpha_ent, \
                        ' Valid error ent: ', error_ent, \
                        ' Valid error desc ent: ', error_dent

                print "============================"
                print "Running average train stats "
                print '\t>>>Train error: ', train_err_ave, \
                        ' Train cost: ', train_cost_ave, \
                        ' Train grad norm: ', train_gnorm_ave
                print "============================"


                train_cost_ave, train_err_ave, \
                    train_gnorm_ave = reset_train_vals()


        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid.reset()
    valid_cost, valid_error, valid_probs, \
            valid_alphas, error_ent, error_dent = eval_model(f_log_probs,
                                      prepare_data if not opt_ds['use_sent_reps'] \
                                           else prepare_data_sents,
                                      model_options, valid,
                                      use_sent_rep=opt_ds['use_sent_reps'])

    print " Final eval resuts: "
    print 'Valid error: ', valid_error.mean()
    print 'Valid cost: ', valid_cost.mean()
    print '\t>>>Valid pred mean: ', valid_probs.mean(), \
            ' Valid pred std: ', valid_probs.std(), \
            ' Valid error ent: ', error_ent

    params = copy.copy(best_p)

    numpy.savez(mpath_last,
                zipped_params=best_p,
                history_errs=history_errs,
                **params)

    return valid_err, valid_cost
def train(options, data, load_params=False, start_epoc=0):
    print "OPTIONS: ", options
    print 'Setting up model with options:'
    options = set_defaults(options)
    for kk, vv in options.iteritems():
        print kk, vv
    print "model seed: ", options['model_seed']
    print "fold: ", options['fold']
    print 'seed: ', options['seed']
    rng = numpy.random.RandomState(options['model_seed'] +
                                   100 * options.get('fold', 99) +
                                   options.get('seed', 99))
    params, operators = init_params(options, rng)
    print 'done...'

    if load_params:
        loaded = load_par(options)
        start_epoc = resume_epoc(options)
        # Check that we've loaded the correct parameters...
        for kk, vv in loaded.iteritems():
            assert params[kk].shape == vv.shape
            assert type(params[kk]) == type(vv)
        params = loaded

    tparams = init_tparams(params)

    trng, use_noise, inps, out = build_model(tparams, options, rng)
    y = tensor.imatrix('y')
    cost = nll(out, y)

    f_eval = theano.function([inps, y],
                             cost,
                             givens={use_noise: numpy.float32(0.)},
                             on_unused_input='ignore')

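    # accumulate L1 and L2 penalties on parameters whose names start with 'hidden'
    # or end in 'W_h'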
    reg = 0.
    for k, v in tparams.iteritems():
        if k[:6] == 'hidden' or k[-3:] == 'W_h':
            reg += options['l1'] * tensor.sum(abs(v))
            reg += options['l2'] * tensor.sum((v)**2)

    cost += reg

    grads = tensor.grad(cost, wrt=itemlist(tparams))
    lr = tensor.scalar(name='lr', dtype=theano.config.floatX)
    opt = get_optim(options['opt'])
    print 'Compiling functions'
    f_grad_shared, f_update, gshared = opt(lr, tparams, grads, [inps, y], cost,
                                           use_noise)
    f_out = theano.function([inps],
                            out,
                            givens={use_noise: numpy.float32(0.)},
                            on_unused_input='ignore',
                            allow_input_downcast=True)

    best = numpy.inf
    print 'Starting training'

    train = list_update(data[0], f_eval, options['batch_size'], rng=rng)
    test = list_update(data[-1], f_eval, options['batch_size'], rng=rng)
    starting = (train, test)
    print 'Pre-training. test: %f, train: %f' % (test, train)
    print 'Training'
    lr = options['lr']
    max_itr = options['max_itr']
    grad_norm = 0.
    train_scores = 50 * [0.]
    try:
        for epoch in xrange(max_itr):
            start_time = time.time()
            for g in gshared:
                # manually set gradients to 0 because we accumulate in list update
                g.set_value(0.0 * g.get_value())
            use_noise.set_value(1.)
            train_cost, n_obs = list_update(data[0],
                                            f_grad_shared,
                                            batchsize=options['batch_size'],
                                            rng=rng,
                                            return_n_obs=True)
            use_noise.set_value(0.)
            for g in gshared:
                g.set_value(floatx(g.get_value() / float(n_obs)))
            f_update(lr)
            apply_proximity(tparams, operators)
            train = list_update(data[0],
                                f_eval,
                                options['batch_size'],
                                rng=rng)
            elapsed_time = time.time() - start_time

            if train < best:
                # early stopping on training set
                test = list_update(data[-1], f_eval)
                best_par = unzip(tparams)
                best_perf = (train, test)
                best = train

            test = list_update(data[-1], f_eval)

            if (epoch % 50) == 0:
                # Save progress....
                save_progress(options, tparams, epoch, best_perf)
                print 'Epoch: %d, cost: %f, train: %f, test: %f, lr:%f, time: %f' % (
                    epoch, train_cost, train, test, lr, elapsed_time)

            # Check if we're diverging...
            train_ave = running_ave(train_scores, train, epoch)

            if epoch > 1000:
                # Only exit if we're diverging after 1000 iterations
                if train_ave > 1.03 * best_perf[0]:
                    print "Diverged..."
                    break
    except KeyboardInterrupt:
        print "Interrupted"
    # check that we're outputting probability distributions
    X = data[0][(3, 3)][0]
    assert abs(
        f_out(X.reshape(X.shape[0], 2, 3, 3)).sum() - float(X.shape[0])) < 1e-4
    print "Best performance:"
    print "train, test"
    print "%f,%f" % best_perf
    return best_perf, best_par
Example #27
0
def train(
        dim_word=100,  # word vector dimensionality
        dim=1000,  # the number of LSTM units
        encoder='gru',
        decoder='gru_cond',
        patience=10,
        max_epochs=5000,
        dispFreq=100,
        decay_c=0.,
        alpha_c=0.,
        diag_c=0.,
        clip_c=-1.,
        lrate=0.01,
        n_words_src=100000,
        n_words=100000,
        maxlen=100,  # maximum length of the description
        optimizer='rmsprop',
        batch_size=16,
        valid_batch_size=16,
        saveto='model.npz',
        validFreq=1000,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        sampleFreq=100,  # generate some samples after every sampleFreq updates
        datasets=[
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'
        ],
        valid_datasets=[
            '../data/dev/newstest2011.en.tok',
            '../data/dev/newstest2011.fr.tok'
        ],
        dictionaries=[
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok.pkl',
            '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok.pkl'
        ],
        use_dropout=False,
        reload_=False):

    # Model options
    model_options = locals().copy()

    worddicts = [None] * len(dictionaries)
    worddicts_r = [None] * len(dictionaries)
    for ii, dd in enumerate(dictionaries):
        with open(dd, 'rb') as f:
            worddicts[ii] = pkl.load(f)
        worddicts_r[ii] = dict()
        for kk, vv in worddicts[ii].iteritems():
            worddicts_r[ii][vv] = kk

    # reload options
    if reload_ and os.path.exists(saveto):
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print 'Loading data'
    train = TextIterator(datasets[0],
                         datasets[1],
                         dictionaries[0],
                         dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words,
                         batch_size=batch_size,
                         maxlen=maxlen)
    valid = TextIterator(valid_datasets[0],
                         valid_datasets[1],
                         dictionaries[0],
                         dictionaries[1],
                         n_words_source=n_words_src,
                         n_words_target=n_words,
                         batch_size=valid_batch_size,
                         maxlen=maxlen)

    print 'Building model'
    params = init_params(model_options)
    # reload parameters
    if reload_ and os.path.exists(saveto):
        params = load_params(saveto, params)

    tparams = init_tparams(params)

    trng, use_noise, \
          x, x_mask, y, y_mask, \
          opt_ret, \
          cost = \
          build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, trng)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile)
    print 'Done'

    cost = cost.mean()

    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
        cost += alpha_reg

    # after any regularizer
    print 'Building f_cost...',
    f_cost = theano.function(inps, cost, profile=profile)
    print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(tparams))
    print 'Done'
    print 'Building f_grad...',
    f_grad = theano.function(inps, grads, profile=profile)
    print 'Done'

    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c,
                              g))
        grads = new_grads

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost)
    print 'Done'

    print 'Optimization'

    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = list(numpy.load(saveto)['history_errs'])
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0]) / batch_size

    uidx = 0
    estop = False
    for eidx in xrange(max_epochs):
        n_samples = 0

        for x, y in train:
            n_samples += len(x)
            uidx += 1
            use_noise.set_value(1.)

            x, x_mask, y, y_mask = prepare_data(x,
                                                y,
                                                maxlen=maxlen,
                                                n_words_src=n_words_src,
                                                n_words=n_words)

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                uidx -= 1
                continue

            ud_start = time.time()
            cost = f_grad_shared(x, x_mask, y, y_mask)
            f_update(lrate)
            ud = time.time() - ud_start

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'UD ', ud

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',

                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)

                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

            if numpy.mod(uidx, sampleFreq) == 0:
                # FIXME: random selection?
                for jj in xrange(numpy.minimum(5, x.shape[1])):
                    stochastic = True
                    sample, score = gen_sample(tparams,
                                               f_init,
                                               f_next,
                                               x[:, jj][:, None],
                                               model_options,
                                               trng=trng,
                                               k=1,
                                               maxlen=30,
                                               stochastic=stochastic,
                                               argmax=False)
                    print 'Source ', jj, ': ',
                    for vv in x[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[0]:
                            print worddicts_r[0][vv],
                        else:
                            print 'UNK',
                    print
                    print 'Truth ', jj, ' : ',
                    for vv in y[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[1]:
                            print worddicts_r[1][vv],
                        else:
                            print 'UNK',
                    print
                    print 'Sample ', jj, ': ',
                    if stochastic:
                        ss = sample
                    else:
                        score = score / numpy.array([len(s) for s in sample])
                        ss = sample[score.argmin()]
                    for vv in ss:
                        if vv == 0:
                            break
                        if vv in worddicts_r[1]:
                            print worddicts_r[1][vv],
                        else:
                            print 'UNK',
                    print

            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                valid_errs = pred_probs(f_log_probs, prepare_data,
                                        model_options, valid)
                valid_err = valid_errs.mean()
                history_errs.append(valid_err)

                if uidx == 0 or valid_err <= numpy.array(history_errs).min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                if len(history_errs) > patience and valid_err >= numpy.array(
                        history_errs)[:-patience].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                if numpy.isnan(valid_err):
                    import ipdb
                    ipdb.set_trace()

                print 'Valid ', valid_err

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid_err = pred_probs(f_log_probs, prepare_data, model_options,
                           valid).mean()
    print 'Valid ', valid_err
    params = copy.copy(best_p)
    numpy.savez(saveto,
                zipped_params=best_p,
                history_errs=history_errs,
                **params)

    return valid_err
def train(
        dim_word_desc=400,  # word vector dimensionality
        dim_word_q=400,
        dim_word_ans=600,
        dim_proj=300,
        dim=400,  # the number of LSTM units
        encoder_desc='lstm',
        encoder_desc_word='lstm',
        encoder_desc_sent='lstm',
        use_dq_sims=False,
        eyem=None,
        learn_h0=False,
        use_desc_skip_c_g=False,
        debug=False,
        encoder_q='lstm',
        patience=10,
        max_epochs=5000,
        dispFreq=100,
        decay_c=0.,
        alpha_c=0.,
        clip_c=-1.,
        lrate=0.01,
        n_words_q=49145,
        n_words_desc=115425,
        n_words_ans=409,
        pkl_train_files=None,
        pkl_valid_files=None,
        maxlen=2000,  # maximum length of the description
        optimizer='rmsprop',
        batch_size=2,
        vocab=None,
        valid_batch_size=16,
        use_elu_g=False,
        saveto='model.npz',
        model_dir=None,
        ms_nlayers=3,
        validFreq=1000,
        saveFreq=1000,  # save the parameters after every saveFreq updates
        datasets=[None],
        truncate=400,
        momentum=0.9,
        use_bidir=False,
        cost_mask=None,
        valid_datasets=[
            '/u/yyu/stor/caglar/rc-data/cnn/cnn_test_data.h5',
            '/u/yyu/stor/caglar/rc-data/cnn/cnn_valid_data.h5'
        ],
        dropout_rate=0.5,
        use_dropout=True,
        reload_=True,
        **opt_ds):

    ensure_dir_exists(model_dir)
    mpath = os.path.join(model_dir, saveto)
    mpath_best = os.path.join(model_dir, prfx("best", saveto))
    mpath_last = os.path.join(model_dir, prfx("last", saveto))
    mpath_stats = os.path.join(model_dir, prfx("stats", saveto))

    # Model options
    model_options = locals().copy()
    model_options['use_sent_reps'] = opt_ds['use_sent_reps']
    stats = defaultdict(list)

    del model_options['eyem']
    del model_options['cost_mask']

    if cost_mask is not None:
        cost_mask = sharedX(cost_mask)

    # reload options and parameters
    if reload_:
        print "Reloading the model."
        if os.path.exists(mpath_best):
            print "Reloading the best model from %s." % mpath_best
            with open('%s.pkl' % mpath_best, 'rb') as f:
                model_options = pkl.load(f)
            params = init_params(model_options)
            params = load_params(mpath_best, params)
        elif os.path.exists(mpath):
            print "Reloading the model from %s." % mpath
            with open('%s.pkl' % mpath, 'rb') as f:
                model_options = pkl.load(f)
            params = init_params(model_options)
            params = load_params(mpath, params)
        else:
            raise IOError("Couldn't open the file.")
    else:
        print "Couldn't reload the models initializing from scratch."
        params = init_params(model_options)

    if datasets[0]:
        print "Short dataset", datasets[0]

    print 'Loading data'
    print 'Building model'
    if pkl_train_files is None or pkl_valid_files is None:
        train, valid, test = load_data(path=datasets[0],
                                       valid_path=valid_datasets[0],
                                       test_path=valid_datasets[1],
                                       batch_size=batch_size,
                                       **opt_ds)
    else:
        train, valid, test = load_pkl_data(train_file_paths=pkl_train_files,
                                           valid_file_paths=pkl_valid_files,
                                           batch_size=batch_size,
                                           vocab=vocab,
                                           eyem=eyem,
                                           **opt_ds)

    tparams = init_tparams(params)
    trng, use_noise, inps_d, \
                     opt_ret, \
                     cost, errors, ent_errors, ent_derrors, probs = \
                        build_model(tparams,
                                    model_options,
                                    prepare_data if not opt_ds['use_sent_reps'] \
                                            else prepare_data_sents,
                                    valid,
                                    cost_mask=cost_mask)

    alphas = opt_ret['dec_alphas']

    if opt_ds['use_sent_reps']:
        inps = [inps_d["desc"], \
                inps_d["word_mask"], \
                inps_d["q"], \
                inps_d['q_mask'], \
                inps_d['ans'], \
                inps_d['wlen'],
                inps_d['slen'], inps_d['qlen'],\
                inps_d['ent_mask']
                ]
    else:
        inps = [inps_d["desc"], \
                inps_d["word_mask"], \
                inps_d["q"], \
                inps_d['q_mask'], \
                inps_d['ans'], \
                inps_d['wlen'], \
                inps_d['qlen'], \
                inps_d['ent_mask']]

    outs = [cost, errors, probs, alphas]
    if ent_errors:
        outs += [ent_errors]

    if ent_derrors:
        outs += [ent_derrors]

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, outs, profile=profile)
    print 'Done'

    # Apply weight decay on the feed-forward connections
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.

        for kk, vv in tparams.iteritems():
            if "logit" in kk or "ff" in kk:
                weight_decay += (vv**2).sum()

        weight_decay *= decay_c
        cost += weight_decay

    # after any regularizer
    print 'Computing gradient...',
    grads = safe_grad(cost, itemlist(tparams))
    print 'Done'

    # Gradient clipping:
    if clip_c > 0.:
        g2 = get_norms(grads)
        for p, g in grads.iteritems():
            grads[p] = tensor.switch(g2 > (clip_c**2),
                                     (g / tensor.sqrt(g2 + 1e-8)) * clip_c, g)
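    # drop the last input (ent_mask) before compiling the optimizer functions below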
    inps.pop()
    if optimizer.lower() == "adasecant":
        learning_rule = Adasecant(delta_clip=25.0,
                                  use_adagrad=True,
                                  grad_clip=0.25,
                                  gamma_clip=0.)
    elif optimizer.lower() == "rmsprop":
        learning_rule = RMSPropMomentum(init_momentum=momentum)
    elif optimizer.lower() == "adam":
        learning_rule = Adam()
    elif optimizer.lower() == "adadelta":
        learning_rule = AdaDelta()

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    learning_rule = None
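    # NOTE: learning_rule is reset to None just above, so the eval(optimizer) branch below
    # is always taken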

    if learning_rule:
        f_grad_shared, f_update = learning_rule.get_funcs(learning_rate=lr,
                                                          grads=grads,
                                                          inp=inps,
                                                          cost=cost,
                                                          errors=errors)
    else:
        f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps,
                                                  cost, errors)

    print 'Done'
    print 'Optimization'
    history_errs = []
    # reload history
    if reload_ and os.path.exists(mpath):
        history_errs = list(numpy.load(mpath)['history_errs'])

    best_p = None
    bad_count = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size

    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    best_found = False
    uidx = 0
    estop = False

    train_cost_ave, train_err_ave, \
            train_gnorm_ave = reset_train_vals()

    for eidx in xrange(max_epochs):
        n_samples = 0

        if train.done:
            train.reset()

        for d_, q_, a, em in train:
            n_samples += len(a)
            uidx += 1
            use_noise.set_value(1.)

            if opt_ds['use_sent_reps']:
                # To mask the description and the question.
                d, d_mask, q, q_mask, dlen, slen, qlen = prepare_data_sents(
                    d_, q_)

                if d is None:
                    print 'Minibatch with zero sample under length ', maxlen
                    uidx -= 1
                    continue

                ud_start = time.time()
                cost, errors, gnorm, pnorm = f_grad_shared(
                    d, d_mask, q, q_mask, a, dlen, slen, qlen)
            else:
                d, d_mask, q, q_mask, dlen, qlen = prepare_data(d_, q_)

                if d is None:
                    print 'Minibatch with zero sample under length ', maxlen
                    uidx -= 1
                    continue

                ud_start = time.time()
                cost, errors, gnorm, pnorm = f_grad_shared(
                    d, d_mask, q, q_mask, a, dlen, qlen)

            upnorm = f_update(lrate)
            ud = time.time() - ud_start

            # Collect the running ave train stats.
            train_cost_ave = running_ave(train_cost_ave, cost)
            train_err_ave = running_ave(train_err_ave, errors)
            train_gnorm_ave = running_ave(train_gnorm_ave, gnorm)

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                import ipdb
                ipdb.set_trace()

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, ' Update ', uidx, \
                        ' Cost ', cost, ' UD ', ud, \
                        ' UpNorm ', upnorm[0].tolist(), \
                        ' GNorm ', gnorm, \
                        ' Pnorm ', pnorm, 'Terrors ', errors

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                if best_p is not None and best_found:
                    numpy.savez(mpath_best,
                                history_errs=history_errs,
                                **best_p)
                    pkl.dump(model_options, open('%s.pkl' % mpath_best, 'wb'))
                else:
                    params = unzip(tparams)

                numpy.savez(mpath, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % mpath, 'wb'))
                pkl.dump(stats, open("%s.pkl" % mpath_stats, 'wb'))

                print 'Done'
                print_param_norms(tparams)

            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                if valid.done:
                    valid.reset()

                valid_costs, valid_errs, valid_probs, \
                        valid_alphas, error_ent, error_dent = eval_model(f_log_probs,
                                                  prepare_data if not opt_ds['use_sent_reps'] \
                                                    else prepare_data_sents,
                                                  model_options,
                                                  valid,
                                                  use_sent_rep=opt_ds['use_sent_reps'])

                valid_alphas_ = numpy.concatenate(
                    [va.argmax(0) for va in valid_alphas.tolist()], axis=0)
                valid_err = valid_errs.mean()
                valid_cost = valid_costs.mean()
                valid_alpha_ent = -negentropy(valid_alphas)

                mean_valid_alphas = valid_alphas_.mean()
                std_valid_alphas = valid_alphas_.std()

                mean_valid_probs = valid_probs.argmax(1).mean()
                std_valid_probs = valid_probs.argmax(1).std()

                history_errs.append([valid_cost, valid_err])

                stats['train_err_ave'].append(train_err_ave)
                stats['train_cost_ave'].append(train_cost_ave)
                stats['train_gnorm_ave'].append(train_gnorm_ave)

                stats['valid_errs'].append(valid_err)
                stats['valid_costs'].append(valid_cost)
                stats['valid_err_ent'].append(error_ent)
                stats['valid_err_desc_ent'].append(error_dent)

                stats['valid_alphas_mean'].append(mean_valid_alphas)
                stats['valid_alphas_std'].append(std_valid_alphas)
                stats['valid_alphas_ent'].append(valid_alpha_ent)

                stats['valid_probs_mean'].append(mean_valid_probs)
                stats['valid_probs_std'].append(std_valid_probs)

                if uidx == 0 or valid_err <= numpy.array(
                        history_errs)[:, 1].min():
                    best_p = unzip(tparams)
                    bad_counter = 0
                    best_found = True
                else:
                    best_found = False

                if numpy.isnan(valid_err):
                    import ipdb
                    ipdb.set_trace()

                print "============================"
                print '\t>>>Valid error: ', valid_err, \
                        ' Valid cost: ', valid_cost
                print '\t>>>Valid pred mean: ', mean_valid_probs, \
                        ' Valid pred std: ', std_valid_probs
                print '\t>>>Valid alphas mean: ', mean_valid_alphas, \
                        ' Valid alphas std: ', std_valid_alphas, \
                        ' Valid alpha negent: ', valid_alpha_ent, \
                        ' Valid error ent: ', error_ent, \
                        ' Valid error desc ent: ', error_dent

                print "============================"
                print "Running average train stats "
                print '\t>>>Train error: ', train_err_ave, \
                        ' Train cost: ', train_cost_ave, \
                        ' Train grad norm: ', train_gnorm_ave
                print "============================"


                train_cost_ave, train_err_ave, \
                    train_gnorm_ave = reset_train_vals()

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid.reset()
    valid_cost, valid_error, valid_probs, \
            valid_alphas, error_ent, error_dent = eval_model(f_log_probs,
                                      prepare_data if not opt_ds['use_sent_reps'] \
                                           else prepare_data_sents,
                                      model_options, valid,
                                      use_sent_rep=opt_ds['use_sent_reps'])

    print " Final eval resuts: "
    print 'Valid error: ', valid_error.mean()
    print 'Valid cost: ', valid_cost.mean()
    print '\t>>>Valid pred mean: ', valid_probs.mean(), \
            ' Valid pred std: ', valid_probs.std(), \
            ' Valid error ent: ', error_ent

    params = copy.copy(best_p)

    numpy.savez(mpath_last,
                zipped_params=best_p,
                history_errs=history_errs,
                **params)

    return valid_err, valid_cost
Example #29
0
def train(
        experiment_id,
        model_options,
        data_options,
        validation_options,
        patience,  # early stopping patience
        max_epochs,
        finish_after,  # finish after this many updates
        decay_c,  # L2 regularization penalty
        alpha_c,  # alignment regularization
        clip_c,  # gradient clipping threshold
        lrate,  # learning rate
        optimizer,
        saveto,
        valid_freq,
        eval_intv,  # time interval for evaluation in minutes
        save_freq,  # save the parameters after every saveFreq updates
        sample_freq,  # generate some samples after every sampleFreq
        reload_=False):

    worddicts_r, train_stream, valid_stream = load_data(**data_options)

    LOGGER.info('Building model')
    params = init_params(model_options)
    # reload parameters
    model_filename = '{}.model.npz'.format(experiment_id)
    model_option_filename = '{}.config.json'.format(experiment_id)
    saveto_filename = '{}.npz'.format(saveto)
    if reload_ and os.path.exists(saveto_filename):
        LOGGER.info('Loading parameters from {}'.format(saveto_filename))
        params = load_params(saveto_filename, params)

    LOGGER.info('Initializing parameters')
    tparams = init_tparams(params)

    # use_noise is for dropout
    trng, use_noise, \
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost = \
        build_model(tparams, model_options)
    inps = [x, x_mask, y, y_mask]

    LOGGER.info('Building sampler')
    f_init, f_next = build_sampler(tparams, model_options, trng)

    # before any regularizer
    LOGGER.info('Building f_log_probs')
    f_log_probs = theano.function(inps, cost, profile=False)

    cost = cost.mean()

    # apply L2 regularization on weights
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in six.iteritems(tparams):
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # regularize the alpha weights
    if alpha_c > 0. and not model_options['decoder'].endswith('simple'):
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * (
            (tensor.cast(y_mask.sum(0) // x_mask.sum(0), 'float32')[:, None] -
             opt_ret['dec_alphas'].sum(0))**2).sum(1).mean()
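        # pushes the total attention each source position receives towards the
        # ratio of target length to source length, penalising source words that
        # are never attended to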
        cost += alpha_reg

    # Not used?
    # after all regularizers - compile the computational graph for cost
    # LOGGER.info('Building f_cost')
    # f_cost = theano.function(inps, cost, profile=False)

    LOGGER.info('Computing gradient')
    grads = tensor.grad(cost, wrt=itemlist(tparams))

    # apply gradient clipping here
    if clip_c > 0.:
        g2 = 0.
        for g in grads:
            g2 += (g**2).sum()
        new_grads = []
        for g in grads:
            new_grads.append(
                tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c,
                              g))
        grads = new_grads
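        # global-norm clipping: if the total gradient norm exceeds clip_c,
        # every gradient is rescaled by clip_c / ||g||, preserving the update
        # direction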

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    LOGGER.info('Building optimizers')
    f_grad_shared, f_update = getattr(optimizers, optimizer)(lr, tparams,
                                                             grads, inps, cost)
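    # f_grad_shared computes the cost and stores the gradients in shared
    # variables; f_update then applies the actual parameter update for the
    # given learning rate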

    LOGGER.info('Optimization')

    log = Logger(filename='{}.log.jsonl.gz'.format(experiment_id))

    # evaluation score will be stored into the following queue
    valid_ret_queue = Queue.Queue()
    process_queue = Queue.Queue()

    rt = prepare_validation_timer(tparams, process_queue, model_filename,
                                  model_option_filename, eval_intv,
                                  valid_ret_queue, **validation_options)
    rt.start()
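    # the timer thread periodically (every eval_intv minutes) launches an
    # external evaluation of the current model; its (parameters, scores)
    # results are collected below through valid_ret_queue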

    def _timer_signal_handler(signum, frame):
        LOGGER.info('Received SIGINT')
        LOGGER.info('Now attempting to stop the timer')
        rt.stop()

        LOGGER.info('Please wait for terminating all child processes')
        while not process_queue.empty():
            proc = process_queue.get()
            if proc.poll() is None:
                # poll() returns None, i.e. the child process is still running
                # LOGGER.info('Attempt to kill', proc.pid)
                # terminate it by sending an interrupt signal
                proc.send_signal(signal.SIGINT)

                # wait for child process while avoiding deadlock
                # ignore outputs
                proc.communicate()

        sys.exit(130)

    signal.signal(signal.SIGINT, _timer_signal_handler)

    train_start = time.clock()
    best_p = None
    best_score = 0
    bad_counter = 0

    uidx = 0
    estop = False

    for eidx in xrange(max_epochs):
        n_samples = 0

        for x, x_mask, y, y_mask in train_stream.get_epoch_iterator():
            n_samples += len(x)
            x, x_mask, y, y_mask = x.T, x_mask.T, y.T, y_mask.T

            use_noise.set_value(1.)

            uidx += 1
            log_entry = {'iteration': uidx, 'epoch': eidx}

            # compute cost, grads and copy grads to shared variables
            update_start = time.clock()
            cost = f_grad_shared(x, x_mask, y, y_mask)
            f_update(lrate)

            log_entry['cost'] = float(cost)
            log_entry['average_source_length'] = float(x_mask.sum(0).mean())
            log_entry['average_target_length'] = float(y_mask.sum(0).mean())
            log_entry['update_time'] = time.clock() - update_start
            log_entry['train_time'] = time.clock() - train_start

            # check for bad numbers, usually we remove non-finite elements
            # and continue training - but not done here
            if not numpy.isfinite(cost):
                LOGGER.error('NaN detected')
                return 1., 1., 1.

            # save the best model so far
            if numpy.mod(uidx, save_freq) == 0:
                LOGGER.info('Saving best model so far')

                if best_p is not None:
                    params = best_p
                else:
                    params = unzip(tparams)

                # save params to exp_id.npz and symlink model.npz to it
                save_params(params, model_filename, saveto_filename)

            # generate some samples with the model and display them
            if numpy.mod(uidx, sample_freq) == 0:
                # FIXME: random selection?
                log_entry['samples'] = []
                for jj in xrange(numpy.minimum(5, x.shape[1])):
                    log_entry['samples'].append({
                        'source': '',
                        'truth': '',
                        'sample': ''
                    })
                    stochastic = True
                    sample, _, score = gen_sample(tparams,
                                                  f_init,
                                                  f_next,
                                                  x[:, jj][:, None],
                                                  model_options,
                                                  trng=trng,
                                                  k=1,
                                                  maxlen=30,
                                                  stochastic=stochastic,
                                                  argmax=False)
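                    # with stochastic=True and k=1, gen_sample draws one sample
                    # token by token; beam search would use stochastic=False and
                    # k>1, returning k scored hypotheses (handled below)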
                    for vv in x[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[0]:
                            token = worddicts_r[0][vv]
                        else:
                            token = UNK_TOKEN
                        log_entry['samples'][-1]['source'] += token + ' '
                    for vv in y[:, jj]:
                        if vv == 0:
                            break
                        if vv in worddicts_r[1]:
                            token = worddicts_r[1][vv]
                        else:
                            token = UNK_TOKEN
                        log_entry['samples'][-1]['truth'] += token + ' '
                    if stochastic:
                        ss = sample
                    else:
                        score = score / numpy.array([len(s) for s in sample])
                        ss = sample[score.argmin()]
                    for vv in ss:
                        if vv == 0:
                            break
                        if vv in worddicts_r[1]:
                            token = worddicts_r[1][vv]
                        else:
                            token = UNK_TOKEN
                        log_entry['samples'][-1]['sample'] += token + ' '

            # validate model on validation set and early stop if necessary
            if numpy.mod(uidx, valid_freq) == 0:
                use_noise.set_value(0.)
                valid_errs = pred_probs(f_log_probs, model_options,
                                        valid_stream)
                valid_err = valid_errs.mean()
                log_entry['validation_cost'] = float(valid_err)

                if not numpy.isfinite(valid_err):
                    raise RuntimeError('NaN detected in validation error')

            # collect validation scores (e.g., BLEU) from the child thread
            if not valid_ret_queue.empty():

                (ret_model, scores) = valid_ret_queue.get()

                valid_bleu = scores[0]
                # LOGGER.info('BLEU on the validation set: %.2f' % valid_bleu)
                log_entry['validation_bleu'] = valid_bleu

                if valid_bleu > best_score:
                    best_p = ret_model
                    best_score = valid_bleu
                    bad_counter = 0
                else:
                    bad_counter += 1
                    if bad_counter > patience:
                        estop = True
                        break

            # finish after this many updates
            if uidx >= finish_after:
                LOGGER.info('Finishing after {} iterations'.format(uidx))
                estop = True
                break

            log.log(log_entry)

        LOGGER.info('Completed epoch, seen {} samples'.format(n_samples))

        if estop:
            log.log(log_entry)
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    LOGGER.info('Calculating validation cost')
    valid_err = pred_probs(f_log_probs, model_options, valid_stream).mean()

    if not best_p:
        best_p = unzip(tparams)

    params = copy.copy(best_p)
    save_params(params, model_filename, saveto_filename)

    rt.stop()

    return valid_err
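
# The clipping block in the example above rescales all gradients by a common
# factor whenever their global L2 norm exceeds clip_c. A minimal NumPy sketch
# of the same rule (the helper name and the sample gradients are illustrative,
# not part of the original code):
import numpy as np

def clip_by_global_norm(grads, clip_c):
    """Rescale a list of gradient arrays so their global L2 norm is <= clip_c."""
    norm = np.sqrt(sum((g ** 2).sum() for g in grads))
    if clip_c > 0. and norm > clip_c:
        # every gradient is scaled by the same factor, so the direction of the
        # combined update is preserved
        grads = [g * (clip_c / norm) for g in grads]
    return grads

# usage: two dummy gradients whose joint norm is far above the threshold
clipped = clip_by_global_norm([np.full((3, 3), 10.), np.full(3, 10.)], clip_c=1.)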
Beispiel #30
0
def train():
    if prm.optimizer.lower() == 'adam':
        optimizer = adam
    elif prm.optimizer.lower() == 'sgd':
        optimizer = sgd
    elif prm.optimizer.lower() == 'rmsprop':
        optimizer = rmsprop
    elif prm.optimizer.lower() == 'adadelta':
        optimizer = adadelta
    else:
        raise ValueError('Unknown optimizer: %s' % prm.optimizer)

    options = locals().copy()

    print 'parameters:', str(options)

    prm_k = vars(prm).keys()
    prm_d = vars(prm)
    prm_k.sort()

    for x in prm_k:
        if not x.startswith('__'):
            print x, '=', prm_d[x]

    print 'loading Vocabulary...'
    vocab = utils.load_vocab(prm.vocab_path, prm.n_words)
    options['vocab'] = vocab

    options['vocabinv'] = {}
    for k, v in vocab.items():
        options['vocabinv'][v] = k
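    # vocabinv maps token id -> token string; it is used later to turn
    # integer-encoded queries back into readable text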

    print options

    print 'Loading Environment...'
    if prm.engine.lower() == 'lucene':
        import lucene_search
        options['engine'] = lucene_search.LuceneSearch()
    elif prm.engine.lower() == 'elastic':
        import elastic_search
        options['engine'] = elastic_search.ElasticSearch()

    print 'Loading Dataset...'
    dh5 = dataset_hdf5.DatasetHDF5(prm.dataset_path)

    qi_train = dh5.get_queries(dset='train')
    dt_train = dh5.get_doc_ids(dset='train')
    qi_valid = dh5.get_queries(dset='valid')
    dt_valid = dh5.get_doc_ids(dset='valid')
    qi_test = dh5.get_queries(dset='test')
    dt_test = dh5.get_doc_ids(dset='test')

    if prm.train_size == -1:
        train_size = len(qi_train)
    else:
        train_size = min(prm.train_size, len(qi_train))

    if prm.valid_size == -1:
        valid_size = len(qi_valid)
    else:
        valid_size = min(prm.valid_size, len(qi_valid))

    if prm.test_size == -1:
        test_size = len(qi_test)
    else:
        test_size = min(prm.test_size, len(qi_test))

    print '%d train examples' % len(qi_train)
    print '%d valid examples' % len(qi_valid)
    print '%d test examples' % len(qi_test)

    # This creates the initial parameters as numpy ndarrays.
    # Dict: name (string) -> numpy ndarray
    params, exclude_params = utils.init_params(options)

    if prm.wordemb_path:
        print 'loading pre-trained word embeddings'
        params = utils.load_wemb(params, vocab)
        options['W'] = params['W']

    if prm.reload_model:
        utils.load_params(prm.reload_model, params)

    print 'Building model'
    # This creates Theano shared variables from the parameters.
    # Dict: name (string) -> Theano shared variable
    # params and tparams hold separate copies of the weights.
    tparams = utils.init_tparams(params)
    for kk, value in tparams.iteritems():
        tparams[kk] = theano.shared(value, name=kk)

    iin, out, updates, f_pred, consider_constant \
        = build_model(tparams, options)

    # get only parameters that are not in the exclude_params list
    tparams_ = OrderedDict([(kk, vv) for kk, vv in tparams.iteritems()
                            if kk not in exclude_params])

    grads = tensor.grad(out[0],
                        wrt=utils.itemlist(tparams_),
                        consider_constant=consider_constant)
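    # consider_constant makes Theano treat the listed tensors as constants
    # when differentiating, so no gradient flows through them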

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams_, grads, iin, out, updates)
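    # note that this optimizer variant also receives the model's own `updates`
    # and the full output list `out` in addition to the gradients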

    history_errs = []
    best_p = None

    if prm.validFreq == -1:
        validFreq = len(qi_train) / prm.batch_size_train
    else:
        validFreq = prm.validFreq

    if prm.saveFreq == -1:
        saveFreq = len(qi_train) / prm.batch_size_train
    else:
        saveFreq = prm.saveFreq
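    # a frequency of -1 means once per epoch: the number of training
    # minibatches is used instead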

    uidx = 0  # the number of update done
    estop = False  # early stop
    start_time = time.time()

    print 'Optimization'

    try:
        for eidx in xrange(prm.max_epochs):
            n_samples = 0

            # Get new shuffled index for the training set.
            kf = utils.get_minibatches_idx(len(qi_train),
                                           prm.batch_size_train,
                                           shuffle=True)

            for _, train_index in kf:
                st = time.time()

                uidx += 1
                qi, qi_i, qi_lst, D_gt_id, D_gt_url = get_samples(
                    qi_train, dt_train, train_index, options)

                # share the current queries with the search engine.
                options['current_queries'] = qi_lst

                n_samples += len(qi)

                is_train = 1.
                out = f_grad_shared(qi_i, D_gt_id, is_train)

                cost = out.pop(0)
                cost_ent = out.pop(0)

                lr_t = f_update(prm.lrate)

                if np.isnan(cost) or np.isinf(cost):
                    print 'NaN detected'
                    return 1., 1., 1.

                print "options['reformulated_queries']", options[
                    'reformulated_queries']

                if np.mod(uidx, prm.dispFreq) == 0:

                    print '\n================================================================================'
                    print 'Epoch', eidx, 'Update', uidx, 'Cost', cost, 'LR_t', lr_t
                    print 'Time Minibatch Update: ' + str(time.time() - st)
                    print 'Input Query:       ', qi[0].replace('\n', '\\n')
                    print
                    print 'Target Docs:       ', str(D_gt_url[0])
                    print
                    print 'Input Query Vocab: ', utils.idx2text(
                        qi_i[0], options['vocabinv'])
                    for ii in range(prm.n_iterations):
                        prob = out.pop(0)
                        ans = out.pop(0)
                        metrics = out.pop(0)
                        bl = out.pop(0)
                        cost_bl = out.pop(0)
                        D_id = out.pop(0)

                        print "prob", prob
                        print "ans", ans
                        print "bl", bl
                        print "cost_bl", cost_bl
                        print "D_id", D_id

                        print("current_queries",
                              len(options['current_queries']),
                              options['current_queries'])
                        print
                        print 'Iteration', ii
                        print 'Baseline Value', bl.mean(), 'Cost', cost_bl
                        print '  '.join(prm.metrics_map.keys())
                        print metrics.mean(0)
                        print
                        i = 7
                        print 'Retrieved Docs:    ', str([
                            options['engine'].id_title_map[d_id]
                            for d_id in D_id[i]
                        ])
                        print
                        print 'Reformulated Query:', options[
                            'reformulated_queries'][ii][i]
                        print 'Current queries:', options['current_queries'][i]
                        print
                        print 'Query ANS:         ',
                        for kk, word in enumerate(
                                options['current_queries'][i][:ans.shape[1]]):
                            print "kk, word", kk, word
                            if word not in options['vocab'] and word != '':
                                word += '<unk>'
                            if ans[0, kk] == 1:
                                word = word.upper()
                            print str(word),
                        print
                        print
                        print 'prob[:,:,0].max(1).mean(), prob[:,:,0].mean(), prob[:,:,0].min(1).mean()', \
                            prob[:, :, 0].max(1).mean(), prob[:, :, 0].mean(), prob[:, :, 0].min(1).mean()
                        print 'prob[:,:,1].max(1).mean(), prob[:,:,1].mean(), prob[:,:,1].min(1).mean()', \
                            prob[:, :, 1].max(1).mean(), prob[:, :, 1].mean(), prob[:, :, 1].min(1).mean()
                    print '==================================================================================\n'

                if np.mod(uidx, validFreq) == 0 or uidx == 1:

                    kf_train = utils.get_minibatches_idx(
                        len(qi_train),
                        prm.batch_size_pred,
                        shuffle=True,
                        max_samples=train_size)
                    kf_valid = utils.get_minibatches_idx(
                        len(qi_valid),
                        prm.batch_size_pred,
                        shuffle=True,
                        max_samples=valid_size)
                    kf_test = utils.get_minibatches_idx(len(qi_test),
                                                        prm.batch_size_pred,
                                                        shuffle=True,
                                                        max_samples=test_size)

                    print '\nEvaluating - Training Set'
                    train_metrics = pred_error(f_pred, qi_train, dt_train,
                                               options, kf_train)

                    print '\nEvaluating - Validation Set'
                    valid_metrics = pred_error(f_pred, qi_valid, dt_valid,
                                               options, kf_valid)

                    print '\nEvaluating - Test Set'
                    test_metrics = pred_error(f_pred, qi_test, dt_test,
                                              options, kf_test)

                    his = [train_metrics, valid_metrics, test_metrics]
                    history_errs.append(his)
                    metric_idx = prm.metrics_map[prm.reward.upper()]
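                    # history_errs stacks [train, valid, test] metrics per
                    # evaluation; index 1 selects the validation metrics, -1
                    # their last row, and metric_idx the column of the reward
                    # metric used for model selection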
                    if (uidx == 0 or valid_metrics[-1, metric_idx] >=
                            np.array(history_errs)[:, 1, -1,
                                                   metric_idx].max()):
                        best_p = utils.unzip(tparams)
                        bad_counter = 0

                    print '====================================================================================================='
                    print '  '.join(prm.metrics_map.keys())
                    print
                    print 'Train:'
                    print train_metrics
                    print
                    print 'Valid:'
                    print valid_metrics
                    print
                    print 'Test:'
                    print test_metrics
                    print
                    print '====================================================================================================='
                    if (len(history_errs) > prm.patience
                            and valid_metrics[-1, metric_idx] <=
                            np.array(history_errs)[:-prm.patience, 1, -1,
                                                   metric_idx].max()):
                        bad_counter += 1
                        if bad_counter > prm.patience:
                            print 'Early Stop!'
                            estop = True
                            break

                if prm.saveto and np.mod(uidx, saveFreq) == 0:
                    print 'Saving...',

                    if best_p is not None:
                        params = best_p
                    else:
                        params = utils.unzip(tparams)
                    np.savez(prm.saveto, history_errs=history_errs, **params)

                    print 'Done'

            print 'Seen %d samples' % n_samples

            if estop:
                break

    except KeyboardInterrupt:
        print "Training interupted"
    return