def train_classifier(train, valid, test, W, n_words=10000, n_x=300, n_h=200, 
    dropout_val=0.5, patience=10, max_epochs=20, lrate=0.0002, 
    batch_size=50, valid_batch_size=50, dispFreq=10, validFreq=100, 
    saveFreq=200, saveto='trec_gru_result.npz'):
        
    """ train, valid, test : datasets
        W : the word embedding initialization
        n_words : vocabulary size
        n_x : word embedding dimension
        n_h : LSTM/GRU number of hidden units 
        dropout_val : dropout probability
        patience : Number of epochs to wait before early stopping if no progress
        max_epochs : The maximum number of epochs to run
        lrate : learning rate
        batch_size : batch size during training
        valid_batch_size : The batch size used for validation/test set
        dispFreq : Display to stdout the training progress every N updates
        validFreq : Compute the validation error after this number of updates.
        saveFreq : save the result after this number of updates.
        saveto: where to save the result.
    """

    options = {}
    options['n_words'] = n_words
    options['n_x'] = n_x
    options['n_h'] = n_h
    options['patience'] = patience
    options['max_epochs'] = max_epochs
    options['lrate'] = lrate
    options['batch_size'] = batch_size
    options['valid_batch_size'] = valid_batch_size
    options['dispFreq'] = dispFreq
    options['validFreq'] = validFreq
    
    logger.info('Model options {}'.format(options))
    
    logger.info('{} train examples'.format(len(train[0])))
    logger.info('{} valid examples'.format(len(valid[0])))
    logger.info('{} test examples'.format(len(test[0])))

    logger.info('Building model...')
    
    n_y = np.max(train[1]) + 1
    options['n_y'] = n_y
    
    params = init_params(options,W)
    tparams = init_tparams(params)

    (use_noise, x, mask, y, f_pred_prob, f_pred, cost) = build_model(tparams,options)
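    # f_grad_shared below computes the minibatch cost and caches the gradients
    # in shared variables; f_update then applies one Adam step with the
    # supplied learning rate.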
    
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = Adam(tparams, cost, [x, mask, y], lr)

    logger.info('Training model...')
    
    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    estop = False  # early stop
    history_errs = []
    best_p = None
    bad_counter = 0    
    uidx = 0  # the number of update done
    start_time = time.time()
    
    try:
        for eidx in xrange(max_epochs):
            
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(dropout_val)
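                # use_noise switches dropout on for this update; it is reset
                # to 0. before every evaluation pass below.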

                y = [train[1][t] for t in train_index]
                x = [train[0][t] for t in train_index]
                                
                x, mask, y = prepare_data(x, y)

                cost = f_grad_shared(x, mask, y)
                f_update(lrate)

                if np.isnan(cost) or np.isinf(cost):
                    logger.info('NaN or Inf detected')
                    return 1., 1., 1.

                if np.mod(uidx, dispFreq) == 0:
                    logger.info('Epoch {} Update {} Cost {}'.format(eidx, uidx, cost))
                    
                if np.mod(uidx, saveFreq) == 0:
                    logger.info('Saving ...')
                    
                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    np.savez(saveto, history_errs=history_errs, **params)
                    
                    logger.info('Done ...')

                if np.mod(uidx, validFreq) == 0:
                    
                    use_noise.set_value(0.)
                    
                    train_err = pred_error(f_pred, prepare_data, train, kf)
                    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
                    test_err = pred_error(f_pred, prepare_data, test, kf_test)
                    history_errs.append([valid_err, test_err, train_err])
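                    # Snapshot the parameters whenever the validation error
                    # improves; otherwise count strikes toward the patience
                    # limit and stop early once it is exceeded.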
                   
                        
                    if (uidx == 0 or
                        valid_err <= np.array(history_errs)[:,0].min()):

                        best_p = unzip(tparams)
                        bad_counter = 0

                    logger.info('Train {} Valid {} Test {}'.format(train_err, valid_err, test_err))

                    if (len(history_errs) > patience and
                        valid_err >= np.array(history_errs)[:-patience,0].min()):
                        bad_counter += 1
                        if bad_counter > patience:
                            
                            logger.info('Early Stop!')
                            estop = True
                            break

            if estop:
                break

    except KeyboardInterrupt:
        logger.info('Training interrupted')

    end_time = time.time()
    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)
    
    use_noise.set_value(0.)
    
    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
    train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted)
    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
    test_err = pred_error(f_pred, prepare_data, test, kf_test)
    
    logger.info('Train {} Valid {} Test {}'.format(train_err, valid_err, test_err))
    
    np.savez(saveto, train_err=train_err,
             valid_err=valid_err, test_err=test_err,
             history_errs=history_errs, **best_p)
    
    logger.info('The code ran for {} epochs, at {} sec/epoch'.format(
        eidx + 1, (end_time - start_time) / (1. * (eidx + 1))))
    
    return train_err, valid_err, test_err
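
# A minimal usage sketch for train_classifier above. The loader names
# load_trec_data and load_embedding_matrix are hypothetical placeholders,
# not functions defined in this example; the real entry point is whatever
# script prepares the (sentences, labels) pairs and the embedding matrix W.
#
#   train, valid, test = load_trec_data()   # each: (list of word-index lists, list of labels)
#   W = load_embedding_matrix()             # shape (n_words, n_x), used to initialize Wemb
#   train_err, valid_err, test_err = train_classifier(train, valid, test, W)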
Example #2
def train_model(train, valid, test, img_feats, W, n_words=7414, n_x=300, n_h=512,
    max_epochs=20, lrate=0.001, batch_size=64, valid_batch_size=64, dropout_val=0.5,
    dispFreq=10, validFreq=500, saveFreq=1000, saveto='flickr30k_result_psgld_dropout.npz'):
        
    """ n_words : vocabulary size
        n_x : word embedding dimension
        n_h : LSTM/GRU number of hidden units 
        max_epochs : The maximum number of epochs to run
        lrate : learning rate
        batch_size : batch size during training
        valid_batch_size : The batch size used for validation/test set
        dropout_val : the probability of dropout
        dispFreq : Display to stdout the training progress every N updates
        validFreq : Compute the validation error after this number of updates.
        saveFreq : save results after this number of updates.
        saveto : where to save.
    """

    options = {}
    options['n_words'] = n_words
    options['n_x'] = n_x
    options['n_h'] = n_h
    options['max_epochs'] = max_epochs
    options['lrate'] = lrate
    options['batch_size'] = batch_size
    options['valid_batch_size'] = valid_batch_size
    options['dispFreq'] = dispFreq
    options['validFreq'] = validFreq
    options['saveFreq'] = saveFreq
    
    options['n_z'] = img_feats.shape[0]
   
    logger.info('Model options {}'.format(options))
    logger.info('{} train examples'.format(len(train[0])))
    logger.info('{} valid examples'.format(len(valid[0])))
    logger.info('{} test examples'.format(len(test[0])))

    logger.info('Building model...')
    
    params = init_params(options,W)
    tparams = init_tparams(params)

    (use_noise, x, mask, z, f_pred_prob, cost) = build_model(tparams,options)
    
    f_cost = theano.function([x, mask, z], cost, name='f_cost')
    
    lr_theano = tensor.scalar(name='lr')
    ntrain_theano = tensor.scalar(name='ntrain')
    f_grad_shared, f_update = pSGLD(tparams, cost, [x, mask,z], ntrain_theano, lr_theano)
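    # pSGLD is given the training-set size (passed via f_update below) so the
    # minibatch gradient can be rescaled to a full-data estimate for the
    # preconditioned Langevin updates.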

    logger.info('Training model...')

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)
    
    estop = False  # early stop
    history_negll = []
    best_p = None
    best_valid_negll, best_test_negll = 0., 0.
    bad_counter = 0    
    uidx = 0  # the number of update done
    start_time = time.time()
    
    # statistics of data
    train_num_words, valid_num_words, test_num_words = 0, 0, 0
    for sent in train[0]:
        train_num_words = train_num_words + len(sent)
    for sent in valid[0]:
        valid_num_words = valid_num_words + len(sent)
    for sent in test[0]:
        test_num_words = test_num_words + len(sent)
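    # Total word counts are used below to normalize the negative
    # log-likelihood per word, so np.exp(negll) is a per-word perplexity.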
    
    n_average = 0
    valid_probs = np.zeros((valid_num_words,))
    test_probs = np.zeros((test_num_words,)) 
    
    try:
        for eidx in xrange(max_epochs):
            n_samples = 0
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(dropout_val)

                x = [train[0][t] for t in train_index]
                z = np.array([img_feats[:, train[1][t]] for t in train_index])
                
                x, mask = prepare_data(x)
                n_samples += x.shape[1]

                cost = f_grad_shared(x, mask, z)
                f_update(lrate, len(train[0]))

                if np.isnan(cost) or np.isinf(cost):
                    logger.info('NaN or Inf detected')
                    return 1., 1., 1.

                if np.mod(uidx, dispFreq) == 0:
                    logger.info('Epoch {} Update {} Cost {}'.format(eidx, uidx, cost))
                    
                if np.mod(uidx, saveFreq) == 0:
                    logger.info('Saving ...')
                
                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    np.savez(saveto, history_negll=history_negll, **params)
                    logger.info('Done ...')

                if np.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
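                    # During the first few epochs (burn-in) only the current
                    # parameters are evaluated; afterwards predictive
                    # probabilities are averaged over the collected pSGLD
                    # samples (a running average over n_average samples).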
                    
                    if eidx < 3: 
                        valid_negll = calu_negll(f_cost, prepare_data, valid, img_feats, kf_valid)
                        test_negll = calu_negll(f_cost, prepare_data, test, img_feats, kf_test)
                        history_negll.append([valid_negll, test_negll])
                    else:
                        valid_probs_curr = calu_pred_prob(f_pred_prob, prepare_data, valid, img_feats, kf_valid)
                        test_probs_curr = calu_pred_prob(f_pred_prob, prepare_data, test, img_feats, kf_test)
                        valid_probs = (n_average * valid_probs + valid_probs_curr)/(n_average+1) 
                        test_probs = (n_average * test_probs + test_probs_curr)/(n_average+1) 
                        n_average += 1
                        
                        valid_negll = -np.log(valid_probs + 1e-6).sum() / valid_num_words
                        test_negll = -np.log(test_probs + 1e-6).sum() / test_num_words
                        history_negll.append([valid_negll, test_negll])
                        
                        logger.info('Saving {}th Sample...'.format(n_average))
                        
                        params = unzip(tparams)
                        np.savez('flickr30k_result_psgld_{}.npz'.format(n_average), valid_probs_curr=valid_probs_curr, test_probs_curr=test_probs_curr, **params)
                        logger.info('Done ...')
                        
                    
                    if (uidx == 0 or
                        valid_negll <= np.array(history_negll)[:,0].min()):
                             
                        best_p = unzip(tparams)
                        
                        best_valid_negll = valid_negll
                        best_test_negll = test_negll
                        
                        bad_counter = 0
                        
                    logger.info('Perp: Valid {} Test {}'.format(np.exp(valid_negll), np.exp(test_negll)))

                    if (len(history_negll) > 10 and
                        valid_negll >= np.array(history_negll)[:-10,0].min()):
                        bad_counter += 1
                        if bad_counter > 10:
                            logger.info('Early Stop!')
                            estop = True
                            break

            logger.info('Seen {} samples'.format(n_samples))
            
            if estop:
                break

    except KeyboardInterrupt:
        logger.info('Training interrupted')

    end_time = time.time()
    
    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)
    
    logger.info('Perp: Valid {} Test {}'.format(np.exp(best_valid_negll), np.exp(best_test_negll)))
    np.savez(saveto, history_negll=history_negll, **best_p)

    
    logger.info('The code ran for {} epochs, at {} sec/epoch'.format(
        eidx + 1, (end_time - start_time) / (1. * (eidx + 1))))
    
    return best_valid_negll, best_test_negll
Example #3
def trainer(train,
            valid,
            test,
            n_chars=33,
            img_w=128,
            max_len=27,
            feature_maps=100,
            filter_hs=[2, 3, 4],
            max_epochs=20,
            gamma=10,
            ncon=50,
            lrate=0.0002,
            batch_size=100,
            dispFreq=10,
            validFreq=100,
            saveto='example.npz'):
    """ train, valid, test : datasets
        n_chars : vocabulary size
        img_w : character embedding dimension.
        max_len : the maximum length of a sentence 
        feature_maps : the number of feature maps we used 
        filter_hs: the filter window sizes we used
        max_epochs : The maximum number of epochs to run
        gamma : hyper-parameter used in the ranking loss
        ncon : the number of negative samples used for each positive sample
        lrate : learning rate
        batch_size : batch size during training
        dispFreq : Display to stdout the training progress every N updates
        validFreq : Compute the validation rank score after this number of updates.
        saveto: where to save the result.
    """

    img_h = max_len + 2 * (filter_hs[-1] - 1)
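    # The maximum sentence length is padded by (widest filter - 1) positions
    # on both sides, so every convolution window, including those at the
    # boundaries, sees a full-width input.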

    model_options = {}
    model_options['n_chars'] = n_chars
    model_options['img_w'] = img_w
    model_options['img_h'] = img_h
    model_options['feature_maps'] = feature_maps
    model_options['filter_hs'] = filter_hs
    model_options['max_epochs'] = max_epochs
    model_options['gamma'] = gamma
    model_options['ncon'] = ncon
    model_options['lrate'] = lrate
    model_options['batch_size'] = batch_size
    model_options['dispFreq'] = dispFreq
    model_options['validFreq'] = validFreq
    model_options['saveto'] = saveto

    logger.info('Model options {}'.format(model_options))

    logger.info('Building model...')

    filter_w = img_w
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))
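    # Pooling over the full (img_h - filter_h + 1) convolution output is
    # max-over-time pooling: each feature map contributes a single value.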

    model_options['filter_shapes'] = filter_shapes
    model_options['pool_sizes'] = pool_sizes

    params = init_params(model_options)
    tparams = init_tparams(params)

    use_noise, inps, cost = build_model(tparams, model_options)

    logger.info('Building encoder...')
    inps_e, feat_x, feat_y = build_encoder(tparams, model_options)

    logger.info('Building functions...')
    f_emb = theano.function(inps_e, [feat_x, feat_y], name='f_emb')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = Adam(tparams, cost, inps, lr)

    logger.info('Training model...')

    uidx = 0
    seed = 1234
    curr = 0
    history_errs = []

    valid_x = prepare_data(valid[0], max_len, n_chars, filter_hs[-1])
    valid_y = prepare_data(valid[1], max_len, n_chars, filter_hs[-1])

    test_x = prepare_data(test[0], max_len, n_chars, filter_hs[-1])
    test_y = prepare_data(test[1], max_len, n_chars, filter_hs[-1])

    zero_vec_tensor = tensor.vector()
    zero_vec = np.zeros(img_w).astype(theano.config.floatX)
    set_zero = theano.function([zero_vec_tensor],
                               updates=[(tparams['Wemb'],
                                         tensor.set_subtensor(
                                             tparams['Wemb'][n_chars - 1, :],
                                             zero_vec_tensor))])
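    # set_zero above resets row n_chars - 1 of the embedding matrix (the
    # padding character); it is called after every gradient step so padding
    # never learns a representation.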

    # Main loop
    for eidx in range(max_epochs):
        prng = RandomState(seed - eidx - 1)

        trainA = train[0]
        trainB = train[1]

        num_samples = len(trainA)

        inds = np.arange(num_samples)
        prng.shuffle(inds)
        numbatches = len(inds) // batch_size
        for minibatch in range(numbatches):
            use_noise.set_value(0.)
            uidx += 1
            conprng = RandomState(seed + uidx + 1)

            x = [trainA[seq] for seq in inds[minibatch::numbatches]]
            y = [trainB[seq] for seq in inds[minibatch::numbatches]]

            cinds = conprng.random_integers(low=0,
                                            high=num_samples - 1,
                                            size=ncon * len(x))
            cy = [trainB[seq] for seq in cinds]
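            # cy holds ncon randomly drawn sentences per positive pair, used
            # as contrastive (negative) examples in the ranking loss.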

            x = prepare_data(x, max_len, n_chars, filter_hs[-1])
            y = prepare_data(y, max_len, n_chars, filter_hs[-1])
            cy = prepare_data(cy, max_len, n_chars, filter_hs[-1])

            cost = f_grad_shared(x, y, cy)
            f_update(lrate)
            # the special token does not need to update.
            set_zero(zero_vec)

            if np.mod(uidx, dispFreq) == 0:
                logger.info('Epoch {} Update {} Cost {}'.format(
                    eidx, uidx, cost))

            if np.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                logger.info('Computing ranks...')

                feats_x, feats_y = f_emb(valid_x, valid_y)
                (r1, r3, r10, medr, meanr, h_meanr) = rank(feats_x, feats_y)
                history_errs.append([r1, r3, r10, medr, meanr, h_meanr])

                logger.info('Valid Rank: {}, {}, {}, {}, {}, {}'.format(
                    r1, r3, r10, medr, meanr, h_meanr))

                currscore = r1 + r3 + r10
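                # currscore sums the recall@1/3/10 values returned by rank();
                # the model is checkpointed whenever this validation score
                # improves.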
                if currscore > curr:
                    curr = currscore
                    logger.info('Saving...')
                    params = unzip(tparams)
                    np.savez(saveto, history_errs=history_errs, **params)
                    logger.info('Done...')

    use_noise.set_value(0.)
    zipp(params, tparams)
    logger.info('Final results...')

    feats_x, feats_y = f_emb(valid_x, valid_y)
    (r1, r3, r10, medr, meanr, h_meanr) = rank(feats_x, feats_y)
    logger.info('Valid Rank: {}, {}, {}, {}, {}, {}'.format(
        r1, r3, r10, medr, meanr, h_meanr))

    feats_x, feats_y = f_emb(test_x, test_y)
    (r1, r3, r10, medr, meanr, h_meanr) = rank(feats_x, feats_y)
    logger.info('Test Rank: {}, {}, {}, {}, {}, {}'.format(
        r1, r3, r10, medr, meanr, h_meanr))

    # np.savez("./cnn_feats.npz", feats_x=feats_x, feats_y=feats_y)

    return (r1, r3, r10, medr, meanr, h_meanr)
Example #4
def train_model(train, val, test, n_words=21103, img_w=300, max_len=40, 
    feature_maps=200, filter_hs=[3,4,5], n_x=300, n_h=600, 
    max_epochs=8, lrate=0.0002, batch_size=64, valid_batch_size=64, dispFreq=10, 
    validFreq=500, saveFreq=1000, saveto='bookcorpus_result.npz'):
        
    """ train, valid, test : datasets
        n_words : vocabulary size
        img_w : word embedding dimension, must be 300.
        max_len : the maximum length of a sentence 
        feature_maps : the number of feature maps we used 
        filter_hs: the filter window sizes we used
        n_x: word embedding dimension
        n_h: the number of hidden units in LSTM        
        max_epochs : the maximum number of epochs to run
        lrate : learning rate
        batch_size : batch size during training
        valid_batch_size : The batch size used for validation/test set
        dispFreq : Display to stdout the training progress every N updates
        validFreq : Compute the validation error after this number of updates.
        saveFreq : save the result after this number of updates.
        saveto: where to save the result.
    """
    
    img_h = max_len + 2*(filter_hs[-1]-1)
    
    options = {}
    options['n_words'] = n_words
    options['img_w'] = img_w
    options['img_h'] = img_h
    options['feature_maps'] = feature_maps
    options['filter_hs'] = filter_hs
    options['n_x'] = n_x
    options['n_h'] = n_h
    options['max_epochs'] = max_epochs
    options['lrate'] = lrate
    options['batch_size'] = batch_size
    options['valid_batch_size'] = valid_batch_size
    options['dispFreq'] = dispFreq
    options['validFreq'] = validFreq
    options['saveFreq'] = saveFreq
   
    logger.info('Model options {}'.format(options))

    logger.info('Building model...')
    
    filter_w = img_w
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h-filter_h+1, img_w-filter_w+1))
        
    options['filter_shapes'] = filter_shapes
    options['pool_sizes'] = pool_sizes
    
    params = init_params(options)
    tparams = init_tparams(params)

    use_noise, x, y, y_mask, cost = build_model(tparams,options)
    
    f_cost = theano.function([x, y, y_mask], cost, name='f_cost')
    
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = Adam(tparams, cost, [x, y, y_mask], lr)
    
    logger.info('Training model...')
    
    history_cost = []  
    uidx = 0  # the number of update done
    start_time = time.time()
    
    kf_valid = get_minibatches_idx(len(val), valid_batch_size)
    
    zero_vec_tensor = tensor.vector()
    zero_vec = np.zeros(img_w).astype(theano.config.floatX)
    set_zero = theano.function([zero_vec_tensor], updates=[(tparams['Wemb'], tensor.set_subtensor(tparams['Wemb'][21102,:], zero_vec_tensor))])
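    # set_zero above resets row 21102 (n_words - 1), the <pad_zero> token, of
    # the embedding matrix to zero; it is called after every update in the
    # training loop below.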
    
    try:
        for eidx in xrange(max_epochs):
            n_samples = 0
            
            kf = get_minibatches_idx(len(train), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(0.)

                sents = [train[t] for t in train_index]
                
                x = prepare_data_for_cnn(sents)
                y, y_mask = prepare_data_for_rnn(sents)
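                # x is a fixed-size index matrix for the CNN encoder; (y,
                # y_mask) is the padded, masked target sequence for the RNN
                # decoder.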
                n_samples += y.shape[1]

                cost = f_grad_shared(x, y, y_mask)
                f_update(lrate)
                # the special <pad_zero> token does not need to update.
                set_zero(zero_vec)

                if np.isnan(cost) or np.isinf(cost):
                    
                    logger.info('NaN or Inf detected')
                    return 1., 1., 1.

                if np.mod(uidx, dispFreq) == 0:
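                    # cost is an average per-word negative log-likelihood, so
                    # np.exp(cost) below is reported as a perplexity-style
                    # number.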
                    logger.info('Epoch {} Update {} Cost {}'.format(eidx, uidx, np.exp(cost)))
                
                if np.mod(uidx, saveFreq) == 0:
                    
                    logger.info('Saving ...')
                    
                    params = unzip(tparams)
                    np.savez(saveto, history_cost=history_cost, **params)
                    
                    logger.info('Done ...')

                if np.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
                    
                    valid_cost = calu_cost(f_cost, prepare_data_for_cnn, prepare_data_for_rnn, val, kf_valid)
                    history_cost.append([valid_cost])
                        
                    logger.info('Valid {}'.format(np.exp(valid_cost)))

        logger.info('Seen {} samples'.format(n_samples))

    except KeyboardInterrupt:
        logger.info('Training interrupted')

    end_time = time.time()
    
#    if best_p is not None:
#        zipp(best_p, tparams)
#    else:
#        best_p = unzip(tparams)
    
    
    use_noise.set_value(0.)
    valid_cost = calu_cost(f_cost, prepare_data_for_cnn, prepare_data_for_rnn, val, kf_valid)
    logger.info('Valid {}'.format(np.exp(valid_cost)))
    
    params = unzip(tparams)
    np.savez(saveto, history_cost=history_cost, **params)

    
    logger.info('The code ran for {} epochs, at {} sec/epoch'.format(
        eidx + 1, (end_time - start_time) / (1. * (eidx + 1))))
    
    
    return valid_cost
def train_classifier(train, valid, test, W, n_words=10000, img_w=300, max_len=40, 
    feature_maps=100, filter_hs=[3,4,5], dropout_val=0.5, patience=10, 
    max_epochs=20, lrate=0.0002, batch_size=50, valid_batch_size=50, dispFreq=10, 
    validFreq=100, saveFreq=200, saveto='trec_cnn_result.npz'):
        
    """ train, valid, test : datasets
        W : the word embedding initialization
        n_words : vocabulary size
        img_w : word embedding dimension, must be 300.
        max_len : the maximum length of a sentence 
        feature_maps : the number of feature maps we used 
        filter_hs: the filter window sizes we used
        dropout_val : dropout probability
        patience : Number of epochs to wait before early stopping if no progress
        max_epochs : The maximum number of epochs to run
        lrate : learning rate
        batch_size : batch size during training
        valid_batch_size : The batch size used for validation/test set
        dispFreq : Display to stdout the training progress every N updates
        validFreq : Compute the validation error after this number of updates.
        saveFreq : save the result after this number of updates.
        saveto: where to save the result.
    """

    img_h = max_len + 2*(filter_hs[-1]-1)
    
    options = {}
    options['n_words'] = n_words
    options['img_w'] = img_w
    options['img_h'] = img_h
    options['feature_maps'] = feature_maps
    options['filter_hs'] = filter_hs
    options['patience'] = patience
    options['max_epochs'] = max_epochs
    options['lrate'] = lrate
    options['batch_size'] = batch_size
    options['valid_batch_size'] = valid_batch_size
    options['dispFreq'] = dispFreq
    options['validFreq'] = validFreq
    
    logger.info('Model options {}'.format(options))
    
    logger.info('{} train examples'.format(len(train[0])))
    logger.info('{} valid examples'.format(len(valid[0])))
    logger.info('{} test examples'.format(len(test[0])))

    logger.info('Building model...')
    
    n_y = np.max(train[1]) + 1
    options['n_y'] = n_y
    
    """
    Train a simple conv net
    img_h = sentence length (padded where necessary)
    img_w = word vector length (300 for word2vec)
    filter_hs = filter window sizes    
    """ 

    filter_w = img_w
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h-filter_h+1, img_w-filter_w+1))
        
    options['filter_shapes'] = filter_shapes
    options['pool_sizes'] = pool_sizes
    
    params = init_params(options,W)
    tparams = init_tparams(params)

    (use_noise, x, y, f_pred_prob, f_pred, cost) = build_model(tparams,options)
    
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = Adam(tparams, cost, [x, y], lr)

    logger.info('Training model...')
    
    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    estop = False  # early stop
    history_errs = []
    best_p = None
    bad_counter = 0    
    uidx = 0  # the number of update done
    start_time = time.time()
    
    zero_vec_tensor = tensor.vector()
    zero_vec = np.zeros(img_w).astype(theano.config.floatX)
    set_zero = theano.function([zero_vec_tensor], updates=[(tparams['Wemb'], tensor.set_subtensor(tparams['Wemb'][n_words-1,:], zero_vec_tensor))])
    
    try:
        for eidx in xrange(max_epochs):

            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(dropout_val)

                y = np.array([train[1][t] for t in train_index]).astype('int32')
                x = [train[0][t] for t in train_index]
                x = prepare_data(x, max_len, n_words, filter_hs[-1])

                cost = f_grad_shared(x, y)
                f_update(lrate)
                # the special token does not need to update.
                set_zero(zero_vec)

                if np.isnan(cost) or np.isinf(cost):
                    logger.info('NaN or Inf detected')
                    return 1., 1., 1.

                if np.mod(uidx, dispFreq) == 0:
                    logger.info('Epoch {} Update {} Cost {}'.format(eidx, uidx, cost))
                    
                if np.mod(uidx, saveFreq) == 0:
                    logger.info('Saving ...')
                    
                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    np.savez(saveto, history_errs=history_errs, **params)
                    
                    logger.info('Done ...')

                if np.mod(uidx, validFreq) == 0:
                    
                    use_noise.set_value(0.)
                    
                    train_err = pred_error(f_pred, prepare_data, train, kf, max_len, n_words, filter_hs[-1])
                    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid, max_len, n_words, filter_hs[-1])
                    test_err = pred_error(f_pred, prepare_data, test, kf_test, max_len, n_words, filter_hs[-1])
                    history_errs.append([valid_err, test_err, train_err])
                   
                    if (uidx == 0 or
                        valid_err <= np.array(history_errs)[:,0].min()):

                        best_p = unzip(tparams)
                        bad_counter = 0

                    logger.info('Train {} Valid {} Test {}'.format(train_err, valid_err, test_err))

                    if (len(history_errs) > patience and
                        valid_err >= np.array(history_errs)[:-patience,0].min()):
                        bad_counter += 1
                        if bad_counter > patience:
                            
                            logger.info('Early Stop!')
                            estop = True
                            break

            if estop:
                break

    except KeyboardInterrupt:
        logger.info('Training interrupted')

    end_time = time.time()
    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)
    
    use_noise.set_value(0.)
    
    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
    train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted, max_len, n_words, filter_hs[-1])
    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid, max_len, n_words, filter_hs[-1])
    test_err = pred_error(f_pred, prepare_data, test, kf_test, max_len, n_words, filter_hs[-1])

    logger.info('Train {} Valid {} Test {}'.format(train_err, valid_err, test_err))
    
    np.savez(saveto, train_err=train_err,
             valid_err=valid_err, test_err=test_err,
             history_errs=history_errs, **best_p)
    
    logger.info('The code ran for {} epochs, at {} sec/epoch'.format(
        eidx + 1, (end_time - start_time) / (1. * (eidx + 1))))
    
    return train_err, valid_err, test_err
Example #6
def train_model(train,
                val,
                test,
                train_lab,
                val_lab,
                test_lab,
                ixtoword,
                n_words=22153,
                period=887,
                img_w=300,
                img_h=148,
                feature_maps=300,
                filter_hs=[3, 4, 5],
                n_x=300,
                n_h=500,
                n_h2_d=200,
                n_h2=900,
                p_lambda_q=0,
                p_lambda_fm=0.001,
                p_lambda_recon=0.001,
                n_codes=2,
                max_epochs=16,
                lr_d=0.0001,
                lr_g=0.00005,
                kde_sigma=1.,
                batch_size=256,
                valid_batch_size=256,
                dim_mmd=32,
                dispFreq=10,
                dg_ratio=1,
                Large=1e3,
                validFreq=500,
                saveFreq=500,
                saveto='disent_result'):
    """ n_words : word vocabulary size
        feature_maps : CNN embedding dimension for each width
        filter_hs : CNN width
        n_h : LSTM/GRU number of hidden units 
        n_h2: discriminative network number of hidden units
        n_gan : dimension of the generator noise vector (len(filter_hs) * feature_maps)
        n_codes: number of latent codes 
        max_epochs : The maximum number of epochs to run
        lr_d, lr_g : learning rates for the discriminator and generator
        batch_size : batch size during training
        valid_batch_size : The batch size used for validation/test set
        dispFreq : Display to stdout the training progress every N updates
        validFreq : Compute the validation error after this number of updates.
    """
    n_gan = len(filter_hs) * feature_maps  # 900

    options = {}
    options['n_words'] = n_words
    options['img_w'] = img_w
    options['img_h'] = img_h
    options['feature_maps'] = feature_maps
    options['filter_hs'] = filter_hs  #band width
    options['n_x'] = n_x
    options['n_h'] = n_h
    options['n_h2'] = n_h2
    options['n_h2_d'] = n_h2_d
    options['n_codes'] = n_codes
    options['lambda_q'] = p_lambda_q
    options['lambda_fm'] = p_lambda_fm  # weight for feature matching
    options['lambda_recon'] = p_lambda_recon
    options['L'] = Large
    options['max_epochs'] = max_epochs
    options['lr_d'] = lr_d
    options['lr_g'] = lr_g
    options['kde_sigma'] = kde_sigma
    options['batch_size'] = batch_size
    options['valid_batch_size'] = valid_batch_size
    options['dispFreq'] = dispFreq
    options['validFreq'] = validFreq
    options['saveFreq'] = saveFreq
    options['dg_ratio'] = dg_ratio

    options['n_gan'] = n_gan
    options['debug'] = False
    # feature matching objective: 'mmd', 'mmd_h', 'mmd_ld', 'JSD_acc', 'moment', or None
    options['feature_match'] = 'mmd'
    options['shareLSTM'] = True
    options['delta'] = 0.00
    options['sharedEmb'] = False
    options['cnn_activation'] = 'tanh'  # tanh
    options['sigma_range'] = [20]  # range of sigma for mmd
    options['diag'] = 0.1  # diagonal matrix added on cov for JSD_acc
    options['label_smoothing'] = 0.01
    options['dim_mmd'] = dim_mmd
    options['force_cut'] = 'None'
    options['batch_norm'] = False
    options['wgan'] = False
    options['cutoff'] = 0.01

    options['max_step'] = 60
    options['period'] = period

    logger.info('Model options {}'.format(options))

    logger.info('Building model...')

    filter_w = img_w
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))

    options['filter_shapes'] = filter_shapes
    options['pool_sizes'] = pool_sizes
    # generative model for GAN

    ## modified
    n_label = len(set(train_lab))
    options['label_sizes'] = n_label

    n_feature = len(options['filter_hs']) * options['feature_maps']

    options['input_shape'] = (n_h2_d, n_feature)
    options['pred_shape'] = (1, n_h2_d)

    if options['feature_match'] == 'mmd_ld':
        options['mmd_shape'] = (dim_mmd, n_h2_d)

    options['propose_shape'] = (n_codes, n_h2_d)

    # if options['reverse']:
    options['input_recon_shape'] = (n_h2, n_feature)
    options['recon_shape'] = (n_gan, n_h2) if options['shareLSTM'] else (n_gan + 1, n_h2)
    ##

    d_params_s, g_params_s, s_params_s = init_params(options)
    d_params, g_params, s_params = init_tparams(d_params_s, g_params_s,
                                                s_params_s, options)
    lr_d_t = tensor.scalar(name='lr_d')
    lr_g_t = tensor.scalar(name='lr_g')

    (use_noise, use_noise2, x, z, d_cost, g_cost, r_cost, fake_recon,
     acc_fake_xx, acc_real_xx, acc_fake_mean, acc_real_mean,
     wtf1, wtf2, wtf3, wtf4, wtf5, wtf6, KDE, KDE_input) = build_model(
         d_params, g_params, s_params, options)  # change
    f_cost = theano.function([x, z], [d_cost, g_cost, KDE, KDE_input],
                             name='f_cost')
    #f_print = theano.function([x, z],[ wtf1, wtf2, wtf3, wtf4, wtf5, wtf6, KDE, KDE_input], name='f_print',on_unused_input='ignore')
    f_print = theano.function([x, z], [wtf1, wtf2, wtf3, wtf4, wtf5, wtf6],
                              name='f_print')
    f_recon = theano.function([x, z], [r_cost, fake_recon, d_cost],
                              name='f_recon',
                              on_unused_input='ignore')

    if options['feature_match']:
        ss_updates = [(s_params['acc_fake_xx'], acc_fake_xx),
                      (s_params['acc_real_xx'], acc_real_xx),
                      (s_params['acc_fake_mean'], acc_fake_mean),
                      (s_params['acc_real_mean'], acc_real_mean),
                      (s_params['seen_size'],
                       s_params['seen_size'] + options['batch_size'])]
        f_update_ss = theano.function([x, z], s_params, updates=ss_updates)
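        # f_update_ss refreshes the running statistics of real and fake
        # features (second moments, means, and a seen-sample counter) kept in
        # shared variables; these feed the feature-matching objective.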

    f_cost_d, _train_d = Adam(d_params, d_cost, [x, z], lr_d_t)
    if options['feature_match']:
        f_cost_g, _train_g = Adam(g_params, g_cost, [x, z], lr_g_t)
    else:
        f_cost_g, _train_g = Adam(g_params, g_cost, [z], lr_g_t)

    ##

    logger.info('Training model...')

    history_cost = []
    uidx = 0  # the number of update done
    kdes = np.zeros(10)
    kde_std = 0.  # standard deviation of every 10 kde_input
    kde_mean = 0.

    start_time = time.time()

    kf_valid = get_minibatches_idx(len(val), valid_batch_size)
    y_min = min(train_lab)
    train_lab = [t - y_min for t in train_lab]
    val_lab = [t - y_min for t in val_lab]
    test_lab = [t - y_min for t in test_lab]
    testset = [prepare_for_bleu(s) for s in test[:1000]]
    try:
        for eidx in xrange(max_epochs):
            n_samples = 0

            kf = get_minibatches_idx(len(train), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(0.0)
                use_noise2.set_value(0.0)
                sents = [train[t] for t in train_index]

                x = prepare_data_for_cnn(sents)
                n_samples += x.shape[0]

                if options['shareLSTM']:
                    z = np.random.uniform(
                        -1, 1, (batch_size, n_gan)).astype('float32')
                else:
                    z = np.random.uniform(
                        -1, 1, (batch_size, n_gan + 1)).astype('float32')

                z[:, 0] = np.random.randint(n_codes,
                                            size=batch_size).astype('float32')
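                # The first coordinate of z (set above) carries a discrete
                # latent code in {0, ..., n_codes - 1}; the remaining
                # coordinates are uniform noise in [-1, 1].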

                # update gradient
                if options['feature_match']:
                    cost_g = f_cost_g(x, z)
                else:
                    cost_g = f_cost_g(z)

                if np.isnan(cost_g):

                    logger.info('NaN detected')
                    temp_out = f_print(x, z)

                    print 'real ' + str(temp_out[0]) + ' fake ' + str(temp_out[1])
                    return 1., 1., 1.

                if np.isinf(cost_g):
                    temp_out = f_print(x, z)
                    print 'real ' + str(temp_out[0]) + ' fake ' + str(temp_out[1])
                    logger.info('Inf detected')
                    return 1., 1., 1.

                # update G
                _train_g(lr_g)

                if np.mod(uidx, dispFreq) == 0:
                    temp_out = f_print(x, z)
                    _, _, cost_d = f_recon(x, z)

                    np.set_printoptions(precision=3)
                    np.set_printoptions(threshold=np.inf)

                    print 'real ' + str(round(temp_out[0], 2)) + \
                          ' fake ' + str(round(temp_out[1], 2)) + \
                          ' Covariance loss ' + str(round(temp_out[3], 2)) + \
                          ' mean loss ' + str(round(temp_out[5], 2))
                    print 'cost_g ' + str(cost_g) + ' cost_d ' + str(cost_d)
                    print 'Generated: ' + ' '.join(
                        [ixtoword[x] for x in temp_out[2][0] if x != 0])

                    logger.info(
                        'Epoch {} Update {} Cost G {} Real {} Fake {} loss_cov {}  meanMSE {}'
                        .format(eidx, uidx, cost_g, round(temp_out[0], 2),
                                round(temp_out[1], 2), temp_out[3],
                                temp_out[5]))
                    logger.info('Generated: {}'.format(" ".join(
                        [ixtoword[x] for x in temp_out[2][0] if x != 0])))

                if np.mod(uidx, dg_ratio) == 0:
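                    # The discriminator is updated once every dg_ratio
                    # generator updates.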
                    x = prepare_data_for_cnn(sents)
                    cost_d = f_cost_d(x, z)
                    _train_d(lr_d)

                    if np.mod(uidx, dispFreq) == 0:
                        logger.info('Cost D {}'.format(cost_d))

                if np.mod(uidx, saveFreq) == 0:

                    logger.info('Saving ...')

                    d_params_s = unzip(d_params)
                    g_params_s = unzip(g_params)
                    params_d = OrderedDict()
                    params_g = OrderedDict()
                    for kk, pp in d_params_s.iteritems():
                        params_d[kk] = np.asarray(d_params_s[kk])
                    for kk, pp in g_params_s.iteritems():
                        params_g[kk] = np.asarray(g_params_s[kk])

                    np.savez(saveto + '_d.npz',
                             history_cost=history_cost,
                             options=options,
                             **params_d)
                    np.savez(saveto + '_g.npz',
                             history_cost=history_cost,
                             options=options,
                             **params_g)

                    logger.info('Done ...')

                if np.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
                    use_noise2.set_value(0.)
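                    # Validation: draw fresh noise, generate sentences, and
                    # score them with BLEU-2/3/4 against the reference set
                    # built from the first 1000 test sentences.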
                    if options['shareLSTM']:
                        val_z = np.random.uniform(
                            -1, 1, (batch_size, n_gan)).astype('float32')
                    else:
                        val_z = np.random.uniform(
                            -1, 1, (batch_size, n_gan + 1)).astype('float32')

                    temp_out = f_print(x, val_z)
                    predset = temp_out[2]
                    [bleu2s, bleu3s,
                     bleu4s] = cal_BLEU([prepare_for_bleu(s) for s in predset],
                                        {0: testset})

                    logger.info(
                        'Valid BLEU2 = {}, BLEU3 = {}, BLEU4 = {}'.format(
                            bleu2s, bleu3s, bleu4s))
                    print 'Valid BLEU (2,3,4): ' + ' '.join(
                        [str(round(it, 3)) for it in (bleu2s, bleu3s, bleu4s)])

                if options['feature_match']:
                    f_update_ss(x, z)

        logger.info('Seen {} samples'.format(n_samples))

    except KeyboardInterrupt:
        logger.info('Training interrupted')

    end_time = time.time()

    logger.info('The code ran for {} epochs, at {} sec/epoch'.format(
        eidx + 1, (end_time - start_time) / (1. * (eidx + 1))))

    return history_cost
Example #7
                if np.isnan(cost) or np.isinf(cost):
                    print('NaN or Inf detected')
                    [train_err, valid_err, test_err] = [1., 1., 1.]

                if np.mod(uidx, dispFreq) == 0:
                    print('Epoch {} Update {} Cost {}'.format(
                        eidx, uidx, cost))

                if np.mod(uidx, saveFreq) == 0:
                    print('Saving ...')

                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    np.savez(saveto, history_errs=history_errs, **params)

                    with open('data/sst2_model.pickle', 'wb') as f:
                        model = params
                        cPickle.dump(model, f)
                    print('Done ...')

                if np.mod(uidx, validFreq) == 0:

                    use_noise.set_value(0.)

                    train_err = pred_error(f_pred, prepare_data, train, kf)
                    valid_err = pred_error(f_pred, prepare_data, valid,
                                           kf_valid)
                    test_err = pred_error(f_pred, prepare_data, test, kf_test)