Example #1
def paralleltrain(epoch):
    model.train()
    scheduler.step()
    for batch_idx, (data, target) in enumerate(train_loader):
        if batch_idx % mv.workers_num() != mv.worker_id():
            continue  # round-robin sharding: this batch belongs to another worker
        if args.cuda:
            data, target = data.cuda(device), target.cuda(device)
        data, target = Variable(data), Variable(target)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        # sync parameters with the Multiverso server on CPU, then move back
        model.cpu()
        model.mv_sync()
        model.cuda(device)

        # floor division keeps the interval check exact (on Python 3, / yields a float)
        if (batch_idx // mv.workers_num()) % args.log_interval == 0:
            print(
                'Worker: {}\tTrain Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.
                format(mv.worker_id(), epoch, batch_idx * len(data),
                       len(train_loader.dataset),
                       100. * batch_idx / len(train_loader), loss.data[0]))

    # workers that skipped the final partial round sync once more, so every
    # worker calls mv_sync the same number of times per epoch
    if batch_idx % mv.workers_num() < mv.worker_id():
        optimizer.zero_grad()
        model.cpu()
        model.mv_sync()
        model.cuda(device)
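
The pattern shared by Examples 1 and 2 is round-robin data parallelism over Multiverso: each worker trains only the batches whose index maps to its worker id, then synchronizes with the parameter server. The skeleton below is a minimal sketch of that flow, assuming only the mv.* calls that appear in these snippets (mv.init, mv.worker_id, mv.workers_num, mv.barrier, mv.shutdown); batches and train_step are hypothetical stand-ins.

import multiverso as mv

mv.init(sync=True)            # must run before any other Multiverso API
worker_id = mv.worker_id()    # each process gets a distinct id
workers_num = mv.workers_num()

batches = range(100)          # stand-in for a real data loader

def train_step(batch):        # stand-in for forward/backward/optimizer.step()
    pass

for batch_idx, batch in enumerate(batches):
    if batch_idx % workers_num != worker_id:
        continue              # this batch belongs to another worker
    train_step(batch)
    # the real snippets synchronize here, e.g. model.mv_sync() in Example 1

mv.barrier()                  # all workers rendezvous
mv.shutdown()                 # call exactly once at the end of the program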
Example #2
def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        # train only the batches assigned to this worker (round-robin)
        if batch_idx % mv.workers_num() == mv.worker_id():
            if use_cuda:
                inputs, targets = inputs.cuda(), targets.cuda()
            optimizer.zero_grad()
            inputs, targets = Variable(inputs), Variable(targets)
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            net.cpu()
            net.mv_sync()
            net.cuda()

            train_loss += loss.data[0]
            _, predicted = torch.max(outputs.data, 1)
            total += targets.size(0)
            correct += predicted.eq(targets.data).cpu().sum()

            if (batch_idx // mv.workers_num()) % args.log_interval == 0:
                print(
                    'Worker: {}\tTrain Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'
                    .format(mv.worker_id(), epoch, batch_idx * len(inputs),
                            len(trainloader.dataset),
                            100. * batch_idx / len(trainloader), loss.data[0]))
Example #3
def test(epoch):
    model.eval()
    test_loss = 0
    correct = 0
    for data, target in test_loader:
        if args.cuda:
            data, target = data.cuda(device), target.cuda(device)
        data, target = Variable(data, volatile=True), Variable(target)
        output = model(data)
        test_loss += criterion(output, target).data[0]
        # get the index of the max log-probability
        pred = output.data.max(1)[1]
        correct += pred.eq(target.data).cpu().sum()

    # the loss function already averages over batch size
    test_loss /= len(test_loader)
    if args.parallel:
        print(
            '\nWorker: {}\tTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'
            .format(mv.worker_id(), test_loss, correct,
                    len(test_loader.dataset),
                    100. * correct / len(test_loader.dataset)))
    else:
        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.
              format(test_loss, correct, len(test_loader.dataset),
                     100. * correct / len(test_loader.dataset)))
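
Examples 3 and 4 are written against the pre-0.4 PyTorch API (Variable(..., volatile=True), loss.data[0]). For reference, a hedged sketch of the same evaluation loop on PyTorch 0.4+, where torch.no_grad() and .item() replace those idioms; model, loader, criterion, and device are stand-ins, not names from the snippets.

import torch

def evaluate(model, loader, criterion, device='cpu'):
    model.eval()
    total_loss, correct = 0.0, 0
    with torch.no_grad():  # replaces Variable(..., volatile=True)
        for data, target in loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            total_loss += criterion(output, target).item()  # replaces .data[0]
            correct += output.argmax(dim=1).eq(target).sum().item()
    return total_loss / len(loader), correct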
Example #4
def test(epoch):
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(testloader):
        if use_cuda:
            inputs, targets = inputs.cuda(), targets.cuda()
        inputs, targets = Variable(inputs, volatile=True), Variable(targets)
        outputs = net(inputs)
        loss = criterion(outputs, targets)

        test_loss += loss.data[0]
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()

    print(
        '\nWorker: {}\tTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'
        .format(mv.worker_id(), test_loss, correct, len(testloader.dataset),
                100. * correct / len(testloader.dataset)))

    # Save checkpoint.
    acc = 100. * correct / total
    if acc > best_acc:
        print('Saving..')
        if use_cuda:
            net.cpu()  # move to CPU so the checkpoint loads on any device
        state = {
            'net': net,
            'acc': acc,
            'epoch': epoch,
        }
        if not os.path.isdir('checkpoint'):
            os.mkdir('checkpoint')
        torch.save(state, './checkpoint/ckpt.t7')
        best_acc = acc
        if use_cuda:
            net.cuda()
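
A hedged companion to the checkpoint logic above: loading the file it writes. This assumes only the {'net', 'acc', 'epoch'} layout saved in Example 4 and the PyTorch versions these snippets target.

import torch

state = torch.load('./checkpoint/ckpt.t7')
net = state['net']                # the whole module was pickled, not just weights
best_acc = state['acc']
start_epoch = state['epoch'] + 1
if torch.cuda.is_available():
    net.cuda()                    # the net was moved to CPU before saving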
Example #5
def main():
    parser = argparse.ArgumentParser(
        description='Train the deep NMT model.',
        fromfile_prefix_chars='@',
    )

    parser.add_argument('-R', action="store_false", default=True, dest='reload',
                        help='Reload old model, default to True, set to False')
    parser.add_argument('-d', action='store_true', default=False, dest='dump_before_train',
                        help='Dump before train default to False, set to True')
    parser.add_argument('--lr', action="store", metavar="learning_rate", dest="learning_rate", type=float, default=1.0,
                        help='Start learning rate, default is %(default)s')
    parser.add_argument('--optimizer', action='store', default='adadelta')
    parser.add_argument('--plot', action='store', default=None,
                        help='Plot filename, default is None (not plot) (deprecated).')
    parser.add_argument('--save_freq', action='store', default=10000, type=int, dest='save_freq',
                        help='Model save frequency, default is %(default)s')
    parser.add_argument('--dev_bleu_freq', action='store', default=20000, type=int, dest='dev_bleu_freq',
                        help='Get dev set BLEU frequency, default is %(default)s')
    parser.add_argument('--dim', action='store', default=512, type=int, dest='dim',
                        help='Dim of hidden units, default is %(default)s')
    parser.add_argument('--bs', action='store', default=128, type=int, dest='batch_size',
                        help='Train batch size, default is %(default)s')
    parser.add_argument('--valid_bs', action='store', default=128, type=int, dest='valid_batch_size',
                        help='Valid batch size, default is %(default)s')
    parser.add_argument('--dim_word', action='store', default=512, type=int, dest='dim_word',
                        help='Dim of word embedding, default is %(default)s')
    parser.add_argument('--maxlen', action='store', default=80, type=int, dest='maxlen',
                        help='Max sentence length, default is %(default)s')
    parser.add_argument('-S', action='store_false', default=True, dest='shuffle',
                        help='Shuffle data per epoch, default is True, set to False')
    parser.add_argument('--train1', action='store', metavar='filename', dest='train1', type=str,
                        default='filtered_en-fr.en',
                        help='Source train file, default is %(default)s')
    parser.add_argument('--train2', action='store', metavar='filename', dest='train2', type=str,
                        default='filtered_en-fr.fr',
                        help='Target train file, default is %(default)s')
    parser.add_argument('--small1', action='store', metavar='filename', dest='small1', type=str,
                        default='small_en-fr.en',
                        help='Source small train file, default is %(default)s')
    parser.add_argument('--small2', action='store', metavar='filename', dest='small2', type=str,
                        default='small_en-fr.fr',
                        help='Target small train file, default is %(default)s')
    parser.add_argument('--valid1', action='store', metavar='filename', dest='valid1', type=str,
                        default='dev_en.tok',
                        help='Source valid file, default is %(default)s')
    parser.add_argument('--valid2', action='store', metavar='filename', dest='valid2', type=str,
                        default='dev_fr.tok',
                        help='Target valid file, default is %(default)s')
    parser.add_argument('--dic1', action='store', metavar='filename', dest='dic1', type=str,
                        default='filtered_dic_en-fr.en.pkl',
                        help='Source dict file, default is %(default)s')
    parser.add_argument('--dic2', action='store', metavar='filename', dest='dic2', type=str,
                        default='filtered_dic_en-fr.fr.pkl',
                        help='Target dict file, default is %(default)s')
    parser.add_argument('--n_words_src', action='store', default=30000, type=int, dest='n_words_src',
                        help='Vocabulary size on the source side, default is %(default)s')
    parser.add_argument('--n_words_tgt', action='store', default=30000, type=int, dest='n_words_tgt',
                        help='Vocabulary size on the target side, default is %(default)s')

    parser.add_argument('model_file', nargs='?', default='model/baseline/baseline.npz',
                        help='Generated model file, default is "%(default)s"')
    parser.add_argument('pre_load_file', nargs='?', default='model/en2fr.iter160000.npz',
                        help='Pre-load model file, default is "%(default)s"')
    parser.add_argument('--src_vocab_map', action='store', metavar='filename', dest='src_vocab_map_file', type=str,
                        default=None, help='The file containing source vocab mapping information, '
                                           'used to initialize a model on a large dataset from a small one')
    parser.add_argument('--tgt_vocab_map', action='store', metavar='filename', dest='tgt_vocab_map_file', type=str,
                        default=None, help='The file containing target vocab mapping information, '
                                           'used to initialize a model on a large dataset from a small one')

    parser.add_argument('--enc', action='store', default=1, type=int, dest='n_encoder_layers',
                        help='Number of encoder layers, default is 1')
    parser.add_argument('--dec', action='store', default=1, type=int, dest='n_decoder_layers',
                        help='Number of decoder layers, default is 1')
    parser.add_argument('--conn', action='store', default=2, type=int, dest='connection_type',
                        help='Connection type, '
                             'default is 2 (bidirectional only in first layer, other layers are forward);'
                             '1 is divided bidirectional GRU')
    parser.add_argument('--max_epochs', action='store', default=100, type=int, dest='max_epochs',
                        help='Maximum number of epochs, default is 100')
    parser.add_argument('--unit', action='store', metavar='unit', dest='unit', type=str, default='lstm',
                        help='The unit type, default is "lstm", can be set to "gru".')
    parser.add_argument('--attention', action='store', metavar='index', dest='attention_layer_id', type=int, default=0,
                        help='Attention layer index, default is 0')
    parser.add_argument('--residual_enc', action='store', metavar='type', dest='residual_enc', type=str, default=None,
                        help='Residual connection of encoder, default is None, candidates are "layer_wise", "last"')
    parser.add_argument('--residual_dec', action='store', metavar='type', dest='residual_dec', type=str,
                        default='layer_wise',
                        help='Residual connection of decoder, default is "layer_wise", candidates are None, "last"')
    parser.add_argument('-z', '--zigzag', action='store_false', default=True, dest='use_zigzag',
                        help='Use zigzag in encoder, default is True, set to False')
    parser.add_argument('--dropout', action="store", metavar="dropout", dest="dropout", type=float, default=False,
                        help='Dropout rate, default is False (not use dropout)')
    parser.add_argument('--unit_size', action='store', default=2, type=int, dest='unit_size',
                        help='Number of unit size, default is %(default)s')
    # TODO: rename this option to decoder_unit_size in future
    parser.add_argument('--cond_unit_size', action='store', default=2, type=int, dest='cond_unit_size',
                        help='Number of decoder unit size (will rename in future), default is %(default)s')
    parser.add_argument('--clip', action='store', metavar='clip', dest='clip', type=float, default=1.0,
                        help='Gradient clip rate, default is 1.0.')
    parser.add_argument('--manual', action='store_false', dest='auto', default=True,
                        help='Set dropout rate and grad clip rate manually.')
    parser.add_argument('--emb', action='store', metavar='filename', dest='given_embedding', type=str, default=None,
                        help='Given embedding model file, default is None')
    parser.add_argument('--lr_discount', action='store', metavar='freq', dest='lr_discount_freq', type=int,
                        default=-1, help='The learning rate discount frequency, default is -1')

    parser.add_argument('--distribute', action='store', metavar='type', dest='dist_type', type=str, default=None,
                        help='The distribution version, default is None (single GPU mode), candidates are "mv", "mpi_reduce"')
    parser.add_argument('--nccl', action="store_true", default=False, dest='nccl',
                        help='Use NCCL in distributed mode, default to False, set to True')
    parser.add_argument('--clip_grads_local', action="store_true", default=False, dest='clip_grads_local',
                        help='Whether to clip grads in distributed mode, default to False, set to True')
    parser.add_argument('--recover_lr_iter', action='store', dest='dist_recover_lr', type=int, default=10000,
                        help='The mini-batch index at which to recover the learning rate in distributed mode, default is 10000.')

    parser.add_argument('--all_att', action='store_true', dest='all_att', default=False,
                        help='Generate attention from all decoder layers, default is False, set to True')
    parser.add_argument('--avg_ctx', action='store_true', dest='avg_ctx', default=False,
                        help='Average all context vectors to get softmax, default is False, set to True')
    parser.add_argument('--dataset', action='store', dest='dataset', default='en-fr',
                        help='Dataset, default is "%(default)s"')
    parser.add_argument('--gpu_map_file', action='store', metavar='filename', dest='gpu_map_file', type=str,
                        default=None, help='The file containing gpu id mapping information, '
                                           'each line is in the form physical_gpu_id\\theano_id')
    parser.add_argument('--ft_patience', action='store', metavar='N', dest='fine_tune_patience', type=int, default=-1,
                        help='Fine tune patience, default is %(default)s, set 8 to enable it')
    parser.add_argument('--valid_freq', action='store', metavar='N', dest='valid_freq', type=int, default=5000,
                        help='Validation frequency, default is 5000')
    parser.add_argument('--trg_att', action='store', metavar='N', dest='trg_attention_layer_id', type=int, default=None,
                        help='Target attention layer id, default is None (not use target attention)')
    parser.add_argument('--fix_dp_bug', action="store_true", default=False, dest='fix_dp_bug',
                        help='Fix previous dropout bug, default to False, set to True')
    parser.add_argument('--abandon_imm', action="store_true", default=False, dest='abandon_imm',
                        help='Do not load previously saved immediate (optimizer) params, default to False, set to True')
    parser.add_argument('--tp', action="store", metavar="temperature", dest="temperature", type=float, default=1.0,
                        help='temperature, default is %(default)s')
    parser.add_argument('--scale', action="store", metavar="scale", dest="scale", type=float, default=1.0,
                        help='scale, default is %(default)s')
    parser.add_argument('--gate_dp', action="store", metavar="gate_dropout", dest="gate_dropout", type=float, default=1.0,
                        help='gate_dropout, default is %(default)s')

    args = parser.parse_args()
    print args

    if args.residual_enc == 'None':
        args.residual_enc = None
    if args.residual_dec == 'None':
        args.residual_dec = None
    if args.dist_type != 'mv' and args.dist_type != 'mpi_reduce':
        args.dist_type = None

    # FIXME: Auto mode
    if args.auto:
        if args.n_encoder_layers <= 2:
            args.dropout = False
            args.clip = 1.0
        else:
            args.dropout = 0.1
            args.clip = 5.0

        if args.n_encoder_layers <= 1:
            args.residual_enc = None
        if args.n_decoder_layers <= 1:
            args.residual_dec = None
            args.attention_layer_id = 0

        args.cond_unit_size = args.unit_size

    # If the dataset is not 'en-fr', dataset options such as 'args.train1' are overridden by the presets in Datasets
    if args.dataset != 'en-fr':
        args.train1, args.train2, args.small1, args.small2, args.valid1, args.valid2, args.valid3, args.test1, args.test2, args.dic1, args.dic2 = \
            Datasets[args.dataset]

    print 'Command line arguments:'
    print args
    sys.stdout.flush()

    # Init multiverso or mpi and set theano flags.
    if args.dist_type == 'mv':
        try:
            import multiverso as mv
        except ImportError:
            import libs.multiverso_ as mv

        # FIXME: This must come before the import of theano!
        mv.init(sync=True)
        worker_id = mv.worker_id()
        workers_cnt = mv.workers_num()
    elif args.dist_type == 'mpi_reduce':
        from mpi4py import MPI

        communicator = MPI.COMM_WORLD
        worker_id = communicator.Get_rank()
        workers_cnt = communicator.Get_size()

    if args.dist_type:
        available_gpus = get_gpu_usage(workers_cnt)
        gpu_maps_info = {idx: idx for idx in available_gpus}
        if args.gpu_map_file:
            for line in open(os.path.join('resources', args.gpu_map_file), 'r'):
                phy_id, theano_id = line.split()
                gpu_maps_info[int(phy_id)] = int(theano_id)
        theano_id = gpu_maps_info[available_gpus[worker_id]]
        print 'worker id:%d, using theano id:%d, physical id %d' % (worker_id, theano_id, available_gpus[worker_id])
        os.environ['THEANO_FLAGS'] = 'device=cuda{},floatX=float32'.format(theano_id)
        sys.stdout.flush()

    from libs.nmt import train

    train(
        max_epochs=args.max_epochs,
        saveto=args.model_file,
        preload=args.pre_load_file,
        reload_=args.reload,
        dim_word=args.dim_word,
        dim=args.dim,
        decay_c=0.,
        clip_c=args.clip,
        lrate=args.learning_rate,
        optimizer=args.optimizer,
        maxlen=args.maxlen,
        batch_size=args.batch_size,
        valid_batch_size=args.valid_batch_size,
        dispFreq=1,
        saveFreq=args.save_freq,
        validFreq=args.valid_freq,
        datasets=(r'data/train/{}'.format(args.train1),
                  r'data/train/{}'.format(args.train2)),
        valid_datasets=(r'data/dev/{}'.format(args.valid1),
                        r'data/dev/{}'.format(args.valid2)),
        small_train_datasets=(r'data/test/{}'.format(args.small1),
                              r'data/test/{}'.format(args.small2),
                              r'data/test/{}'.format(args.test2)),
        vocab_filenames=(r'data/dic/{}'.format(args.dic1),
                         r'data/dic/{}'.format(args.dic2)),
        task=args.dataset,
        use_dropout=args.dropout,
        overwrite=False,
        n_words=args.n_words_tgt,
        n_words_src=args.n_words_src,

        # Options from v-yanfa
        dump_before_train=args.dump_before_train,
        plot_graph=args.plot,
        lr_discount_freq=args.lr_discount_freq,

        n_encoder_layers=args.n_encoder_layers,
        n_decoder_layers=args.n_decoder_layers,
        encoder_many_bidirectional=args.connection_type == 1,

        attention_layer_id=args.attention_layer_id,
        unit=args.unit,
        residual_enc=args.residual_enc,
        residual_dec=args.residual_dec,
        use_zigzag=args.use_zigzag,
        given_embedding=args.given_embedding,

        unit_size=args.unit_size,
        cond_unit_size=args.cond_unit_size,

        given_imm=not args.abandon_imm,
        dump_imm=True,
        shuffle_data=args.shuffle,

        decoder_all_attention=args.all_att,
        average_context=args.avg_ctx,

        dist_type=args.dist_type,
        dist_recover_lr_iter=args.dist_recover_lr,

        fine_tune_patience=args.fine_tune_patience,
        nccl=args.nccl,
        src_vocab_map_file=args.src_vocab_map_file,
        tgt_vocab_map_file=args.tgt_vocab_map_file,

        trg_attention_layer_id=args.trg_attention_layer_id,
        dev_bleu_freq=args.dev_bleu_freq,
        fix_dp_bug=args.fix_dp_bug,
        temperature=args.temperature,
        scale=args.scale,
        gate_dropout=args.gate_dropout,
    )
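
Note that the parser is created with fromfile_prefix_chars='@', a standard argparse feature: any subset of these options can be kept in a text file, one argument per line, and pulled in as, e.g., python main.py @experiment.args (both file names here are placeholders).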
Example #6
    c1 = T.maximum(0, conv.conv2d(x, w_c1) + b_c1.dimshuffle('x', 0, 'x', 'x'))
    p1 = downsample.max_pool_2d(c1, (3, 3))

    c2 = T.maximum(0,
                   conv.conv2d(p1, w_c2) + b_c2.dimshuffle('x', 0, 'x', 'x'))
    p2 = downsample.max_pool_2d(c2, (2, 2))

    p2_flat = p2.flatten(2)
    h3 = T.maximum(0, T.dot(p2_flat, w_h3) + b_h3)
    p_y_given_x = T.nnet.softmax(T.dot(h3, w_o) + b_o)
    return p_y_given_x


# MULTIVERSO: you should call mv.init before calling other Multiverso APIs
mv.init()
# MULTIVERSO: every process has a distinct worker id
worker_id = mv.worker_id()
# MULTIVERSO: mv.workers_num returns the number of workers
workers_num = mv.workers_num()

w_c1 = init_weights((4, 3, 3, 3), name="w_c1")
b_c1 = init_weights((4, ), name="b_c1")
w_c2 = init_weights((8, 4, 3, 3), name="w_c2")
b_c2 = init_weights((8, ), name="b_c2")
w_h3 = init_weights((8 * 4 * 4, 100), name="w_h3")
b_h3 = init_weights((100, ), name="b_h3")
w_o = init_weights((100, 10), name="w_o")
b_o = init_weights((10, ), name="b_o")

params = [w_c1, b_c1, w_c2, b_c2, w_h3, b_h3, w_o, b_o]

p_y_given_x = model(x, *params)
Example #7
def train(dim_word=100,  # word vector dimensionality
          dim=1000,  # the number of LSTM units
          encoder='gru',
          decoder='gru_cond',
          n_words_src=30000,
          n_words=30000,
          max_epochs=5000,
          finish_after=10000000,  # finish after this many updates
          dispFreq=100,
          decay_c=0.,  # L2 regularization penalty
          alpha_c=0.,  # alignment regularization
          clip_c=-1.,  # gradient clipping threshold
          lrate=1.,  # learning rate
          maxlen=100,  # maximum length of the description
          optimizer='rmsprop',
          batch_size=16,
          valid_batch_size=80,
          saveto='model.npz',
          saveFreq=1000,  # save the parameters after every saveFreq updates
          validFreq=2500,
          dev_bleu_freq=20000,
          datasets=('/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.en.tok',
                    '/data/lisatmp3/chokyun/europarl/europarl-v7.fr-en.fr.tok'),
          valid_datasets=('./data/dev/dev_en.tok',
                          './data/dev/dev_fr.tok'),
          small_train_datasets=('./data/train/small_en-fr.en',
                                './data/train/small_en-fr.fr',
                                './data/train/small_en-fr.fr'),
          use_dropout=False,
          reload_=False,
          overwrite=False,
          preload='',

          # Options below are from v-yanfa
          dump_before_train=True,
          plot_graph=None,
          vocab_filenames=('./data/dic/filtered_dic_en-fr.en.pkl',
                           './data/dic/filtered_dic_en-fr.fr.pkl'),
          map_filename='./data/dic/mapFullVocab2Top1MVocab.pkl',
          lr_discount_freq=80000,

          # Options of deeper encoder and decoder
          n_encoder_layers=1,
          n_decoder_layers=1,
          encoder_many_bidirectional=True,

          attention_layer_id=0,
          unit='gru',
          residual_enc=None,
          residual_dec=None,
          use_zigzag=False,

          initializer='orthogonal',
          given_embedding=None,

          dist_type=None,
          dist_recover_lr_iter=False,

          unit_size=2,
          cond_unit_size=2,

          given_imm=False,
          dump_imm=False,
          shuffle_data=False,

          decoder_all_attention=False,
          average_context=False,
          task='en-fr',

          fine_tune_patience=8,
          nccl=False,
          src_vocab_map_file=None,
          tgt_vocab_map_file=None,

          trg_attention_layer_id=None,
          fix_dp_bug=False,
          temperature=1.0,
          scale=1.0,
          gate_dropout=0.0,
          ):
    model_options = locals().copy()

    # Set distributed computing environment
    worker_id = 0
    if dist_type == 'mv':
        try:
            import multiverso as mv
        except ImportError:
            from . import multiverso_ as mv

        worker_id = mv.worker_id()
    elif dist_type == 'mpi_reduce':
        from mpi4py import MPI
        mpi_communicator = MPI.COMM_WORLD
        worker_id = mpi_communicator.Get_rank()
        workers_cnt = mpi_communicator.Get_size()

        if nccl:
            nccl_comm = init_nccl_env(mpi_communicator)

    print 'Use {}, worker id: {}'.format('multiverso' if dist_type == 'mv' else 'mpi' if dist_type == 'mpi_reduce' else 'none', worker_id)
    sys.stdout.flush()

    # Set logging file
    set_logging_file('log/complete/e{}d{}_res{}_att{}_worker{}_task{}_{}.txt'.format(
        n_encoder_layers, n_decoder_layers, residual_enc, attention_layer_id,
        worker_id, task, time.strftime('%m-%d-%H-%M-%S'),
    ))

    log('''\
Start Time = {}
'''.format(
        time.strftime('%c'),
    ))

    # Model options: load and save
    message('Top options:')
    pprint(model_options)
    pprint(model_options, stream=get_logging_file())
    message('Done')
    sys.stdout.flush()

    #load_options(model_options, reload_, preload, src_vocab_map_file and tgt_vocab_map_file)
    check_options(model_options)
    model_options['cost_normalization'] = 1
    ada_alpha = 0.95
    if dist_type == 'mpi_reduce':
        model_options['cost_normalization'] = workers_cnt

    message('Model options:')
    pprint(model_options)
    pprint(model_options, stream=get_logging_file())
    message()

    print 'Loading data'
    log('\n\n\nStart to prepare data\n@Current Time = {}'.format(time.time()))
    sys.stdout.flush()

    dataset_src, dataset_tgt = datasets[0], datasets[1]

    if shuffle_data:
        text_iterator_list = [None for _ in range(10)]
        text_iterator = None
    else:
        text_iterator_list = None

        text_iterator = TextIterator(
            dataset_src, dataset_tgt,
            vocab_filenames[0], vocab_filenames[1],
            batch_size, n_words_src, n_words, maxlen
        )

    valid_iterator = TextIterator(
        valid_datasets[0], valid_datasets[1],
        vocab_filenames[0], vocab_filenames[1],
        valid_batch_size, n_words_src, n_words
    )

    small_train_iterator = TextIterator(
        small_train_datasets[0], small_train_datasets[1],
        vocab_filenames[0], vocab_filenames[1],
        valid_batch_size, n_words_src, n_words
    )

    print 'Building model'
    model = NMTModel(model_options)

    params = model.initializer.init_params()

    # Reload parameters
    if reload_ and os.path.exists(preload):
        print 'Reloading model parameters'
        load_params(preload, params, src_map_file = src_vocab_map_file, tgt_map_file = tgt_vocab_map_file)
    sys.stdout.flush()

    # Given embedding
    if given_embedding is not None:
        print 'Loading given embedding...',
        load_embedding(params, given_embedding)
        print 'Done'

    print_params(params)

    model.init_tparams(params)

    # Build model, stochastic_mode = 0(soft), 1(stochastic), 2(hard)
    trng, use_noise, stochastic_mode, hyper_param,\
        x, x_mask, y, y_mask, \
        opt_ret, \
        cost, test_cost, x_emb, stochastic_updates,_ = model.build_model()
    inps = [x, x_mask, y, y_mask]

    all_stochastic_updates = OrderedDictUpdates()
    for item1 in stochastic_updates:
        for item2 in item1:
            all_stochastic_updates.update(item2)

    print 'Building sampler'
    f_init, f_next = model.build_sampler(trng=trng, use_noise=use_noise, batch_mode=True, stochastic_mode=stochastic_mode, hyper_param=hyper_param)
    stochastic_mode.set_value(1)

    # before any regularizer
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, cost, profile=profile, updates=all_stochastic_updates)
    print 'Done'
    sys.stdout.flush()
    test_cost = test_cost.mean()  # FIXME: do not regularize test_cost here

    cost = cost.mean()

    cost = l2_regularization(cost, model.P, decay_c)

    cost = regularize_alpha_weights(cost, alpha_c, model_options, x_mask, y_mask, opt_ret)

    print 'Building f_cost...',
    f_cost = theano.function(inps, test_cost, profile=profile, updates=all_stochastic_updates)
    print 'Done'

    if plot_graph is not None:
        print 'Plotting post-compile graph...',
        theano.printing.pydotprint(
            f_cost,
            outfile='pictures/post_compile_{}'.format(plot_graph),
            var_with_name_simple=True,
        )
        print 'Done'

    print 'Computing gradient...',
    grads = tensor.grad(cost, wrt=itemlist(model.P))

    clip_shared = theano.shared(np.array(clip_c, dtype=fX), name='clip_shared')

    if dist_type != 'mpi_reduce':  # build gradient clipping into the computational graph
        grads, g2 = clip_grad_remove_nan(grads, clip_shared, model.P)
    else:  # clip the gradients after they are aggregated
        g2 = None

    # compile the optimizer, the actual computational graph is compiled here
    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',

    given_imm_data = get_adadelta_imm_data(optimizer, given_imm, preload)

    if optimizer == 'adadelta':
        f_grad_shared, f_update, grads_shared, imm_shared = Optimizers[optimizer](
            lr, model.P, grads, inps, cost, g2=g2, given_imm_data=given_imm_data,
            alpha=ada_alpha, all_stochastic_updates=all_stochastic_updates)

    if optimizer == 'adam':
        f_grad_shared, f_update, grads_shared, imm_shared = Optimizers[optimizer](
            lr, model.P, grads, inps, cost, g2=g2, given_imm_data=given_imm_data,
            all_stochastic_updates=all_stochastic_updates)

    print 'Done'

    if dist_type == 'mpi_reduce':
        f_grads_clip = make_grads_clip_func(grads_shared=grads_shared, mt_tparams=model.P, clip_c_shared=clip_shared)

    print 'Optimization'
    log('Preparation Done\n@Current Time = {}'.format(time.time()))

    if dist_type == 'mv':
        mv.barrier()
    elif dist_type == 'mpi_reduce':
        # create receive buffers for MPI allreduce
        rec_grads = [np.zeros_like(p.get_value()) for p in model.P.itervalues()]

    estop = False
    history_errs = []
    best_bleu = -1.0
    best_valid_cost = 1e6
    best_p = None
    bad_counter = 0
    uidx = search_start_uidx(reload_, preload)

    epoch_n_batches = 0
    start_epoch = 0
    pass_batches = 0

    print 'worker', worker_id, 'uidx', uidx, 'l_rate', lrate, 'ada_alpha', ada_alpha, 'n_batches', epoch_n_batches, 'start_epoch', start_epoch, 'pass_batches', pass_batches

    start_uidx = uidx

    if dump_before_train:
        print 'Dumping before train...',
        saveto_uidx = '{}.iter{}.npz'.format(
            os.path.splitext(saveto)[0], uidx)
        np.savez(saveto_uidx, history_errs=history_errs,
                 uidx=uidx, **unzip(model.P))
        save_options(model_options, uidx, saveto)
        print 'Done'
        sys.stdout.flush()

    stochastic_mode.set_value(0)
    valid_cost = validation(valid_iterator, f_cost, use_noise)
    small_train_cost = validation(small_train_iterator, f_cost, use_noise)
    message('Soft Valid cost {:.5f} Small train cost {:.5f}'.format(valid_cost, small_train_cost))
    stochastic_mode.set_value(1)
    #new_bleu = translate_dev_get_bleu(model, f_init, f_next, trng, use_noise, 5, 1.0)
    #best_bleu = new_bleu
    #message('BLEU = {:.2f} at uidx {}'.format(new_bleu, uidx))
    sys.stdout.flush()
    
    commu_time_sum = 0.0
    cp_time_sum = 0.0
    reduce_time_sum = 0.0

    start_time = time.time()
    finetune_cnt = 0

    for eidx in xrange(start_epoch, max_epochs):
        if shuffle_data:
            text_iterator = load_shuffle_text_iterator(
                eidx, worker_id, text_iterator_list,
                datasets, vocab_filenames, batch_size, maxlen, n_words_src, n_words
            )
        n_samples = 0
        if dist_type == 'mpi_reduce':
            mpi_communicator.Barrier()

        for i, (x, y) in enumerate(text_iterator):
            if eidx == start_epoch and i < pass_batches:  # skip batches already seen when reloading
                continue
            n_samples += len(x)
            uidx += 1

            use_noise.set_value(1.)

            x, x_mask, y, y_mask = prepare_data(x, y, maxlen=maxlen)

            if x is None:
                print 'Minibatch with zero samples under length ', maxlen
                uidx -= 1
                continue

            effective_uidx = uidx - start_uidx
            ud_start = time.time()

            # compute cost, grads
            if dist_type != 'mpi_reduce':
                cost, g2_value = f_grad_shared(x, x_mask, y, y_mask)
            else:
                cost = f_grad_shared(x, x_mask, y, y_mask)

            if dist_type == 'mpi_reduce':
                reduce_start = time.time()
                commu_time = 0
                gpucpu_cp_time = 0
                if not nccl:
                    commu_time, gpucpu_cp_time = all_reduce_params(grads_shared, rec_grads)
                else:
                    commu_time, gpucpu_cp_time = all_reduce_params_nccl(nccl_comm, grads_shared)
                reduce_time = time.time() - reduce_start
                commu_time_sum += commu_time
                reduce_time_sum += reduce_time
                cp_time_sum += gpucpu_cp_time

                g2_value = f_grads_clip()
                print '@Worker = {}, Reduce time = {:.5f}, Commu time = {:.5f}, Copy time = {:.5f}'.format(worker_id, reduce_time, commu_time, gpucpu_cp_time)

            # learning-rate warm-up in distributed mode: start at 5% of lrate and
            # recover linearly over the first dist_recover_lr_iter updates
            if not dist_type or effective_uidx > dist_recover_lr_iter:
                curr_lr = lrate
            else:
                curr_lr = lrate * 0.05 + effective_uidx * lrate / dist_recover_lr_iter * 0.95
            if curr_lr < lrate:
                print 'Curr lr {:.3f}'.format(curr_lr)

            # do the update on parameters
            f_update(curr_lr)

            ud = time.time() - ud_start

            if np.isnan(g2_value) or np.isinf(g2_value):
                message('gradient NaN detected')
                sys.stdout.flush()
                
            if np.isnan(cost) or np.isinf(cost):
                message('cost NaN detected')
                model.save_model(saveto, history_errs, uidx)
                save_minibatch(x, y, saveto, uidx, vocab_filenames)
                sys.stdout.flush()

                return 1., 1., 1.

            # discount learning rate
            # FIXME: Do NOT enable this and fine-tune at the same time
            if lr_discount_freq > 0 and np.mod(effective_uidx, lr_discount_freq) == 0:
                lrate *= 0.5
                message('Discount learning rate to {} at iteration {}'.format(lrate, uidx))

            # sync batch
            if dist_type == 'mv' and np.mod(uidx, dispFreq) == 0:
                comm_start = time.time()
                model.sync_tparams()
                message('@Comm time = {:.5f}'.format(time.time() - comm_start))

            # verbose
            if np.mod(effective_uidx, dispFreq) == 0:
                message('Worker {} Epoch {} Update {} Cost {:.5f} G2 {:.5f} UD {:.5f} Time {:.5f} s'.format(
                    worker_id, eidx, uidx, float(cost), float(g2_value), ud, time.time() - start_time,
                ))
                sys.stdout.flush()

            if np.mod(effective_uidx, saveFreq) == 0 and worker_id == 0:
                # save with uidx
                if not overwrite:
                    print 'Saving the model at iteration {}...'.format(uidx),
                    model.save_model(saveto, history_errs, uidx)
                    print 'Done'
                    sys.stdout.flush()

                # save immediate data in adadelta
                saveto_imm_path = '{}_latest.npz'.format(os.path.splitext(saveto)[0])
                dump_adadelta_imm_data(optimizer, imm_shared, dump_imm, saveto_imm_path)

            if np.mod(effective_uidx, validFreq) == 0:
                stochastic_mode.set_value(0)
                valid_cost = validation(valid_iterator, f_cost, use_noise)
                small_train_cost = validation(small_train_iterator, f_cost, use_noise)
                message('Soft Valid cost {:.5f} Small train cost {:.5f}'.format(valid_cost, small_train_cost))
                #new_bleu = translate_dev_get_bleu(model, f_init, f_next, trng, use_noise, 5, 1.0)
                #message('BLEU = {:.2f} at uidx {}'.format(new_bleu, uidx))
                sys.stdout.flush()

                #if new_bleu > best_bleu:
                #    print 'Saving the model at iteration {}...'.format(uidx),
                #    model.save_model(saveto, history_errs, uidx)
                #    print 'Done'
                #    best_bleu = new_bleu
                #    sys.stdout.flush()
                    
                stochastic_mode.set_value(1)


                # Fine-tune based on dev cost
                if fine_tune_patience > 0:
                    if valid_cost < best_valid_cost:
                        bad_counter = 0
                        best_valid_cost = valid_cost
                        #dump the best model so far, including the immediate file
                        if worker_id == 0:
                            message('Dump the best model so far at uidx {}'.format(uidx))
                            model.save_model(saveto, history_errs)
                            #dump_adadelta_imm_data(optimizer, imm_shared, dump_imm, saveto)
                    else:
                        bad_counter += 1
                        if bad_counter >= fine_tune_patience:
                            print 'Fine tune:',
                            if finetune_cnt % 2 == 0:
                                lrate = np.float32(lrate * 0.5)
                                message('Discount learning rate to {} at iteration {}'.format(lrate, uidx))
                                if lrate <= 0.025:
                                    message('Learning rate decayed to {:.5f}, task completed'.format(lrate))
                                    return 1., 1., 1.
                            else:
                                clip_shared.set_value(np.float32(clip_shared.get_value() * 0.25))
                                message('Discount clip value to {} at iteration {}'.format(clip_shared.get_value(), uidx))
                            finetune_cnt += 1
                            bad_counter = 0


            # finish after this many updates
            if uidx >= finish_after:
                print 'Finishing after {} iterations!'.format(uidx)
                estop = True
                break

        print 'Seen {} samples'.format(n_samples)

        if estop:
            break

    if best_p is not None:
        zipp(best_p, model.P)

    use_noise.set_value(0.)

    return 0.
Example #8
                             std=[x / 255 for x in [63.0, 62.1, 66.7]])
    ])),
                                          batch_size=args.batch_size,
                                          shuffle=False,
                                          **kwargs)

model = resnet.resnet20()
criterion = torch.nn.CrossEntropyLoss()

# if args.ngpu > 1:
#         model = torch.nn.DataParallel(model, device_ids=list(range(args.ngpu)))
if args.parallel:
    model = torchmodel.MVTorchModel(model)

if args.cuda:
    device = devs[0] if args.parallel else devs[mv.worker_id()]
    model.cuda(device)
    criterion.cuda(device)

optimizer = optim.SGD(model.parameters(),
                      lr=args.lr,
                      momentum=args.momentum,
                      weight_decay=args.decay)
scheduler = lr_scheduler.MultiStepLR(
    optimizer,
    milestones=[int(i) for i in args.lr_decay.split(',')],
    gamma=0.1)


def train(epoch):
    model.train()
Example #9
parser.add_argument('-s', '--sync', action='store_true',
                    help="sync mode (default: False)", default=False)
parser.add_argument('-b', '--batch-size', type=int,
                    help="batch size (default: 128)", default=128)
parser.add_argument('-e', '--epoches', type=int,
                    help="number of epochs (default: 82)", default=82)
args = parser.parse_args()
print(args)


# MULTIVERSO: import multiverso
import multiverso as mv

# MULTIVERSO: you should call mv.init before calling other Multiverso APIs
mv.init(sync=args.sync)
# MULTIVERSO: every process has distinct worker id
worker_id = mv.worker_id()
# MULTIVERSO: mv.workers_num will return the number of workers
workers_num = mv.workers_num()
# NOTICE: To use multiple GPUs, we must set the environment before importing theano.
if "THEANO_FLAGS" not in os.environ:
    os.environ["THEANO_FLAGS"] = 'floatX=float32,device=gpu%d,lib.cnmem=1' % worker_id

import numpy as np
import theano
import theano.tensor as T
import lasagne
from multiverso.theano_ext.lasagne_ext import param_manager

# for the larger networks (n>=9), we need to raise Python's recursion limit
sys.setrecursionlimit(10000)
Example #10
def sgd_optimization_mnist(learning_rate=0.13,
                           n_epochs=1000,
                           dataset='mnist.pkl.gz',
                           batch_size=600):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model

    This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # MULTIVERSO: you should call mv.init before calling other Multiverso APIs
    mv.init()
    # MULTIVERSO: every process has distinct worker id
    worker_id = mv.worker_id()

    # MULTIVERSO: mv.workers_num will return the number of workers
    total_worker = mv.workers_num()

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # generate symbolic variables for input (x and y represent a
    # minibatch)
    x = T.matrix('x')  # data, presented as rasterized images
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    # construct the logistic regression class
    # Each MNIST image has size 28*28
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # compute the gradient of cost with respect to theta = (W,b)
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # start-snippet-3
    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # compiling a Theano function `train_model` that returns the cost, but in
    # the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-3

    ###############
    # TRAIN MODEL #
    ###############
    print('... training the model')
    validation_frequency = n_train_batches
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            # MULTIVERSO: we distribute the batches across workers; each
            # worker only trains the batches that belong to it
            if minibatch_index % total_worker == worker_id:
                minibatch_avg_cost = train_model(minibatch_index)
                # MULTIVERSO: commit the parameter deltas produced by the
                # mv_shared variables and fetch the latest parameters from
                # the parameter server
                sharedvar.sync_all_mv_shared_vars()

            iter = (epoch - 1) * n_train_batches + minibatch_index

            # MULTIVERSO: only master worker will output the model
            if mv.is_master_worker() and (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       validation_loss * 100.))
        # MULTIVERSO: all the workers will synchronize at the place you call barrier
        mv.barrier()

    # MULTIVERSO: make sure only one process outputs the result;
    # otherwise it will be printed once per worker
    if mv.is_master_worker():
        end_time = timeit.default_timer()

        test_losses = [test_model(i) for i in range(n_test_batches)]
        test_score = numpy.mean(test_losses)

        print(('Optimization complete with validation score of %f %%, '
               'with test performance %f %%') %
              (validation_loss * 100., test_score * 100.))
        print('The code run for %d epochs, with %f epochs/sec' %
              (epoch, 1. * epoch / (end_time - start_time)))
        print(('The code for file ' + os.path.split(__file__)[1] +
               ' ran for %.1fs' % ((end_time - start_time))),
              file=sys.stderr)

        # save the model
        with open('model.pkl', 'wb') as f:
            pickle.dump(classifier, f)
    # MULTIVERSO: you must call shutdown at the end of the program
    mv.shutdown()