Example #1
0
def train_loop(args,
               logger,
               vocab,
               train_progs,
               infer_progs,
               optimizer,
               nccl2_num_trainers=1,
               nccl2_trainer_id=0,
               worker_endpoints=None):
    """Run the main language-model training loop.

    Trains with a ParallelExecutor across all visible devices, carrying the
    recurrent hidden/cell state between batches, and periodically logs
    perplexity, runs validation and saves checkpoints.

    Args:
        args: parsed command-line namespace (paths, sizes, intervals, ...).
        logger: logging.Logger used for progress reporting.
        vocab: vocabulary object forwarded to the data reader.
        train_progs: (train_prog, train_startup_prog, train_model) triple.
        infer_progs: (infer_prog, infer_startup_prog, infer_model) triple,
            forwarded to ``eval`` for periodic validation.
        optimizer: unused here; kept for interface compatibility.
        nccl2_num_trainers: number of NCCL2 trainers (distributed runs).
        nccl2_trainer_id: this trainer's NCCL2 rank.
        worker_endpoints: unused here; kept for interface compatibility.
    """
    train_prog, train_startup_prog, train_model = train_progs
    infer_prog, infer_startup_prog, infer_model = infer_progs

    # Select execution device and count devices for data parallelism.
    place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
    exe = Executor(place)
    if not args.use_gpu:
        place = fluid.CPUPlace()
        import multiprocessing
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    else:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()

    # Parameter init priority: full checkpoint > pretraining params > random.
    if args.load_dir:
        logger.info('load pretrained checkpoints from {}'.format(
            args.load_dir))
        fluid.io.load_persistables(exe, args.load_dir, main_program=train_prog)
    elif args.load_pretraining_params:
        logger.info('load pretrained params from {}'.format(
            args.load_pretraining_params))
        exe.run(train_startup_prog)
        init_pretraining_params(exe,
                                args.load_pretraining_params,
                                main_program=train_prog)
    else:
        exe.run(train_startup_prog)

    # Build the feeder from the model's declared feed order.
    feed_list = [
        train_prog.global_block().var(var_name)
        for var_name in train_model.feed_order
    ]
    feeder = fluid.DataFeeder(feed_list, place)

    logger.info('Training the model...')
    exe_strategy = fluid.parallel_executor.ExecutionStrategy()
    parallel_executor = fluid.ParallelExecutor(loss_name=train_model.loss.name,
                                               main_program=train_prog,
                                               use_cuda=bool(args.use_gpu),
                                               exec_strategy=exe_strategy,
                                               num_trainers=nccl2_num_trainers,
                                               trainer_id=nccl2_trainer_id)

    logger.info("begin to load data")
    train_data = data.BidirectionalLMDataset(args.train_path,
                                             vocab,
                                             test=(not args.shuffle),
                                             shuffle_on_load=args.shuffle)
    logger.info("finished load vocab")

    log_interval = args.log_interval
    total_time = 0.0
    batch_size = args.batch_size
    hidden_size = args.hidden_size

    # Negative-sampling ids/probabilities: at every (row, step) position,
    # sample k is id k with probability 1.0. Vectorized replacement for the
    # original triple Python loop (identical contents).
    # NOTE(review): these arrays are never read again inside this function —
    # confirm whether they are still needed at all.
    n_samples = args.n_negative_samples_batch + 1
    custom_samples_array = np.tile(np.arange(n_samples, dtype='int64'),
                                   (batch_size, args.num_steps, 1))
    custom_probabilities_array = np.ones(
        (batch_size, args.num_steps, n_samples), dtype='float32')

    start_time = time.time()
    train_data_iter = lambda: train_data.iter_batches(batch_size * dev_count,
                                                      args.num_steps)
    train_reader = read_multiple(train_data_iter, batch_size, dev_count)
    total_num = 0
    n_batch_loss = 0.0
    n_batch_cnt = 0
    # Recurrent state carried across batches: one flattened row per device.
    last_hidden_values = np.zeros(
        (dev_count, args.num_layers * 2 * batch_size * args.embed_size),
        dtype='float32')
    last_cell_values = np.zeros(
        (dev_count, args.num_layers * 2 * batch_size * hidden_size),
        dtype='float32')

    n_tokens_per_batch = args.batch_size * args.num_steps
    n_batches_per_epoch = int(args.all_train_tokens / n_tokens_per_batch)
    n_batches_total = args.max_epoch * n_batches_per_epoch
    begin_time = time.time()
    batch_id = 0  # defined even if the reader yields nothing
    for batch_id, batch_list in enumerate(train_reader(), 1):
        if batch_id > n_batches_total:
            break
        feed_data = batch_reader(batch_list, args)
        feed = list(feeder.feed_parallel(feed_data, dev_count))
        # Inject the carried-over recurrent state for each device.
        for i in range(dev_count):
            placex = fluid.CUDAPlace(i) if args.use_gpu else fluid.CPUPlace()
            init_hidden_tensor = fluid.core.LoDTensor()
            init_hidden_tensor.set(last_hidden_values[i], placex)
            init_cell_tensor = fluid.core.LoDTensor()
            init_cell_tensor.set(last_cell_values[i], placex)
            feed[i]['init_hiddens'] = init_hidden_tensor
            feed[i]['init_cells'] = init_cell_tensor

        fetch_outs = parallel_executor.run(feed=feed,
                                           fetch_list=[
                                               train_model.loss.name,
                                               train_model.last_hidden.name,
                                               train_model.last_cell.name
                                           ],
                                           return_numpy=False)
        # Convert each fetched LoDTensor exactly once; the loss values were
        # previously re-converted four times per step.
        loss_vals = np.array(fetch_outs[0])
        last_hidden_values = np.array(fetch_outs[1]).reshape(
            (dev_count, args.num_layers * 2 * batch_size * args.embed_size))
        last_cell_values = np.array(fetch_outs[2]).reshape(
            (dev_count, args.num_layers * 2 * batch_size * args.hidden_size))

        total_num += args.batch_size * dev_count
        n_batch_loss += loss_vals.sum()
        n_batch_cnt += len(loss_vals)

        # batch_id starts at 1 (enumerate start), so no `> 0` guard needed.
        if batch_id % log_interval == 0:
            smoothed_ppl = np.exp(n_batch_loss / n_batch_cnt)
            ppl = np.exp(loss_vals.sum() / len(loss_vals))
            used_time = time.time() - begin_time
            speed = log_interval / used_time
            logger.info(
                "[train] step:{}, loss:{:.3f}, ppl:{:.3f}, smoothed_ppl:{:.3f}, speed:{:.3f}"
                .format(batch_id, n_batch_loss / n_batch_cnt, ppl,
                        smoothed_ppl, speed))
            n_batch_loss = 0.0
            n_batch_cnt = 0
            begin_time = time.time()
        if batch_id % args.dev_interval == 0:
            valid_ppl = eval(vocab, infer_progs, dev_count, logger, args)
            logger.info("valid ppl {}".format(valid_ppl))
        if batch_id % args.save_interval == 0:
            # BUG FIX: the original read `epoch_id` here before it was ever
            # assigned (it was only set after the loop), so the first periodic
            # save raised NameError. Derive the current epoch explicitly.
            epoch_id = batch_id // n_batches_per_epoch
            model_path = os.path.join(args.para_save_dir,
                                      str(batch_id + epoch_id))
            if not os.path.isdir(model_path):
                os.makedirs(model_path)
            fluid.io.save_persistables(executor=exe,
                                       dirname=model_path,
                                       main_program=train_prog)

    end_time = time.time()
    total_time += end_time - start_time
    # Final checkpoint and evaluation after training finishes.
    epoch_id = batch_id // n_batches_per_epoch
    model_path = os.path.join(args.para_save_dir, str(epoch_id))
    if not os.path.isdir(model_path):
        os.makedirs(model_path)
    fluid.io.save_persistables(executor=exe,
                               dirname=model_path,
                               main_program=train_prog)
    valid_ppl = eval(vocab, infer_progs, dev_count, logger, args)
    logger.info("valid ppl {}".format(valid_ppl))
    # NOTE(review): `eval` reads args.test_path itself, so this "test" pass
    # sees the same data as validation, and its result is never used.
    test_ppl = eval(vocab, infer_progs, dev_count, logger, args)
Example #2
0
def eval(vocab, infer_progs, dev_count, logger, args):
    """Evaluate the language model on ``args.test_path`` and return perplexity.

    Runs the inference program device-by-device, carrying the recurrent
    hidden/cell state across batches, and returns exp(mean token loss).

    NOTE(review): this function shadows the builtin ``eval``; renaming would
    break existing callers (``train_loop``), so the name is kept.

    Args:
        vocab: vocabulary object forwarded to the data reader.
        infer_progs: (infer_prog, infer_startup_prog, infer_model) triple.
        dev_count: effectively ignored — it is recomputed below from the
            actual device configuration.
        logger: logging.Logger for progress output.
        args: parsed command-line namespace.

    Returns:
        float: perplexity over the whole evaluation set.
    """
    infer_prog, infer_startup_prog, infer_model = infer_progs
    feed_order = infer_model.feed_order
    loss = infer_model.loss

    # Select execution device; dev_count is recomputed from the environment,
    # overriding the value passed in by the caller.
    place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
    exe = Executor(place)
    if not args.use_gpu:
        place = fluid.CPUPlace()
        import multiprocessing
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    else:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()

    total_loss = 0.0
    total_cnt = 0
    n_batch_cnt = 0
    n_batch_loss = 0.0
    val_feed_list = [
        infer_prog.global_block().var(var_name) for var_name in feed_order
    ]
    val_feeder = fluid.DataFeeder(val_feed_list, place)
    dev_data = data.BidirectionalLMDataset(args.test_path,
                                           vocab,
                                           test=True,
                                           shuffle_on_load=False)
    dev_data_iter = lambda: dev_data.iter_batches(args.batch_size * dev_count,
                                                  args.num_steps)
    dev_reader = read_multiple(dev_data_iter, args.batch_size, dev_count)

    # Recurrent state carried across batches: one flattened row per device.
    last_hidden_values = np.zeros(
        (dev_count, args.num_layers * 2 * args.batch_size * args.embed_size),
        dtype='float32')
    last_cell_values = np.zeros(
        (dev_count, args.num_layers * 2 * args.batch_size * args.hidden_size),
        dtype='float32')
    for batch_id, batch_list in enumerate(dev_reader(), 1):
        feed_data = batch_reader(batch_list, args)
        feed = list(val_feeder.feed_parallel(feed_data, dev_count))
        # Inject the carried-over recurrent state for each device.
        for i in range(dev_count):
            placex = fluid.CUDAPlace(i) if args.use_gpu else fluid.CPUPlace()
            init_hidden_tensor = fluid.core.LoDTensor()
            init_hidden_tensor.set(last_hidden_values[i], placex)
            init_cell_tensor = fluid.core.LoDTensor()
            init_cell_tensor.set(last_cell_values[i], placex)
            feed[i]['init_hiddens'] = init_hidden_tensor
            feed[i]['init_cells'] = init_cell_tensor

        # Run each device's feed sequentially and collect its output state.
        last_hidden_values = []
        last_cell_values = []
        for i in range(dev_count):
            val_fetch_outs = exe.run(program=infer_prog,
                                     feed=feed[i],
                                     fetch_list=[
                                         infer_model.loss.name,
                                         infer_model.last_hidden.name,
                                         infer_model.last_cell.name
                                     ],
                                     return_numpy=False)
            # Convert the loss tensor once (previously converted three times).
            loss_vals = np.array(val_fetch_outs[0])
            last_hidden_values.append(np.array(val_fetch_outs[1]))
            last_cell_values.append(np.array(val_fetch_outs[2]))
            total_loss += loss_vals.sum()
            total_cnt += len(loss_vals)
            n_batch_loss += loss_vals.sum()
            n_batch_cnt += len(loss_vals)

        last_hidden_values = np.array(last_hidden_values).reshape(
            (dev_count,
             args.num_layers * 2 * args.batch_size * args.embed_size))
        last_cell_values = np.array(last_cell_values).reshape(
            (dev_count,
             args.num_layers * 2 * args.batch_size * args.hidden_size))

        log_every_n_batch = args.log_interval
        if log_every_n_batch > 0 and batch_id % log_every_n_batch == 0:
            logger.info('Average dev loss from batch {} to {} is {}'.format(
                batch_id - log_every_n_batch + 1, batch_id,
                "%.10f" % (n_batch_loss / n_batch_cnt)))
            n_batch_loss = 0.0
            n_batch_cnt = 0
        # (removed dead `batch_offset = 0` assignment — never read)

    ppl = np.exp(total_loss / total_cnt)
    return ppl