Ejemplo n.º 1
0
 def test_w_is_selected_rows(self):
     """Run the selected-rows check on GPU for both inplace modes."""
     gpu_place = core.CUDAPlace(0)
     # Skip entirely when the device cannot do float16 math.
     if not core.is_float16_supported(gpu_place):
         return
     for use_inplace in (True, False):
         self.check_with_place(gpu_place, use_inplace)
Ejemplo n.º 2
0
 def test_check_output(self):
     """Verify op output on GPU when CUDA and float16 are available."""
     if not core.is_compiled_with_cuda():
         return
     gpu = core.CUDAPlace(0)
     if core.is_float16_supported(gpu):
         self.check_output_with_place(gpu, atol=1e-1)
Ejemplo n.º 3
0
 def test_check_output(self):
     """Check op output on the GPU, but only if it supports float16."""
     gpu = core.CUDAPlace(0)
     if not core.is_float16_supported(gpu):
         return
     self.check_output_with_place(gpu)
Ejemplo n.º 4
0
 def test_check_output(self):
     """Compare op output on GPU 0; the eager-mode check is disabled."""
     self.check_output_with_place(core.CUDAPlace(0), check_eager=False)
Ejemplo n.º 5
0
 def test_check_grad_ingore_y(self):
     """Gradient check w.r.t. X only; Y is excluded from the grad set."""
     gpu = core.CUDAPlace(0)
     self.check_grad_with_place(gpu,
                                ['X'],
                                'Out',
                                no_grad_set={'Y'},
                                check_eager=False)
Ejemplo n.º 6
0
def main():
    """Train a VGG16 (BN + dropout) classifier from a RecordIO file.

    Reads module-level ``args`` (data_set, data_format, learning_rate,
    device, num_passes, batch_size) and prints per-pass loss, accuracy
    and training throughput.
    """
    # Pick class count and input layout from the CLI flags.
    if args.data_set == "cifar10":
        classdim = 10
        if args.data_format == 'NCHW':
            data_shape = [3, 32, 32]
        else:
            data_shape = [32, 32, 3]
    else:
        classdim = 102
        if args.data_format == 'NCHW':
            data_shape = [3, 224, 224]
        else:
            data_shape = [224, 224, 3]

    # Input data
    # NOTE(review): the reader shapes are hard-coded to [-1, 3, 224, 224]
    # and ignore the data_shape computed above — this looks wrong for the
    # cifar10 / NHWC configurations; confirm against the recordio layout.
    data_file = fluid.layers.open_recordio_file(filename='./train.recordio',
                                                shapes=[[-1, 3, 224, 224],
                                                        [-1, 1]],
                                                lod_levels=[0, 0],
                                                dtypes=['float32', 'int64'])
    # Double buffering overlaps host->device transfer with compute.
    data_file = fluid.layers.create_double_buffer_reader(reader=data_file,
                                                         place='CUDA:0')
    images, label = fluid.layers.read_file(data_file)

    # Train program: VGG features -> softmax FC -> cross-entropy loss.
    net = vgg16_bn_drop(images)
    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    # Evaluator: batch accuracy plus the batch size used for weighting.
    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(input=predict,
                                      label=label,
                                      total=batch_size_tensor)

    # inference program — cloned before the optimizer ops are appended.
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program(
            target_vars=[batch_acc, batch_size_tensor])

    # Optimization
    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
    opts = optimizer.minimize(avg_cost)

    # Reuse variable memory where lifetimes allow.
    fluid.memory_optimize(fluid.default_main_program())

    # Initialize executor
    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
    exe = fluid.Executor(place)

    # Parameter initialization
    exe.run(fluid.default_startup_program())

    iters = 0
    accuracy = fluid.average.WeightedAverage()
    for pass_id in range(args.num_passes):
        # train one full pass over the reader
        start_time = time.time()
        num_samples = 0
        accuracy.reset()
        while not data_file.eof():
            loss, acc, weight = exe.run(
                fluid.default_main_program(),
                fetch_list=[avg_cost, batch_acc, batch_size_tensor])
            accuracy.add(value=acc, weight=weight)
            iters += 1
            num_samples += args.batch_size
            print(
                "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f" %
                (pass_id, iters, loss, acc)
            )  # The accuracy is the accumulation of batches, but not the current batch.

        pass_elapsed = time.time() - start_time
        pass_train_acc = accuracy.eval()
        print(
            "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f\n"
            % (pass_id, num_samples / pass_elapsed, pass_train_acc))
Ejemplo n.º 7
0
    def test_run(self):
        """Compare ``basic_gru`` against the NumPy reference ``gru_np``.

        Builds a (possibly bidirectional, multi-layer) GRU graph,
        overwrites every layer's parameters with fresh random values while
        keeping NumPy copies, runs both implementations on the same random
        input, and asserts that outputs and last hidden states agree.
        """
        x = layers.data(name='x',
                        shape=[-1, self.batch_size, self.hidden_size],
                        dtype='float32')
        # NOTE(review): declared as float32 here but fed int64 data below
        # — confirm which dtype the sequence_length input actually expects.
        sequence_length = layers.data(name="sequence_length",
                                      shape=[-1],
                                      dtype='float32')

        rnn_out, last_hidden = basic_gru( x, None, self.hidden_size, num_layers=self.num_layers, \
                batch_first = self.batch_first, bidirectional=self.is_bidirect, sequence_length=sequence_length )

        # BUGFIX: the attribute was misspelled "persisbale", which silently
        # set an unused attribute instead of marking the vars persistable
        # (needed so they survive in the scope for fetching).
        last_hidden.persistable = True
        rnn_out.persistable = True

        if core.is_compiled_with_cuda():
            place = core.CUDAPlace(0)
        else:
            place = core.CPUPlace()

        exe = Executor(place)
        exe.run(framework.default_startup_program())

        param_list = fluid.default_main_program().block(0).all_parameters()

        # process weight and bias: re-randomize every layer's parameters
        # in the scope and keep NumPy copies for the reference computation.
        gate_weight = []
        gate_bias = []
        candidate_weight = []
        candidate_bias = []

        for i in range(self.num_layers):
            gate_w_name = "basic_gru_layers_" + str(i) + "/BasicGRUUnit_0.w_0"
            gate_b_name = "basic_gru_layers_" + str(i) + "/BasicGRUUnit_0.b_0"
            candidate_w_name = "basic_gru_layers_" + str(
                i) + "/BasicGRUUnit_0.w_1"
            candidate_b_name = "basic_gru_layers_" + str(
                i) + "/BasicGRUUnit_0.b_1"

            # The initial read is only used to discover the tensor's shape.
            gate_w = np.array(
                fluid.global_scope().find_var(gate_w_name).get_tensor())
            gate_w = np.random.uniform(-0.1, 0.1,
                                       size=gate_w.shape).astype('float32')
            fluid.global_scope().find_var(gate_w_name).get_tensor().set(
                gate_w, place)

            gate_b = np.array(
                fluid.global_scope().find_var(gate_b_name).get_tensor())
            gate_b = np.random.uniform(-0.1, 0.1,
                                       size=gate_b.shape).astype('float32')
            fluid.global_scope().find_var(gate_b_name).get_tensor().set(
                gate_b, place)

            candidate_w = np.array(
                fluid.global_scope().find_var(candidate_w_name).get_tensor())
            candidate_w = np.random.uniform(
                -0.1, 0.1, size=candidate_w.shape).astype('float32')
            fluid.global_scope().find_var(candidate_w_name).get_tensor().set(
                candidate_w, place)

            candidate_b = np.array(
                fluid.global_scope().find_var(candidate_b_name).get_tensor())
            candidate_b = np.random.uniform(
                -0.1, 0.1, size=candidate_b.shape).astype('float32')
            fluid.global_scope().find_var(candidate_b_name).get_tensor().set(
                candidate_b, place)

            gate_weight.append(gate_w)
            gate_bias.append(gate_b)
            candidate_weight.append(candidate_w)
            candidate_bias.append(candidate_b)

        if self.is_bidirect:
            # Same treatment for the reverse-direction layers.
            for i in range(self.num_layers):
                gate_w_name = "basic_gru_reverse_layers_" + str(
                    i) + "/BasicGRUUnit_0.w_0"
                gate_b_name = "basic_gru_reverse_layers_" + str(
                    i) + "/BasicGRUUnit_0.b_0"
                candidate_w_name = "basic_gru_reverse_layers_" + str(
                    i) + "/BasicGRUUnit_0.w_1"
                candidate_b_name = "basic_gru_reverse_layers_" + str(
                    i) + "/BasicGRUUnit_0.b_1"

                gate_w = np.array(
                    fluid.global_scope().find_var(gate_w_name).get_tensor())
                gate_w = np.random.uniform(-0.1, 0.1,
                                           size=gate_w.shape).astype('float32')
                fluid.global_scope().find_var(gate_w_name).get_tensor().set(
                    gate_w, place)

                gate_b = np.array(
                    fluid.global_scope().find_var(gate_b_name).get_tensor())
                gate_b = np.random.uniform(-0.1, 0.1,
                                           size=gate_b.shape).astype('float32')
                fluid.global_scope().find_var(gate_b_name).get_tensor().set(
                    gate_b, place)

                candidate_w = np.array(fluid.global_scope().find_var(
                    candidate_w_name).get_tensor())
                candidate_w = np.random.uniform(
                    -0.1, 0.1, size=candidate_w.shape).astype('float32')
                fluid.global_scope().find_var(
                    candidate_w_name).get_tensor().set(candidate_w, place)

                candidate_b = np.array(fluid.global_scope().find_var(
                    candidate_b_name).get_tensor())
                candidate_b = np.random.uniform(
                    -0.1, 0.1, size=candidate_b.shape).astype('float32')
                fluid.global_scope().find_var(
                    candidate_b_name).get_tensor().set(candidate_b, place)

                gate_weight.append(gate_w)
                gate_bias.append(gate_b)
                candidate_weight.append(candidate_w)
                candidate_bias.append(candidate_b)

        # Random input of shape (seq_len, batch, hidden); lengths in
        # [seq_len // 2, seq_len) so padding is actually exercised.
        step_input_np = np.random.uniform(-0.1, 0.1,
                                          (self.seq_len, self.batch_size,
                                           self.hidden_size)).astype('float32')
        sequence_length_np = np.random.randint(
            self.seq_len // 2, self.seq_len,
            size=(self.batch_size)).astype('int64')

        out = exe.run(feed={
            'x': step_input_np,
            'sequence_length': sequence_length_np
        },
                      fetch_list=[rnn_out, last_hidden])

        api_rnn_out = out[0]
        api_last_hidden = out[1]

        # Reference implementation using the captured parameter copies.
        np_out = gru_np(step_input_np,
                        None,
                        self.hidden_size,
                        gate_weight,
                        gate_bias,
                        candidate_weight,
                        candidate_bias,
                        num_layers=self.num_layers,
                        batch_first=self.batch_first,
                        is_bidirect=self.is_bidirect,
                        sequence_length=sequence_length_np)

        self.assertTrue(np.allclose(api_rnn_out, np_out[0], rtol=1e-4, atol=0))

        self.assertTrue(
            np.allclose(api_last_hidden, np_out[1], rtol=1e-4, atol=0))
Ejemplo n.º 8
0
def train(logger, args):
    """Train the reading-comprehension model.

    Loads the vocabulary and BRC dataset, builds the rc_model graph with
    the configured optimizer, then trains for ``args.pass_num`` epochs
    with a ParallelExecutor, periodically validating on the dev set and
    saving persistables every ``args.save_interval`` epochs.
    """
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        # Pickle encoding differs between Python 2 and 3.
        if six.PY2:
            vocab = pickle.load(fin)
        else:
            vocab = pickle.load(fin, encoding='bytes')
        logger.info('vocab size is {} and embed dim is {}'.format(
            vocab.size(), vocab.embed_dim))
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.trainset, args.devset)  #brc_data
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Initialize the model...')

    # Device count drives how many mini-batches are fed per step.
    if not args.use_gpu:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    else:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()

    # build model
    main_program = fluid.Program()
    startup_prog = fluid.Program()
    if args.enable_ce:
        # Fixed seeds for reproducible continuous-evaluation runs.
        main_program.random_seed = args.random_seed
        startup_prog.random_seed = args.random_seed
    with fluid.program_guard(main_program, startup_prog):
        with fluid.unique_name.guard():
            avg_cost, s_probs, e_probs, match, feed_order = rc_model.rc_model(
                args.hidden_size, vocab, args)
            # clone from default main program and use it as the validation program
            inference_program = main_program.clone(for_test=True)

            # build optimizer
            if args.optim == 'sgd':
                optimizer = fluid.optimizer.SGD(
                    learning_rate=args.learning_rate)
            elif args.optim == 'adam':
                optimizer = fluid.optimizer.Adam(
                    learning_rate=args.learning_rate)
            elif args.optim == 'rprop':
                optimizer = fluid.optimizer.RMSPropOptimizer(
                    learning_rate=args.learning_rate)
            else:
                logger.error('Unsupported optimizer: {}'.format(args.optim))
                exit(-1)
            # Optional L2 regularization folded into the objective.
            if args.weight_decay > 0.0:
                obj_func = avg_cost + args.weight_decay * l2_loss(main_program)
                optimizer.minimize(obj_func)
            else:
                obj_func = avg_cost
                optimizer.minimize(obj_func)

            # initialize parameters
            place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
            exe = Executor(place)
            if args.load_dir:
                # Resume from a previous checkpoint.
                logger.info('load from {}'.format(args.load_dir))
                fluid.io.load_persistables(exe,
                                           args.load_dir,
                                           main_program=main_program)
            else:
                # Fresh start: run startup then load pretrained embeddings.
                exe.run(startup_prog)
                embedding_para = fluid.global_scope().find_var(
                    'embedding_para').get_tensor()
                embedding_para.set(vocab.embeddings.astype(np.float32), place)

            # prepare data
            feed_list = [
                main_program.global_block().var(var_name)
                for var_name in feed_order
            ]
            feeder = fluid.DataFeeder(feed_list, place)

            logger.info('Training the model...')
            parallel_executor = fluid.ParallelExecutor(
                main_program=main_program,
                use_cuda=bool(args.use_gpu),
                loss_name=avg_cost.name)
            print_para(main_program, parallel_executor, logger, args)

            for pass_id in range(1, args.pass_num + 1):
                pass_start_time = time.time()
                pad_id = vocab.get_id(vocab.pad_token)
                # Key step: build train_reader. CE mode disables shuffling,
                # presumably for reproducible benchmark runs — confirm.
                if args.enable_ce:
                    train_reader = lambda: brc_data.gen_mini_batches(
                        'train', args.batch_size, pad_id, shuffle=False)
                else:
                    train_reader = lambda: brc_data.gen_mini_batches(
                        'train', args.batch_size, pad_id, shuffle=True)
                train_reader = read_multiple(train_reader, dev_count)
                log_every_n_batch, n_batch_loss = args.log_interval, 0
                total_num, total_loss = 0, 0  # initialize pass counters
                for batch_id, batch_list in enumerate(train_reader(), 1):
                    feed_data = batch_reader(batch_list, args)
                    fetch_outs = parallel_executor.run(
                        feed=list(feeder.feed_parallel(feed_data, dev_count)),
                        fetch_list=[obj_func.name],
                        return_numpy=False)
                    cost_train = np.array(fetch_outs[0]).mean()
                    total_num += args.batch_size * dev_count
                    n_batch_loss += cost_train
                    total_loss += cost_train * args.batch_size * dev_count

                    # CE runs are truncated to 100 batches.
                    if args.enable_ce and batch_id >= 100:
                        break
                    if log_every_n_batch > 0 and batch_id % log_every_n_batch == 0:
                        print_para(main_program, parallel_executor, logger,
                                   args)
                        logger.info(
                            'Average loss from batch {} to {} is {}'.format(
                                batch_id - log_every_n_batch + 1, batch_id,
                                "%.10f" % (n_batch_loss / log_every_n_batch)))
                        n_batch_loss = 0
                    # Mid-epoch validation at the configured interval.
                    if args.dev_interval > 0 and batch_id % args.dev_interval == 0:
                        if brc_data.dev_set is not None:
                            eval_loss, bleu_rouge = validation(
                                inference_program, avg_cost, s_probs, e_probs,
                                match, feed_order, place, dev_count, vocab,
                                brc_data, logger, args)
                            logger.info(
                                'Dev eval result: {}'.format(bleu_rouge))
                pass_end_time = time.time()
                time_consumed = pass_end_time - pass_start_time
                logger.info('epoch: {0}, epoch_time_cost: {1:.2f}'.format(
                    pass_id, time_consumed))
                logger.info(
                    'Evaluating the model after epoch {}'.format(pass_id))
                if brc_data.dev_set is not None:
                    eval_loss, bleu_rouge = validation(inference_program,
                                                       avg_cost, s_probs,
                                                       e_probs, match,
                                                       feed_order, place,
                                                       dev_count, vocab,
                                                       brc_data, logger, args)
                    logger.info('Dev eval result: {}'.format(bleu_rouge))
                else:
                    logger.warning(
                        'No dev set is loaded for evaluation in the dataset!')
                logger.info('total_num = %s' % total_num)
                logger.info('Average train loss for epoch {} is {}'.format(
                    pass_id, "%.10f" % (1.0 * total_loss / total_num)))

                # Checkpoint persistables every save_interval epochs.
                if pass_id % args.save_interval == 0:
                    model_path = os.path.join(args.save_dir, str(pass_id))
                    if not os.path.isdir(model_path):
                        os.makedirs(model_path)

                    fluid.io.save_persistables(executor=exe,
                                               dirname=model_path,
                                               main_program=main_program)
                if args.enable_ce:  # For CE
                    print("kpis\ttrain_cost_card%d\t%f" %
                          (dev_count, total_loss / total_num))
                    if brc_data.dev_set is not None:
                        print("kpis\ttest_cost_card%d\t%f" %
                              (dev_count, eval_loss))
                    print("kpis\ttrain_duration_card%d\t%f" %
                          (dev_count, time_consumed))
Ejemplo n.º 9
0
 def test_checkout_grad(self):
     """Gradient check on GPU, with a loose relative-error tolerance."""
     gpu = core.CUDAPlace(0)
     # Only run where float16 is actually supported.
     if not core.is_float16_supported(gpu):
         return
     self.check_grad_with_place(gpu, ['X'], 'Out', max_relative_error=0.8)
Ejemplo n.º 10
0
def train(args, data_reader=ctc_reader):
    """OCR CTC training.

    Builds the CTC recognition network, then either trains locally with a
    ParallelExecutor or joins a distributed pserver/trainer job depending
    on ``args.local``. Evaluates sequence error on the test set after
    every pass.

    Args:
        args: parsed CLI options (use_gpu, batch_size, pass_num, local,
            init_model, log_period, plus distributed-mode settings).
        data_reader: reader module exposing ``num_classes()``,
            ``data_shape()``, ``train()`` and ``test()``.
    """
    # None lets the reader fall back to its bundled default file lists.
    train_images = None
    train_list = None
    test_images = None
    test_list = None
    num_classes = data_reader.num_classes()
    data_shape = data_reader.data_shape()
    # define network
    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
    label = fluid.layers.data(name='label',
                              shape=[1],
                              dtype='int32',
                              lod_level=1)
    sum_cost, error_evaluator, inference_program, model_average = ctc_train_net(
        images, label, args, num_classes)

    # data reader
    train_reader = data_reader.train(args.batch_size,
                                     train_images_dir=train_images,
                                     train_list_file=train_list)
    test_reader = data_reader.test(args.batch_size,
                                   test_images_dir=test_images,
                                   test_list_file=test_list)

    # prepare environment
    place = fluid.CPUPlace()
    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    # load init model
    if args.init_model is not None:
        model_dir = args.init_model
        model_file_name = None
        if not os.path.isdir(args.init_model):
            # A file path was given: split into directory and file name.
            model_dir = os.path.dirname(args.init_model)
            model_file_name = os.path.basename(args.init_model)
        fluid.io.load_params(exe, dirname=model_dir, filename=model_file_name)
        # BUGFIX: was a Python 2 print statement (syntax error under Py3;
        # the rest of this function already uses print()).
        print("Init model from: %s." % args.init_model)

    fetch_vars = [sum_cost]
    fetch_vars.extend([e for e in error_evaluator])

    def test_parallel(exe, pass_id, batch_id):
        # ParallelExecutor-driven evaluator; currently unused (see the
        # commented-out call sites in train_parallel below).
        distance_evaluator = fluid.metrics.EditDistance(None)
        test_fetch = [v.name for v in error_evaluator]

        distance_evaluator.reset()
        for idx, data in enumerate(test_reader()):
            test_ret = exe.run(test_fetch, feed=get_feeder_data(data, place))
            distance_evaluator.update(distances=test_ret[0],
                                      seq_num=np.mean(test_ret[1]))
        return distance_evaluator.eval()

    def test(exe, pass_id):
        # One full pass over the test reader; returns the evaluator's
        # (avg distance, seq error) result.
        distance_evaluator = fluid.metrics.EditDistance(None)
        test_fetch = [v.name for v in error_evaluator]

        distance_evaluator.reset()
        for idx, data in enumerate(test_reader()):
            test_ret = exe.run(inference_program,
                               feed=get_feeder_data(data, place),
                               fetch_list=test_fetch)
            distance_evaluator.update(distances=test_ret[0],
                                      seq_num=np.mean(test_ret[1]))
        return distance_evaluator.eval()

    def train_parallel(train_exe):
        # Train args.pass_num passes with the given executor, logging every
        # log_period batches and testing after each pass.
        var_names = [var.name for var in fetch_vars]
        #test_exe = fluid.ParallelExecutor(
        #    use_cuda=True, main_program=inference_program, share_vars_from=train_exe)
        place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
        test_exe = fluid.Executor(place)

        for pass_id in range(args.pass_num):
            batch_id = 1
            total_loss = 0.0
            total_seq_error = 0.0
            # train a pass
            num_samples, start_time = 0, time.time()
            for idx, data in enumerate(train_reader()):
                batch_start_time = time.time()
                results = train_exe.run(var_names,
                                        feed=get_feeder_data(data, place))
                results = [np.array(result).sum() for result in results]
                total_loss += results[0]
                total_seq_error += results[1]
                # training log
                if batch_id % args.log_period == 0:
                    print(
                        "Pass[%d]-batch[%d]; Avg Warp-CTC loss: %s; Avg seq err: %s; Speed: %.5f samples/sec"
                        % (pass_id, batch_id, total_loss /
                           (batch_id * args.batch_size), total_seq_error /
                           (batch_id * args.batch_size), len(data) /
                           (time.time() - batch_start_time)))
                batch_id += 1
                num_samples += len(data)

            print_train_time(start_time, time.time(), num_samples)
            # run test — under averaged weights when model averaging is on
            if model_average:
                with model_average.apply(test_exe):
                    #test_ret = test_parallel(test_exe, pass_id, batch_id)
                    test_ret = test(test_exe, pass_id)
            else:
                #test_ret = test_parallel(test_exe, pass_id, batch_id)
                test_ret = test(test_exe, pass_id)
            print("Pass[%d]; Test avg seq error: %s\n" %
                  (pass_id, test_ret[1]))

    if args.local:
        # BUGFIX: the condition was inverted (CPUPlace when use_gpu was
        # set); now consistent with the distributed branch below.
        place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
        startup_exe = fluid.Executor(place)
        startup_exe.run(fluid.default_startup_program())
        exec_strategy = ExecutionStrategy()
        exec_strategy.use_cuda = args.use_gpu
        train_exe = fluid.ParallelExecutor(
            use_cuda=args.use_gpu,
            main_program=fluid.default_main_program(),
            loss_name=sum_cost.name,
            exec_strategy=exec_strategy)
        train_parallel(train_exe)
    else:
        # Distributed mode: cluster topology comes from the environment.
        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")
        trainers = int(os.getenv("PADDLE_TRAINERS"))
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)
        # the IP of the local machine, needed by pserver only
        current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
        # the role, should be either PSERVER or TRAINER
        training_role = os.getenv("PADDLE_TRAINING_ROLE")
        t = distribute_transpiler.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
        if training_role == "PSERVER":
            pserver_program = t.get_pserver_program(current_endpoint)
            pserver_startup_program = t.get_startup_program(
                current_endpoint, pserver_program)
            exe = fluid.Executor(core.CPUPlace())
            exe.run(pserver_startup_program)
            exe.run(pserver_program)
        elif training_role == "TRAINER":
            # NOTE(review): reuses the `exe` created earlier on `place` —
            # confirm that is intended rather than a fresh executor.
            exe.run(fluid.default_startup_program())
            trainer_program = t.get_trainer_program()
            exec_strategy = ExecutionStrategy()
            exec_strategy.use_cuda = args.use_gpu
            exec_strategy.num_threads = 1
            train_exe = fluid.ParallelExecutor(use_cuda=args.use_gpu,
                                               main_program=trainer_program,
                                               loss_name=sum_cost.name,
                                               exec_strategy=exec_strategy)
            train_parallel(train_exe)
        else:
            # BUGFIX: message previously misspelled TRAINER as "TRIANER".
            raise ValueError(
                "env PADDLE_TRAINING_ROLE should be in [PSERVER, TRAINER]")
Ejemplo n.º 11
0
 def test_check_output(self):
     """Check dropout output on GPU when the op supports running there."""
     cuda_ok = core.is_compiled_with_cuda() and core.op_support_gpu("dropout")
     if cuda_ok:
         self.check_output_with_place(core.CUDAPlace(0), atol=1e-3)
Ejemplo n.º 12
0
 def test_cuda_place(self):
     """Exercise momentum and SGD steps on GPU 0, if CUDA is built in."""
     if core.is_compiled_with_cuda():
         gpu = core.CUDAPlace(0)
         self.check_momentum_step(gpu)
         self.check_sgd_step(gpu)
Ejemplo n.º 13
0
 def test_sparse_sgd(self):
     """Run the sparse SGD check on CPU and, when available, on GPU."""
     candidate_places = [core.CPUPlace()]
     if core.is_compiled_with_cuda():
         candidate_places.append(core.CUDAPlace(0))
     for target in candidate_places:
         self.check_with_place(target)
 def setUp(self):
     """Pin the test to GPU 0 and give it a fresh scope."""
     self.place = core.CUDAPlace(0)
     self.scope = core.Scope()
Ejemplo n.º 15
0
 def test_check_output(self):
     """Check output on the cuDNN place if available, otherwise CPU."""
     # TODO(wangzhongpu): support mkldnn op in dygraph mode
     if self.has_cudnn():
         target = core.CUDAPlace(0)
     else:
         target = core.CPUPlace()
     # Dygraph check is skipped whenever MKL-DNN is enabled.
     self.check_output_with_place(target,
                                  atol=1e-5,
                                  check_dygraph=(self.use_mkldnn == False))
Ejemplo n.º 16
0
 def test_check_output(self):
     """Check op output on GPU; silently skipped when CUDA is absent."""
     if not self.has_cuda():
         return
     self.check_output_with_place(core.CUDAPlace(0), atol=1e-5)
Ejemplo n.º 17
0
 def test_check_output(self):
     """Check output on the cuDNN place when available, else on CPU."""
     if self.has_cudnn():
         target = core.CUDAPlace(0)
     else:
         target = core.CPUPlace()
     self.check_output_with_place(target, atol=1e-5)
Ejemplo n.º 18
0
    def test_run(self):
        """Cross-check basic_lstm against an LSTMCell-driven dynamic_rnn.

        Builds both graphs over the same input, forces the two parameter
        sets to identical random values, runs them in one executor call
        and asserts the outputs are numerically close.
        """
        inputs_basic_lstm = fluid.data(
            name='inputs_basic_lstm',
            shape=[None, None, self.input_size],
            dtype='float32')
        sequence_length = fluid.data(
            name="sequence_length", shape=[None], dtype='int64')

        # dynamic_rnn expects time-major input, so transpose batch<->time.
        inputs_dynamic_rnn = layers.transpose(inputs_basic_lstm, perm=[1, 0, 2])
        cell = LSTMCell(self.hidden_size, name="LSTMCell_for_rnn")
        output, final_state = dynamic_rnn(
            cell=cell,
            inputs=inputs_dynamic_rnn,
            sequence_length=sequence_length,
            is_reverse=False)
        # Transpose back so both outputs share the same layout.
        output_new = layers.transpose(output, perm=[1, 0, 2])

        rnn_out, last_hidden, last_cell = basic_lstm(inputs_basic_lstm, None, None, self.hidden_size, num_layers=1, \
                batch_first = False, bidirectional=False, sequence_length=sequence_length, forget_bias = 1.0)

        if core.is_compiled_with_cuda():
            place = core.CUDAPlace(0)
        else:
            place = core.CPUPlace()
        exe = Executor(place)
        exe.run(framework.default_startup_program())

        inputs_basic_lstm_np = np.random.uniform(
            -0.1, 0.1,
            (self.seq_len, self.batch_size, self.input_size)).astype('float32')
        # All sequences use the full length, so masking is not exercised.
        sequence_length_np = np.ones(
            self.batch_size, dtype='int64') * self.seq_len

        # NOTE(review): inputs/pre_hidden/pre_cell feeds presumably match
        # data vars created elsewhere (LSTMCell internals?) — confirm.
        inputs_np = np.random.uniform(
            -0.1, 0.1, (self.batch_size, self.input_size)).astype('float32')
        pre_hidden_np = np.random.uniform(
            -0.1, 0.1, (self.batch_size, self.hidden_size)).astype('float32')
        pre_cell_np = np.random.uniform(
            -0.1, 0.1, (self.batch_size, self.hidden_size)).astype('float32')

        # Pairs of (dynamic_rnn param, basic_lstm param) to keep in sync.
        param_names = [[
            "LSTMCell_for_rnn/BasicLSTMUnit_0.w_0",
            "basic_lstm_layers_0/BasicLSTMUnit_0.w_0"
        ], [
            "LSTMCell_for_rnn/BasicLSTMUnit_0.b_0",
            "basic_lstm_layers_0/BasicLSTMUnit_0.b_0"
        ]]

        for names in param_names:
            # Read only to learn the shape, then write the same random
            # values into both parameter tensors.
            param = np.array(fluid.global_scope().find_var(names[0]).get_tensor(
            ))
            param = np.random.uniform(
                -0.1, 0.1, size=param.shape).astype('float32')
            fluid.global_scope().find_var(names[0]).get_tensor().set(param,
                                                                     place)
            fluid.global_scope().find_var(names[1]).get_tensor().set(param,
                                                                     place)

        out = exe.run(feed={
            'inputs_basic_lstm': inputs_basic_lstm_np,
            'sequence_length': sequence_length_np,
            'inputs': inputs_np,
            'pre_hidden': pre_hidden_np,
            'pre_cell': pre_cell_np
        },
                      fetch_list=[output_new, rnn_out])

        self.assertTrue(np.allclose(out[0], out[1], rtol=1e-4))
Ejemplo n.º 19
0
def train_parallel(train_args, test_args, args, train_prog, test_prog,
                   startup_prog, nccl_id_var, num_trainers, trainer_id):
    """Run multi-device training of ``train_prog`` with a ParallelExecutor.

    Args:
        train_args: list where [0] is the avg-loss Variable, [2] a list of
            accuracy Variables, [3] a train-reader factory and [4] a
            py-reader handle (positions used below).
        test_args: mirror of ``train_args`` for the test program.
        args: parsed CLI namespace (device, gpus, batch_size, iterations,
            use_reader_op, use_fake_data, update_method, ...).
        train_prog / test_prog / startup_prog: fluid Programs.
        nccl_id_var: NCCL-id Variable for multi-trainer runs (may be None).
        num_trainers / trainer_id: distributed topology; forced to a single
            trainer under pserver mode.
    """
    over_all_start = time.time()
    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
    feeder = None
    # Initialize so the fake-data branch below cannot hit a NameError when
    # args.use_reader_op is set (then there is nothing to feed from Python).
    feed_var_list = []
    if not args.use_reader_op:
        feed_var_list = [
            var for var in train_prog.global_block().vars.values()
            if var.is_data
        ]
        feeder = fluid.DataFeeder(feed_var_list, place)
    # generate fake: constant-filled persistable tensors replace real input.
    if args.use_fake_data:
        for var in feed_var_list:
            v = startup_prog.global_block()._clone_variable(var)
            var.persistable = True
            v.persistable = True

            real_shape = list(var.shape)
            # Integer division: a tensor shape entry must be an int, and the
            # global batch is split evenly across the GPUs.
            real_shape[0] = args.batch_size // args.gpus
            startup_prog.global_block().append_op(outputs={"Out": v},
                                                  type="fill_constant",
                                                  attrs={
                                                      "shape": real_shape,
                                                      "value": 1.0,
                                                      "dtype": var.dtype
                                                  })

    if nccl_id_var and trainer_id == 0:
        #FIXME(wuyi): wait other trainer to start listening
        time.sleep(30)

    startup_exe = fluid.Executor(place)
    startup_exe.run(startup_prog)
    strategy = fluid.ExecutionStrategy()
    strategy.num_threads = args.cpus
    strategy.allow_op_delay = False
    build_strategy = fluid.BuildStrategy()
    if args.reduce_strategy == "reduce":
        build_strategy.reduce_strategy = fluid.BuildStrategy(
        ).ReduceStrategy.Reduce
    else:
        build_strategy.reduce_strategy = fluid.BuildStrategy(
        ).ReduceStrategy.AllReduce
    build_strategy.fuse_broadcast_op = args.fuse_broadcast_op

    avg_loss = train_args[0]

    if args.update_method == "pserver":
        # parameter server mode distributed training, merge
        # gradients on local server, do not initialize
        # ParallelExecutor with multi server all-reduce mode.
        num_trainers = 1
        trainer_id = 0

    exe = fluid.ParallelExecutor(True,
                                 avg_loss.name,
                                 main_program=train_prog,
                                 exec_strategy=strategy,
                                 build_strategy=build_strategy,
                                 num_trainers=num_trainers,
                                 trainer_id=trainer_id)

    if not args.no_test:
        if args.update_method == "pserver":
            test_scope = None
        else:
            # NOTE: use an empty scope to avoid test exe using NCCLID
            test_scope = fluid.Scope()
        test_exe = fluid.ParallelExecutor(True,
                                          main_program=test_prog,
                                          share_vars_from=exe)

    # The fetch list never changes across iterations, so build it once.
    fetch_list = [avg_loss.name] + [v.name for v in train_args[2]]

    for pass_id in range(args.pass_num):
        num_samples = 0
        iters = 0
        start_time = time.time()
        if not args.use_reader_op:
            reader_generator = train_args[3]()  #train_reader
        batch_id = 0
        data = None
        if args.use_reader_op:
            train_args[4].start()
        while True:
            if not args.use_reader_op:
                data = next(reader_generator, None)
                if data is None:
                    break
            if args.profile and batch_id == 5:
                profiler.start_profiler("All")
                profiler.reset_profiler()
            elif args.profile and batch_id == 10:
                print("profiling total time: ", time.time() - start_time)
                profiler.stop_profiler(
                    "total", "/tmp/profile_%d_pass%d" % (trainer_id, pass_id))
            if iters == args.iterations:
                # The Python generator only exists on the feed path; the
                # py-reader path is reset after the loop instead.
                if not args.use_reader_op:
                    reader_generator.close()
                break

            if iters == args.skip_batch_num:
                # Restart the clock after warm-up batches.
                start_time = time.time()
                num_samples = 0

            if args.use_fake_data or args.use_reader_op:
                try:
                    fetch_ret = exe.run(fetch_list)
                except fluid.core.EOFException:
                    break
                except fluid.core.EnforceNotMet:
                    traceback.print_exc()
                    break
            else:
                fetch_ret = exe.run(fetch_list, feed=feeder.feed(data))
            if args.use_reader_op:
                num_samples += args.batch_size * args.gpus
            else:
                num_samples += len(data)

            iters += 1
            # batch_id % 1 is always 0: log every batch (knob kept for tuning).
            if batch_id % 1 == 0:
                fetched_data = [np.mean(np.array(d)) for d in fetch_ret]
                print("Pass %d, batch %d, loss %s, accucacys: %s" %
                      (pass_id, batch_id, fetched_data[0], fetched_data[1:]))
            batch_id += 1

        print_train_time(start_time, time.time(), num_samples)
        if args.use_reader_op:
            train_args[4].reset()  # reset reader handle
        else:
            del reader_generator

        if not args.no_test and test_args[2]:
            test_feeder = None
            if not args.use_reader_op:
                test_feed_var_list = [
                    var for var in test_prog.global_block().vars.values()
                    if var.is_data
                ]
                test_feeder = fluid.DataFeeder(test_feed_var_list, place)
            test_ret = test_parallel(test_exe, test_args, args, test_prog,
                                     test_feeder)
            print("Pass: %d, Test Accuracy: %s\n" %
                  (pass_id, [np.mean(np.array(v)) for v in test_ret]))

    print("total train time: ", time.time() - over_all_start)
Ejemplo n.º 20
0
 def test_check_output(self):
     """Verify op outputs on the first GPU; silently skip without CUDA."""
     if not core.is_compiled_with_cuda():
         return
     self.check_output_with_place(core.CUDAPlace(0), atol=1e-3)
Ejemplo n.º 21
0
    def check_forward_backward(self,
                               shape,
                               begin_norm_axis,
                               has_scale=True,
                               has_bias=True,
                               y_grad_scale=1.0,
                               use_mkldnn=False):
        """Compare layer_norm forward/backward against NumPy references.

        Builds a standalone Program containing one ``layer_norm`` op plus its
        auto-generated grad op, runs it on every eligible place, and asserts
        outputs and gradients match ``_reference_layer_norm_naive`` /
        ``_reference_layer_norm_grad``.

        Args:
            shape: input tensor shape.
            begin_norm_axis: axis from which normalization is applied.
            has_scale / has_bias: whether the optional Scale/Bias inputs exist.
            y_grad_scale: multiplier for the random upstream gradient.
            use_mkldnn: forwarded to the op's ``use_mkldnn`` attribute.
        """
        def test_with_place(place,
                            shape,
                            begin_norm_axis,
                            use_mkldnn=use_mkldnn):
            # attr
            epsilon = 0.00001
            x_shape = shape
            # D = product of the normalized dimensions (size of Scale/Bias).
            D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
            scale_shape = [D]

            np.random.seed(123)
            x = np.random.random_sample(x_shape).astype(np.float32)
            scale = np.random.random_sample(scale_shape).astype(
                np.float32) if has_scale else None
            bias = np.random.random_sample(scale_shape).astype(
                np.float32) if has_bias else None
            y_grad = (np.random.random_sample(x_shape) * y_grad_scale).astype(
                np.float32)

            # reference forward & backward
            y, mean, variance = _reference_layer_norm_naive(
                x, scale, bias, epsilon, begin_norm_axis)
            x_grad, scale_grad, bias_grad = _reference_layer_norm_grad(
                x, y_grad, scale, bias, mean, variance, begin_norm_axis)

            var_dict = locals()
            var_dict['y@GRAD'] = y_grad
            var_names = ['x', 'mean', 'variance', 'y', 'y@GRAD']
            if has_scale:
                var_names += ['scale']
            if has_bias:
                var_names += ['bias']
            ground_truth = {name: var_dict[name] for name in var_names}

            program = fluid.Program()
            with fluid.program_guard(program):
                block = program.global_block()
                for name in ground_truth:
                    block.create_var(name=name,
                                     dtype='float32',
                                     shape=ground_truth[name].shape)
                inputs = {"X": block.var('x')}
                fetch_list = [
                    'y',
                    'mean',
                    'variance',
                    'x@GRAD',
                ]
                if has_scale:
                    inputs["Scale"] = block.var('scale')
                    fetch_list += ['scale@GRAD']
                if has_bias:
                    inputs["Bias"] = block.var('bias')
                    fetch_list += ['bias@GRAD']
                layer_norm_op = block.append_op(
                    type="layer_norm",
                    inputs=inputs,
                    outputs={
                        "Y": block.var('y'),
                        "Mean": block.var('mean'),  # share the same memory
                        "Variance":
                        block.var('variance'),  # share the same memory
                    },
                    attrs={
                        "epsilon": epsilon,
                        "begin_norm_axis": begin_norm_axis,
                        "use_mkldnn": use_mkldnn
                    })
                # generate backward op_desc
                grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
                    layer_norm_op.desc, set(), [])
                grad_op_desc = grad_op_desc_list[0]
                new_op_desc = block.desc.append_op()
                new_op_desc.copy_from(grad_op_desc)
                for var_name in grad_op_desc.output_arg_names():
                    block.desc.var(var_name.encode("ascii"))
                grad_op_desc.infer_var_type(block.desc)
                grad_op_desc.infer_shape(block.desc)
                for arg in grad_op_desc.output_arg_names():
                    grad_var = block.desc.find_var(arg.encode("ascii"))
                    grad_var.set_dtype(core.VarDesc.VarType.FP32)

                program._sync_with_cpp()
                exe = fluid.Executor(place)
                # Feed only the variables that actually exist in this program:
                # 'scale'/'bias' are None and were never created when
                # has_scale/has_bias is False, so feeding them would fail.
                feed_names = ['x', 'y@GRAD']
                if has_scale:
                    feed_names.append('scale')
                if has_bias:
                    feed_names.append('bias')
                out = exe.run(program,
                              feed={
                                  name: var_dict[name]
                                  for name in feed_names
                              },
                              fetch_list=fetch_list)
                self.__assert_close(y, out[0], "y")
                self.__assert_close(mean, out[1], "mean")
                self.__assert_close(variance, out[2], "variance", 1e-3)
                self.__assert_close(x_grad, out[3], "x_grad")
                if has_scale:
                    self.__assert_close(scale_grad,
                                        out[fetch_list.index('scale@GRAD')],
                                        "scale_grad", 1e-3)
                if has_bias:
                    self.__assert_close(bias_grad,
                                        out[fetch_list.index('bias@GRAD')],
                                        "bias_grad")

        places = [core.CPUPlace()]
        if core.is_compiled_with_cuda() and core.op_support_gpu(
                "layer_norm") and self.use_cudnn:
            places.append(core.CUDAPlace(0))

        for place in places:
            test_with_place(place, shape, begin_norm_axis)
Ejemplo n.º 22
0
 def test_check_grad(self):
     """In-place gradient check for 'Updates' on GPU; no-op without CUDA."""
     if not core.is_compiled_with_cuda():
         return
     gpu_place = core.CUDAPlace(0)
     self.check_grad_with_place(gpu_place, ['Updates'], 'Out', in_place=True)
Ejemplo n.º 23
0
 def test_check_grad_normal(self):
     """Gradient check w.r.t. both X and Y on the first GPU."""
     gpu = core.CUDAPlace(0)
     self.check_grad_with_place(gpu, ['X', 'Y'], 'Out', check_eager=False)
 def test_check_grad(self):
     """Gradient check of Loss w.r.t. Logits on the first GPU."""
     gpu = core.CUDAPlace(0)
     self.check_grad_with_place(gpu, ["Logits"], "Loss")
Ejemplo n.º 25
0
 def test_check_output(self):
     """GPU output check (atol 1e-5) when cuDNN is enabled, default otherwise."""
     if not self.use_cudnn:
         self.check_output()
         return
     self.check_output_with_place(core.CUDAPlace(0), atol=1e-5)
 def test_check_output(self):
     """Output check on the first GPU with a loose absolute tolerance."""
     gpu = core.CUDAPlace(0)
     self.check_output_with_place(gpu, atol=5e-2)
Ejemplo n.º 27
0
    def test_slice(self):
        """Exercise slicing on CPU first, then on GPU when CUDA is built in."""
        targets = [fluid.CPUPlace()]
        if core.is_compiled_with_cuda():
            targets.append(core.CUDAPlace(0))
        for place in targets:
            self._test_slice(place)
 def test_check_grad(self):
     """Loose-tolerance gradient check of Loss w.r.t. Logits on GPU."""
     gpu = core.CUDAPlace(0)
     self.check_grad_with_place(gpu, ["Logits"], "Loss",
                                numeric_grad_delta=6e-1,
                                max_relative_error=6e-1)
Ejemplo n.º 29
0
 def test_check_grad_ignore_uv(self):
     """Gradient check for X only; skipped unless fp16 works on this GPU."""
     gpu = core.CUDAPlace(0)
     if not core.is_float16_supported(gpu):
         return
     self.check_grad_with_place(gpu, ['X'], 'Out')
Ejemplo n.º 30
0
 def test_check_grad(self):
     """fp16 gradient check for x0 with a relaxed relative-error bound."""
     gpu = core.CUDAPlace(0)
     if core.is_float16_supported(gpu):
         self.check_grad(['x0'], 'Out', max_relative_error=0.15)