Beispiel #1
0
 def train_loop(exe, train_program, trainer_id):
     """Run ``conf.num_passes`` training passes, each under the CPU profiler.

     Every pass resets the pass-level accuracy evaluator, feeds each batch
     from ``train_reader()`` to ``exe``, logs progress every
     ``conf.log_period`` batches (batch 0 is never logged), and afterwards
     evaluates on the test set with ``test(exe)``.

     Args:
         exe: executor that runs ``train_program``; also handed to ``test``.
         train_program: program passed to ``exe.run`` for every batch.
         trainer_id: integer used as suffix of the profiler output path
             ``./profile_res_<trainer_id>``.
     """
     batch_log_format = (
         "Pass id: %d, batch id: %d, cost: %f, pass_acc: %f, speed: %f, time: %f"
     )
     accumulated_time = 0.
     for pass_id in xrange(conf.num_passes):
         train_pass_acc_evaluator.reset()
         pass_began = time.time()
         samples_seen = 0
         profile_file = './profile_res_%d' % trainer_id
         with profiler.profiler("CPU", 'total', profile_path=profile_file):
             for batch_id, data in enumerate(train_reader()):
                 batch_began = time.time()
                 cost_val, acc_val, size_val = exe.run(
                     train_program,
                     feed=feeder.feed(data),
                     fetch_list=[avg_cost, batch_acc_var, batch_size_var])
                 train_pass_acc_evaluator.add(value=acc_val, weight=size_val)
                 samples_seen += float(size_val)

                 # Only log on multiples of the log period, and never on the
                 # very first batch.
                 if batch_id == 0 or batch_id % conf.log_period != 0:
                     continue
                 running_acc = train_pass_acc_evaluator.eval()
                 batch_speed = float(size_val) / (time.time() - batch_began)
                 batch_elapsed = time.time() - batch_began
                 print(batch_log_format % (pass_id, batch_id, cost_val,
                                           running_acc, batch_speed,
                                           batch_elapsed))

         # The pass duration is taken before the test run, so evaluation time
         # is not counted as training time.
         pass_ended = time.time()
         pass_duration = pass_ended - pass_began
         accumulated_time += pass_duration
         pass_test_acc = test(exe)
         print("Pass id: %d, test_acc: %f, speed: %f" %
               (pass_id, pass_test_acc, samples_seen / pass_duration))
     print("Total train time: %f" % (accumulated_time))
    def _run_test_impl_(self,
                        callback,
                        feed,
                        fetch,
                        place,
                        use_parallel=False,
                        use_nccl=False,
                        use_gpu=False):
        """
        Build a fresh program from ``callback``, run it once under the
        profiler and return the fetched values.

        Args:
            callback(callable): called with no arguments; must return a
                generator that first yields the input variable(s) and, after
                receiving them back through ``send``, yields the loss.
            feed: feed data forwarded unchanged to ``Executor.run``.
            fetch(basestring|list): fetch target(s); a single string is
                wrapped into a one-element list.
            place(Place): the computation place.
            use_parallel(bool): whether to wrap the network in ``ParallelDo``.
            use_nccl(bool): forwarded to ``ParallelDo`` when ``use_parallel``
                is set.
            use_gpu(bool): selects the ``'GPU'`` profiler state instead of
                ``'CPU'``.

        Returns:
            The value returned by ``Executor.run`` for the fetch list.

        """
        fetch_list = [fetch] if isinstance(fetch, basestring) else fetch

        main_prog = fluid.Program()
        startup_prog = fluid.Program()
        # Identical seeds on both programs keep runs reproducible.
        main_prog.random_seed = 10
        startup_prog.random_seed = 10

        with fluid.program_guard(main_prog, startup_prog):
            generator = callback()
            if not use_parallel:
                inputs = next(generator)
                loss = generator.send(inputs)
            else:
                # Wrap the user network in a ParallelDo block automatically.
                places = fluid.layers.get_places()
                pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
                inputs = next(generator)
                if isinstance(inputs, fluid.Variable):
                    inputs = [inputs]

                with pd.do():
                    parallel_inputs = [pd.read_input(var) for var in inputs]
                    # A lone input is passed back bare rather than as a list.
                    if len(parallel_inputs) == 1:
                        parallel_inputs = parallel_inputs[0]
                    loss = generator.send(parallel_inputs)  # patch input
                    pd.write_output(loss)

                loss = pd()
            self.assertIsNotNone(loss)
            avg_loss = fluid.layers.mean(loss)
            fluid.backward.append_backward(loss=avg_loss)

        exe = fluid.Executor(place)
        exe.run(startup_prog)
        profile_type = 'GPU' if use_gpu else 'CPU'
        with profiler.profiler(profile_type, 'total', '/tmp/profiler'):
            return exe.run(main_prog, feed=feed, fetch_list=fetch_list)
Beispiel #3
0
    def net_profiler(self, state, profile_path='/tmp/profile'):
        """Build a small network with a While block and profile ten runs of it.

        Returns immediately, without building anything, when ``state`` is
        ``'GPU'`` or ``'All'`` but the core library was not compiled with CUDA.

        Args:
            state: profiler state forwarded to ``profiler.profiler``. ``'CPU'``
                selects ``fluid.CPUPlace()``; any other value selects
                ``fluid.CUDAPlace(0)``.
            profile_path: output path forwarded to ``profiler.profiler``.
        """
        needs_cuda = state in ('GPU', 'All')
        if needs_cuda and not core.is_compiled_with_cuda():
            return

        startup_program = fluid.Program()
        main_program = fluid.Program()

        with fluid.program_guard(main_program, startup_program):
            image = fluid.layers.data(name='x', shape=[784], dtype='float32')
            first_hidden = fluid.layers.fc(input=image, size=64, act='relu')
            slot = layers.zeros(shape=[1], dtype='int64')
            counter = fluid.layers.zeros(
                shape=[1], dtype='int64', force_cpu=True)
            limit = layers.fill_constant([1], dtype='int64', value=10)
            data_arr = layers.array_write(first_hidden, slot)
            keep_going = fluid.layers.less_than(x=counter, y=limit)
            loop = fluid.layers.While(cond=keep_going)
            with loop.block():
                looped_hidden = fluid.layers.fc(
                    input=first_hidden, size=64, act='relu')
                layers.array_write(looped_hidden, slot, data_arr)
                fluid.layers.increment(x=counter, value=1, in_place=True)
                # Refresh the loop condition after bumping the counter.
                layers.less_than(x=counter, y=limit, cond=keep_going)

            looped_hidden = layers.array_read(data_arr, slot)
            second_hidden = fluid.layers.fc(
                input=looped_hidden, size=64, act='relu')
            predict = fluid.layers.fc(
                input=second_hidden, size=10, act='softmax')
            label = fluid.layers.data(name='y', shape=[1], dtype='int64')
            cost = fluid.layers.cross_entropy(input=predict, label=label)
            avg_cost = fluid.layers.mean(cost)
            batch_size = fluid.layers.create_tensor(dtype='int64')
            batch_acc = fluid.layers.accuracy(
                input=predict, label=label, total=batch_size)

        optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
        optimizer.minimize(avg_cost, startup_program=startup_program)

        if state == 'CPU':
            place = fluid.CPUPlace()
        else:
            place = fluid.CUDAPlace(0)
        exe = fluid.Executor(place)
        exe.run(startup_program)

        pass_acc_calculator = fluid.average.WeightedAverage()
        with profiler.profiler(state, 'total', profile_path):
            for step in range(10):
                # Discard the statistics gathered during the first two steps.
                if step == 2:
                    profiler.reset_profiler()
                x = np.random.random((32, 784)).astype("float32")
                y = np.random.randint(0, 10, (32, 1)).astype("int64")

                fetched = exe.run(
                    main_program,
                    feed={'x': x, 'y': y},
                    fetch_list=[avg_cost, batch_acc, batch_size])
                step_acc = np.array(fetched[1])
                step_size = np.array(fetched[2])
                pass_acc_calculator.add(value=step_acc, weight=step_size)
                pass_acc_calculator.eval()
Beispiel #4
0
    def train_loop(exe, trainer_prog, reader):
        """Train for ``epoch_num`` epochs, profiling every epoch.

        Runs the default startup program once, then for each epoch feeds
        every batch produced by ``reader()`` to ``trainer_prog`` and reports
        the perplexity every 100 steps.

        All output goes through ``print(...)`` with a single pre-formatted
        string, so the function parses and prints identically on Python 2
        and Python 3.

        Args:
            exe: executor used for the startup program and every batch.
            trainer_prog: program passed to ``exe.run`` for every batch.
            reader: callable returning an iterable of batches; each sample in
                a batch is indexed with ``[0]`` (source sequence) and ``[1]``
                (destination sequence).
        """
        exe.run(fluid.default_startup_program())

        for epoch_id in range(epoch_num):
            print("epoch_%d start" % epoch_id)
            pass_start = time.time()
            step = 0
            with profiler.profiler(
                    "All", 'total',
                    profile_path="/usr/local/nvidia/lib64/tmp"):
                for data in reader():
                    step += 1
                    # Materialise real lists so the helper receives the same
                    # kind of sequence regardless of the Python version.
                    lod_src_wordseq = to_lodtensor(
                        [sample[0] for sample in data], place)
                    lod_dst_wordseq = to_lodtensor(
                        [sample[1] for sample in data], place)
                    ret_average_cost = exe.run(
                        trainer_prog,
                        feed={
                            "src_wordseq": lod_src_wordseq,
                            "dst_wordseq": lod_dst_wordseq
                        },
                        fetch_list=[average_cost])
                    # Perplexity is exp() of the first fetched value.
                    average_ppl = math.exp(ret_average_cost[0])
                    if step % 100 == 0:
                        print("step %d ppl: %.3f" % (step, average_ppl))
            print("total steps: %d" % step)
            spent = time.time() - pass_start
            # NOTE(review): 42068 is presumably the number of training
            # samples per epoch -- confirm against the dataset.
            print("Pass %d end, spent: %f, speed: %f" %
                  (epoch_id, spent, 42068 / spent))
Beispiel #5
0
    def net_profiler(self, state, profile_path='/tmp/profile'):
        """Profile ten executions of a small fully-connected net with a loop.

        Nothing is built or run when a GPU state (``'GPU'`` or ``'All'``) is
        requested but the core library lacks CUDA support.

        Args:
            state: profiler state handed to ``profiler.profiler``. The
                executor runs on ``fluid.CPUPlace()`` for ``'CPU'`` and on
                ``fluid.CUDAPlace(0)`` for every other value.
            profile_path: output path handed to ``profiler.profiler``.
        """
        gpu_requested = state in ('GPU', 'All')
        if gpu_requested and not core.is_compiled_with_cuda():
            return

        startup_program = fluid.Program()
        main_program = fluid.Program()

        with fluid.program_guard(main_program, startup_program):
            image = fluid.layers.data(name='x', shape=[784], dtype='float32')
            fc_in = fluid.layers.fc(input=image, size=64, act='relu')
            index = layers.zeros(shape=[1], dtype='int64')
            counter = fluid.layers.zeros(
                shape=[1], dtype='int64', force_cpu=True)
            bound = layers.fill_constant([1], dtype='int64', value=10)
            data_arr = layers.array_write(fc_in, index)
            cond = fluid.layers.less_than(x=counter, y=bound)
            while_op = fluid.layers.While(cond=cond)
            with while_op.block():
                fc_loop = fluid.layers.fc(input=fc_in, size=64, act='relu')
                layers.array_write(fc_loop, index, data_arr)
                fluid.layers.increment(x=counter, value=1, in_place=True)
                # Re-evaluate the condition in place for the next iteration.
                layers.less_than(x=counter, y=bound, cond=cond)

            fc_loop = layers.array_read(data_arr, index)
            fc_out = fluid.layers.fc(input=fc_loop, size=64, act='relu')
            predict = fluid.layers.fc(input=fc_out, size=10, act='softmax')
            label = fluid.layers.data(name='y', shape=[1], dtype='int64')
            cost = fluid.layers.cross_entropy(input=predict, label=label)
            avg_cost = fluid.layers.mean(cost)
            batch_size = fluid.layers.create_tensor(dtype='int64')
            batch_acc = fluid.layers.accuracy(
                input=predict, label=label, total=batch_size)

        optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
        optimizer.minimize(avg_cost, startup_program=startup_program)

        place = fluid.CUDAPlace(0) if state != 'CPU' else fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup_program)

        pass_acc_calculator = fluid.average.WeightedAverage()
        with profiler.profiler(state, 'total', profile_path):
            for iteration in range(10):
                # Drop what was recorded during the first two iterations.
                if iteration == 2:
                    profiler.reset_profiler()
                x = np.random.random((32, 784)).astype("float32")
                y = np.random.randint(0, 10, (32, 1)).astype("int64")

                results = exe.run(
                    main_program,
                    feed={'x': x, 'y': y},
                    fetch_list=[avg_cost, batch_acc, batch_size])
                acc = np.array(results[1])
                b_size = np.array(results[2])
                pass_acc_calculator.add(value=acc, weight=b_size)
                pass_acc_calculator.eval()
def profile_context(profile=True):
    """Yield exactly once, inside the profiler when ``profile`` is truthy.

    With ``profile`` enabled the single ``yield`` happens inside
    ``profiler.profiler('All', 'total', './profile_dir/profile_file_tmp')``;
    otherwise it happens with no profiler active.
    """
    if not profile:
        yield
        return
    with profiler.profiler('All', 'total', './profile_dir/profile_file_tmp'):
        yield
Beispiel #7
0
    def train_loop(exe, trainer_prog):
        """Train for ``args.num_passes`` passes with an optional profiled run.

        When ``args.profile`` is set and ``args.task_index`` is 0, each pass
        first runs six warm-up batches and then six more batches under the
        profiler before the regular training loop starts.

        Args:
            exe: executor that runs ``trainer_prog``; also handed to ``test``.
            trainer_prog: program passed to ``exe.run`` for every batch.
        """

        def run_step(batch_id, data):
            """Feed one batch and return ``(loss, acc, b_size)``."""
            img_data = np.array(
                [sample[0].reshape(data_shape)
                 for sample in data]).astype("float32")
            y_data = np.array([sample[1] for sample in data]).astype("int64")
            y_data = y_data.reshape([-1, 1])

            loss, acc, b_size = exe.run(
                trainer_prog,
                feed={"pixel": img_data,
                      "label": y_data},
                fetch_list=[avg_cost, batch_acc, batch_size])
            return loss, acc, b_size

        def run_leading_batches():
            """Run the batches with index 0..5 of a fresh reader."""
            for batch_id, data in enumerate(train_reader()):
                if batch_id > 5:
                    break
                run_step(batch_id, data)

        iters = 0
        train_pass_acc = fluid.average.WeightedAverage()
        for pass_id in range(args.num_passes):
            # train
            start_time = time.time()
            num_samples = 0
            train_pass_acc.reset()

            if args.profile and args.task_index == 0:
                # Warm up outside the profiler, then repeat inside it.
                run_leading_batches()
                with profiler.profiler('All', 'total', '/tmp/profile_vgg'):
                    run_leading_batches()

            for batch_id, data in enumerate(train_reader()):
                batch_began = time.time()
                loss, acc, b_size = run_step(batch_id, data)
                iters += 1
                num_samples += len(data)
                train_pass_acc.add(value=acc, weight=b_size)
                batch_speed = len(data) / (time.time() - batch_began)
                print("Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
                      "Speed = %.2f img/s" %
                      (pass_id, iters, loss, acc, batch_speed))

            pass_elapsed = time.time() - start_time
            pass_train_acc = train_pass_acc.eval()
            pass_test_acc = test(exe)
            print("Task:%d Pass = %d, Training performance = %f imgs/s, "
                  "Train accuracy = %f, Test accuracy = %f\n" %
                  (args.task_index, pass_id, num_samples / pass_elapsed,
                   pass_train_acc, pass_test_acc))
Beispiel #8
0
def main():
    """Parse the command line, echo the arguments and run ``train``.

    With ``args.profile`` set, training runs inside the CUDA profiler when
    ``args.use_gpu`` is set and inside the CPU profiler otherwise.
    """
    args = parser.parse_args()
    print_arguments(args)

    if not args.profile:
        train(args)
        return

    if args.use_gpu:
        profiling = profiler.cuda_profiler("cuda_profiler.txt", 'csv')
    else:
        profiling = profiler.profiler("CPU", sorted_key='total')
    with profiling:
        train(args)
Beispiel #9
0
    def net_profiler(self,
                     exe,
                     state,
                     tracer_option,
                     batch_range=None,
                     use_parallel_executor=False,
                     use_new_api=False):
        """Profile ten iterations of the test program, then check the result.

        Args:
            exe: executor used for the startup program and each iteration.
            state: profiler state passed to whichever profiler API is used.
            tracer_option: tracer option (legacy API) / ``tracer_level``
                (new API).
            batch_range: batch range for the new API; ``[0, 10]`` is used
                when it is ``None``.
            use_parallel_executor: forwarded to ``build_program`` as
                ``compile_program``.
            use_new_api: use ``utils.Profiler`` instead of
                ``profiler.profiler``.
        """
        built = self.build_program(compile_program=use_parallel_executor)
        main_program, startup_program, avg_cost, batch_size, batch_acc = built
        exe.run(startup_program)

        profile_path = self.get_profile_path()
        if use_new_api:
            effective_range = [0, 10] if batch_range is None else batch_range
            options = utils.ProfilerOptions(options={
                'state': state,
                'sorted_key': 'total',
                'tracer_level': tracer_option,
                'batch_range': effective_range,
                'profile_path': profile_path
            })
            with utils.Profiler(enabled=True, options=options):
                pass_acc_calculator = fluid.average.WeightedAverage()
                for step in range(10):
                    self.run_iter(exe, main_program,
                                  [avg_cost, batch_acc, batch_size],
                                  pass_acc_calculator)
                    utils.get_profiler().record_step()
                    # Without an explicit range, reset after the third step.
                    if batch_range is None and step == 2:
                        utils.get_profiler().reset()
        else:
            with profiler.profiler(state, 'total', profile_path, tracer_option):
                pass_acc_calculator = fluid.average.WeightedAverage()
                for step in range(10):
                    # Discard what was recorded during the first two steps.
                    if step == 2:
                        profiler.reset_profiler()
                    self.run_iter(exe, main_program,
                                  [avg_cost, batch_acc, batch_size],
                                  pass_acc_calculator)

        self.check_profile_result(profile_path)
    def train_loop(exe, trainer_prog):
        """Train for ``args.num_passes`` passes, profiling each pass.

        Args:
            exe: executor that runs ``trainer_prog``; also handed to the
                accuracy evaluator and to ``test``.
            trainer_prog: program passed to ``exe.run`` for every batch.
        """
        iters = 0
        for pass_id in range(args.num_passes):
            # train
            start_time = time.time()
            num_samples = 0
            accuracy.reset(exe)
            with profiler.profiler(args.device, 'total'):
                for batch_id, data in enumerate(train_reader()):
                    batch_began = time.time()
                    img_data = np.array(
                        [sample[0].reshape(data_shape)
                         for sample in data]).astype("float32")
                    y_data = np.array(
                        [sample[1] for sample in data]).astype("int64")
                    y_data = y_data.reshape([-1, 1])

                    loss, acc = exe.run(
                        trainer_prog,
                        feed={"pixel": img_data,
                              "label": y_data},
                        fetch_list=[avg_cost] + accuracy.metrics)
                    iters += 1
                    num_samples += len(data)
                    # NOTE(review): per the original author, the accuracy
                    # shown here is accumulated over batches rather than
                    # being that of the current batch -- confirm.
                    batch_spent = time.time() - batch_began
                    print(
                        "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f"
                        % (pass_id, iters, loss, acc, batch_spent))

            pass_elapsed = time.time() - start_time
            pass_train_acc = accuracy.eval(exe)
            pass_test_acc = test(exe)
            throughput = num_samples / pass_elapsed
            print(
                "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
                % (pass_id, throughput, pass_train_acc, pass_test_acc))
Beispiel #11
0
    def train_loop(exe, trainer_prog, trainer_id=0, reader=train_reader):
        """Load the word embedding, then train and evaluate pass by pass.

        Each pass, including its train-set and test-set evaluation, runs
        inside the CPU profiler.

        Args:
            exe: executor used for training and both evaluations.
            trainer_prog: program passed to ``exe.run`` for every batch.
            trainer_id: not used by this function.
            reader: callable returning an iterable of batches.
        """
        # Overwrite the 'emb' parameter with the pre-trained word vectors.
        embedding_param = fluid.global_scope().find_var('emb').get_tensor()
        embedding_param.set(word_vector_values, place)

        # batch_id keeps counting across passes; it is never reset.
        batch_id = 0
        for pass_id in xrange(num_passes):
            chunk_evaluator.reset(exe)
            start_time = time.time()
            with profiler.profiler(
                    "CPU", 'total',
                    profile_path="/usr/local/nvidia/lib64/tmp"):
                for data in reader():
                    fetched = exe.run(
                        trainer_prog,
                        feed=feeder.feed(data),
                        fetch_list=[avg_cost] + chunk_evaluator.metrics)
                    cost, batch_precision, batch_recall, batch_f1_score = fetched
                    if batch_id % 5 == 0:
                        print("".join([
                            "Pass ", str(pass_id),
                            ", Batch ", str(batch_id),
                            ", Cost ", str(cost[0]),
                            ", Precision ", str(batch_precision[0]),
                            ", Recall ", str(batch_recall[0]),
                            ", F1_score", str(batch_f1_score[0]),
                        ]))
                    batch_id += 1

                train_metrics = chunk_evaluator.eval(exe)
                pass_precision, pass_recall, pass_f1_score = train_metrics
                spent = time.time() - start_time
                print("pass_id: %d, precision: %f, recall: %f, f1: %f, spent: %f, speed: %f" %
                      (pass_id, pass_precision, pass_recall, pass_f1_score,
                       spent, 14987.0 / spent))

                test_metrics = test(exe, chunk_evaluator, inference_program,
                                    test_reader, place)
                pass_precision, pass_recall, pass_f1_score = test_metrics
                print("".join([
                    "[TestSet] pass_id:", str(pass_id),
                    " pass_precision:", str(pass_precision),
                    " pass_recall:", str(pass_recall),
                    " pass_f1_score:", str(pass_f1_score),
                ]))
Beispiel #12
0
    def train_loop(use_gpu, trainer_prog, trainer_id=0):
        """Train for ``args.num_passes`` passes, profiling every pass.

        The weighted accuracy accumulator is reset at the start of each pass
        so that the reported "Train accuracy" covers that pass only rather
        than every batch seen since training began.

        Args:
            use_gpu: run on ``core.CUDAPlace(0)`` when truthy, otherwise on
                ``core.CPUPlace()``.
            trainer_prog: program passed to ``exe.run`` for every batch.
            trainer_id: not used by this function.
        """
        place = core.CPUPlace() if not use_gpu else core.CUDAPlace(0)
        iters = 0
        accuracy = fluid.average.WeightedAverage()
        feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
        exe = fluid.Executor(place)

        for pass_id in range(args.num_passes):
            # train
            start_time = time.time()
            num_samples = 0
            # Start every pass from an empty accumulator.
            accuracy.reset()
            with profiler.profiler(
                    "All", 'total',
                    profile_path="/usr/local/nvidia/lib64/tmp"):
                for batch_id, data in enumerate(train_reader()):
                    batch_st = time.time()
                    loss, acc, weight = exe.run(
                        trainer_prog,
                        feed=feeder.feed(data),
                        fetch_list=[avg_cost, batch_acc, batch_size_tensor])
                    accuracy.add(value=acc, weight=weight)
                    iters += 1
                    num_samples += len(data)
                    # `acc` is the value fetched for this batch; the
                    # pass-level figure comes from `accuracy.eval()` below.
                    print(
                        "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, batch spent %f" %
                        (pass_id, iters, loss, acc, time.time() - batch_st))

            pass_elapsed = time.time() - start_time
            pass_train_acc = accuracy.eval()
            pass_test_acc = test(exe)
            print(
                "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
                % (pass_id, num_samples / pass_elapsed, pass_train_acc,
                   pass_test_acc))
Beispiel #13
0
                                                                        optimizer._global_learning_rate().numpy(),
                                                                        total_loss / args.batch_size / args.log_period))

                    total_loss = 0.0

                if total_step > 0 and total_step % args.save_model_period == 0:
                    if fluid.dygraph.parallel.Env().dev_id == 0:
                        model_file = os.path.join(args.save_model_dir, 'step_{}'.format(total_step))
                        fluid.save_dygraph(ocr_attention.state_dict(), model_file)
                        print('step_{}.pdparams saved!'.format(total_step))
                if total_step > 0 and total_step % args.eval_period == 0:
                    ocr_attention.eval()
                    evaluate(ocr_attention, test_reader, args.batch_size)
                    ocr_attention.train()

                batch_id += 1


if __name__ == '__main__':
    # Script entry point: parse and echo the arguments, then train --
    # optionally inside the CUDA profiler (GPU) or the CPU profiler.
    args = parser.parse_args()
    print_arguments(args)
    if args.profile and args.use_gpu:
        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
            train(args)
    elif args.profile:
        with profiler.profiler("CPU", sorted_key='total') as cpuprof:
            train(args)
    else:
        train(args)
Beispiel #14
0
def profile_context(profile=True):
    """Yield once, inside the profiler only when ``profile`` is truthy.

    The profiler is started with state ``'All'``, sort key ``'total'`` and
    output path ``/tmp/paddingrnn.profile``.
    """
    if not profile:
        yield
        return
    with profiler.profiler('All', 'total', '/tmp/paddingrnn.profile'):
        yield
Beispiel #15
0
def profile(args):
    """Profile the training process.

    Builds the stacked LSTMP model and an Adam optimizer, then runs up to
    ``args.max_batch_num`` batches under the profiler. Profiler statistics
    and the throughput counters are reset once ``args.first_batches_to_skip``
    batches have run, so start-up batches are excluded from the report.

    Args:
        args: parsed command-line namespace. ``args.sorted_key`` may be the
            literal string ``'None'``, which is translated to ``None``
            before it is handed to the profiler.

    Raises:
        ValueError: if ``first_batches_to_skip`` is negative or not smaller
            than ``max_batch_num``.
    """

    if not args.first_batches_to_skip < args.max_batch_num:
        raise ValueError("arg 'first_batches_to_skip' must be smaller than "
                         "'max_batch_num'.")
    if not args.first_batches_to_skip >= 0:
        raise ValueError(
            "arg 'first_batches_to_skip' must not be smaller than 0.")

    _, avg_cost, accuracy = stacked_lstmp_model(frame_dim=args.frame_dim,
                                                hidden_dim=args.hidden_dim,
                                                proj_dim=args.proj_dim,
                                                stacked_num=args.stacked_num,
                                                class_num=args.class_num,
                                                parallel=args.parallel)

    optimizer = fluid.optimizer.Adam(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=args.learning_rate,
            decay_steps=1879,
            decay_rate=1 / 1.2,
            staircase=True))
    optimizer.minimize(avg_cost)

    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    ltrans = [
        trans_add_delta.TransAddDelta(2, 2),
        trans_mean_variance_norm.TransMeanVarianceNorm(args.mean_var),
        trans_splice.TransSplice(5, 5),
        trans_delay.TransDelay(5)
    ]

    data_reader = reader.AsyncDataReader(args.feature_lst,
                                         args.label_lst,
                                         -1,
                                         split_sentence_threshold=1024)
    data_reader.set_transformers(ltrans)

    feature_t = fluid.LoDTensor()
    label_t = fluid.LoDTensor()

    # Compare by value: an identity test against a string literal only works
    # when the interpreter happens to intern both strings.
    sorted_key = None if args.sorted_key == 'None' else args.sorted_key
    with profiler.profiler(args.device, sorted_key) as prof:
        frames_seen, start_time = 0, 0.0
        for batch_id, batch_data in enumerate(
                data_reader.batch_iterator(args.batch_size,
                                           args.minimum_batch_size)):
            if batch_id >= args.max_batch_num:
                break
            if args.first_batches_to_skip == batch_id:
                # Measurement starts here; earlier batches are discarded.
                profiler.reset_profiler()
                start_time = time.time()
                frames_seen = 0
            # load_data
            (features, labels, lod, _) = batch_data
            features = np.reshape(features, (-1, 11, 3, args.frame_dim))
            features = np.transpose(features, (0, 2, 1, 3))
            feature_t.set(features, place)
            feature_t.set_lod([lod])
            label_t.set(labels, place)
            label_t.set_lod([lod])

            # The last LoD offset is added as this batch's frame count.
            frames_seen += lod[-1]

            outs = exe.run(fluid.default_main_program(),
                           feed={
                               "feature": feature_t,
                               "label": label_t
                           },
                           fetch_list=[avg_cost, accuracy]
                           if args.print_train_acc else [],
                           return_numpy=False)

            if args.print_train_acc:
                print("Batch %d acc: %f" %
                      (batch_id, lodtensor_to_ndarray(outs[1])[0]))
            else:
                # Progress indicator when accuracy is not being printed.
                sys.stdout.write('.')
                sys.stdout.flush()
        time_consumed = time.time() - start_time
        frames_per_sec = frames_seen / time_consumed
        print("\nTime consumed: %f s, performance: %f frames/s." %
              (time_consumed, frames_per_sec))
Beispiel #16
0
def train_parallel_exe(args):
    """Benchmark training with ``fluid.ParallelExecutor`` and print timings.

    Input comes from a RecordIO file when ``args.use_recordio`` is set,
    otherwise from a batched reader; with ``args.fix_data_in_gpu`` the first
    batch is fetched once and reused for every iteration. When
    ``args.do_profile`` is set, iterations 5, 6 and 7 run under the profiler
    and are left out of the image count.

    Iterator advancement, the iteration range and the skip-count division
    use forms that behave the same on Python 2 and Python 3
    (``next(it)``, ``range``, ``//``).

    Args:
        args: parsed command-line namespace holding the benchmark options.
    """

    class_dim = 1000
    image_shape = [3, 224, 224]

    if args.use_recordio:
        reader = fluid.layers.open_recordio_file(
            filename='./flowers_bs_12_3_224_224.recordio',
            shapes=[[-1, 3, 224, 224], [-1, 1]],
            lod_levels=[0, 0],
            dtypes=['float32', 'int64'])
        image, label = fluid.layers.read_file(reader)
    else:
        image = fluid.layers.data(
            name='image', shape=image_shape, dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')

        place = fluid.CUDAPlace(0)
        feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
        train_reader = feeder.decorate_reader(
            paddle.batch(
                train() if args.use_fake_reader else flowers.train(),
                batch_size=args.batch_size_per_gpu),
            multi_devices=True)

        train_reader_iter = train_reader()
        if args.fix_data_in_gpu:
            # Fetch one batch up front and reuse it for every iteration.
            data = next(train_reader_iter)
            feed_data = data

    def next_feed():
        """Return the fixed batch, or pull the next one from the reader."""
        return feed_data if args.fix_data_in_gpu else next(train_reader_iter)

    prediction, avg_cost, accuracy, accuracy5 = net_conf(image, label,
                                                         class_dim)

    add_optimizer(args, avg_cost)

    place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.allow_op_delay = True

    build_strategy = fluid.BuildStrategy()
    if args.balance_parameter_opt_between_cards:
        build_strategy.reduce_strategy = \
            fluid.BuildStrategy.ReduceStrategy.Reduce
    else:
        build_strategy.reduce_strategy = \
            fluid.BuildStrategy.ReduceStrategy.AllReduce

    # From here on `exe` is the parallel executor, not the start-up one.
    exe = fluid.ParallelExecutor(
        loss_name=avg_cost.name,
        use_cuda=True,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    time_record = []
    train_start = time.time()
    img_count = 0
    for batch_id in range(args.number_iteration):
        if args.do_profile and batch_id >= 5 and batch_id < 8:
            with profiler.profiler('All', 'total',
                                   '/tmp/profile_parallel_exe') as prof:
                if args.use_recordio:
                    exe.run([])
                else:
                    exe.run([], feed=next_feed())
            continue

        # Fetch the cost only on display steps.
        is_display_step = (batch_id + 1) % args.display_step == 0
        fetch_names = [avg_cost.name] if is_display_step else []
        if args.use_recordio:
            cost_val = exe.run(fetch_names)
        else:
            cost_val = exe.run(fetch_names, feed=next_feed())

        img_count += args.batch_size

        if is_display_step:
            train_stop = time.time()
            step_time = train_stop - train_start
            time_record.append(step_time)

            print("iter=%d, cost=%s, elapse=%f, img/sec=%f" %
                  ((batch_id + 1), np.array(cost_val[0]), step_time,
                   img_count / step_time))

            img_count = 0
            train_start = time.time()

    # Floor division keeps the slice bound an integer on every Python
    # version; the leading records are dropped from the average.
    skip_time_record = args.skip_first_steps // args.display_step
    time_record[0:skip_time_record] = []

    if args.show_record_time:
        for i, ele in enumerate(time_record):
            print("iter:{0}, time consume:{1}".format(i, ele))

    img_count = (
        args.number_iteration - args.skip_first_steps) * args.batch_size

    print("average time:{0}, img/sec:{1}".format(
        np.mean(time_record), img_count / np.sum(time_record)))
Beispiel #17
0
def profile_context(profile=True, profiler_path='./seq2seq.profile'):
    """Yield exactly once, optionally inside a profiler session.

    When ``profile`` is truthy the single ``yield`` happens inside
    ``profiler.profiler('All', 'total', profiler_path)``; otherwise the
    generator just yields and finishes without touching the profiler.

    NOTE(review): the generator shape suggests this is meant to be wrapped
    with ``contextlib.contextmanager``; no decorator is visible here.

    Args:
        profile: whether to open the profiler around the yielded section.
        profiler_path: third argument forwarded to ``profiler.profiler``.
    """
    if not profile:
        # Profiling disabled: hand control to the caller and stop.
        yield
        return
    with profiler.profiler('All', 'total', profiler_path):
        yield
0
    def train_loop(exe, trainer_prog):
        """Run ``args.num_passes`` training passes and print metrics.

        For every batch the loss/accuracy/throughput is printed; after every
        pass the test accuracy is computed with ``test(exe)`` and a single
        ``**metrics_data`` line is emitted.

        Args:
            exe: executor used to run ``trainer_prog``; also passed to ``test``.
            trainer_prog: program executed for each training step.
        """
        iters = 0
        train_pass_acc = fluid.average.WeightedAverage()
        acc_4passes = None
        converge_speed = None
        for pass_id in range(args.num_passes):
            # train
            start_time = time.time()
            num_samples = 0
            train_pass_acc.reset()

            def run_step(batch_id, data):
                """Feed one mini-batch and return (loss, acc, batch size)."""
                # Build the batch with list comprehensions rather than map():
                # on Python 3 map() is lazy, so np.array(map(...)) would wrap
                # the iterator object instead of stacking the samples.
                img_data = np.array(
                    [sample[0].reshape(data_shape)
                     for sample in data]).astype("float32")
                y_data = np.array([sample[1]
                                   for sample in data]).astype("int64")
                y_data = y_data.reshape([-1, 1])

                loss, acc, b_size = exe.run(
                    trainer_prog,
                    feed={
                        "pixel": img_data,
                        "label": y_data
                    },
                    fetch_list=[avg_cost, batch_acc, batch_size])
                return loss, acc, b_size

            if args.profile and args.task_index == 0:
                # warmup.
                for batch_id, data in enumerate(train_reader()):
                    if batch_id > 5: break
                    run_step(batch_id, data)
                # Profile a second short run of batches 0..5.
                with profiler.profiler('All', 'total', '/tmp/profile_vgg'):
                    for batch_id, data in enumerate(train_reader()):
                        if batch_id > 5: break
                        run_step(batch_id, data)

            for batch_id, data in enumerate(train_reader()):
                ts = time.time()
                loss, acc, b_size = run_step(batch_id, data)
                iters += 1
                num_samples += len(data)
                train_pass_acc.add(value=acc, weight=b_size)
                # `acc` is the value fetched from `batch_acc` for this step;
                # the weighted running accuracy lives in `train_pass_acc`.
                print(
                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, "
                    "Speed = %.2f img/s" %
                    (pass_id, iters, loss, acc, len(data) / (time.time() - ts))
                )
                # Record the time elapsed in this pass whenever the target
                # accuracy is met.  NOTE(review): the loop is not stopped
                # here and the value is overwritten by every later batch
                # that also meets the target -- confirm this is intended.
                if args.acc_target and acc >= args.acc_target:
                    converge_speed = time.time() - start_time

            pass_elapsed = time.time() - start_time
            pass_train_acc = train_pass_acc.eval()
            pass_test_acc = test(exe)

            if pass_id == 4:
                acc_4passes = pass_train_acc

            msgs = []
            msgs.append("pass = %d" % pass_id)
            # Parenthesised: `%` and `/` share precedence, so without the
            # parentheses the string was formatted first and then divided
            # by a float, raising TypeError.
            msgs.append("train_speed = %f" % (num_samples / pass_elapsed))
            msgs.append("train_accuracy = %f" % pass_train_acc)
            msgs.append("test_accuracy = %f" % pass_test_acc)
            if isinstance(acc_4passes, float):
                msgs.append("acc_4passes = %f" % acc_4passes)
            # converge_speed is a float time difference, so the former
            # isinstance(..., int) test never fired; report it once set.
            if converge_speed is not None:
                msgs.append("converge_speed = %d" % converge_speed)

            print("**metrics_data: " + ", ".join(msgs))
Beispiel #19
0
            sys.stdout.flush()
            batch_id += 1
            total_batch_num = total_batch_num + 1  # this is for benchmark
            # profiler tools for benchmark
            if args.profile and epoch == 0 and batch_id == 10:
                profiler.reset_profiler()
            elif args.profile and epoch == 0 and batch_id == 15:
                return

        if args.run_test and not args.run_ce:
            test(epoch)
        if args.save_checkpoints and not args.run_ce:
            checkpoints(epoch)
    if args.run_ce:
        print("kpis,g_train_cost,{}".format(np.mean(losses[0])))
        print("kpis,d_train_cost,{}".format(np.mean(losses[1])))
        print("kpis,duration,{}".format(t_time / args.epoch))


if __name__ == "__main__":
    # Entry point: parse and echo the options, then run train() either
    # plainly or wrapped in the profiler that matches the selected device.
    args = parser.parse_args()
    print_arguments(args)
    if not args.profile:
        train(args)
    elif args.use_gpu:
        # GPU run: profile everything and write the result to
        # args.profiler_path.
        with profiler.profiler('All', 'total', args.profiler_path) as prof:
            train(args)
    else:
        # CPU run: CPU-only profiling, sorted by 'total'.
        with profiler.profiler("CPU", sorted_key='total') as cpuprof:
            train(args)
    sgd_optimizer.minimize(avg_cost)

    # The training data set.
    train_reader = paddle.batch(paddle.reader.shuffle(
        paddle.dataset.imdb.train(word_dict), buf_size=51200),
                                batch_size=conf.batch_size)

    # The testing data set.
    test_reader = paddle.batch(paddle.reader.shuffle(
        paddle.dataset.imdb.test(word_dict), buf_size=51200),
                               batch_size=conf.batch_size)

    if conf.use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    exe = fluid.Executor(place)

    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)

    exe.run(fluid.default_startup_program())

    print("Done Inferring.")


if __name__ == '__main__':
    # Entry point: parse the command-line options, then run main() on
    # args.dict_path inside a profiler session opened with state "GPU"
    # and 'total' as its second argument.
    args = parse_args()
    with profiler.profiler("GPU", 'total') as prof:
        main(args.dict_path)
Beispiel #21
0
def train():
    """Build the RCNN training program and time ``cfg.max_iter`` iterations.

    The model (ResNet-50 conv4 body, ResNet conv5 RoI head) and a Momentum
    optimizer are built on the default program, parameters are initialised
    and optionally overwritten from ``cfg.pretrained_model``, two warm-up
    iterations are run, and then ``cfg.max_iter`` iterations are timed
    (inside the profiler when ``cfg.use_profile`` is set).  A one-line
    summary with total/reader/run time and images per second is printed.
    """
    learning_rate = cfg.learning_rate
    image_shape = [3, cfg.TRAIN.max_size, cfg.TRAIN.max_size]
    num_iterations = cfg.max_iter

    # Count the comma-separated entries of CUDA_VISIBLE_DEVICES.  When the
    # variable is unset or empty, "".split(",") is [''] so the count is 1.
    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    devices_num = len(devices.split(","))
    total_batch_size = devices_num * cfg.TRAIN.im_per_batch
    model = model_builder.RCNN(
        add_conv_body_func=resnet.add_ResNet50_conv4_body,
        add_roi_box_head_func=resnet.add_ResNet_roi_conv5_head,
        use_pyreader=cfg.use_pyreader,
        use_random=False)
    model.build_model(image_shape)
    losses, keys = model.loss()
    # Only the first loss is optimised and fetched.
    loss = losses[0]
    fetch_list = [loss]

    # One learning-rate value per interval delimited by cfg.lr_steps:
    # learning_rate * gamma**i for i = 0..len(cfg.lr_steps).
    boundaries = cfg.lr_steps
    gamma = cfg.lr_gamma
    step_num = len(cfg.lr_steps)
    values = [learning_rate * (gamma**i) for i in range(step_num + 1)]

    optimizer = fluid.optimizer.Momentum(
        learning_rate=exponential_with_warmup_decay(
            learning_rate=learning_rate,
            boundaries=boundaries,
            values=values,
            warmup_iter=500,
            warmup_factor=1.0 / 3.0),
        regularization=fluid.regularizer.L2Decay(0.0001),
        momentum=0.9)
    optimizer.minimize(loss)

    # Applied after minimize(), i.e. on the program that already contains
    # the backward and optimizer ops.
    fluid.memory_optimize(fluid.default_main_program())

    place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if cfg.pretrained_model:

        def if_exist(var):
            """Return True when a file named after ``var`` exists in the
            pretrained-model directory."""
            return os.path.exists(os.path.join(cfg.pretrained_model, var.name))

        # Load only the variables that have a matching file on disk.
        fluid.io.load_vars(exe, cfg.pretrained_model, predicate=if_exist)

    # train_exe exists only in parallel mode; every later use is guarded by
    # the same cfg.parallel check.
    if cfg.parallel:
        train_exe = fluid.ParallelExecutor(use_cuda=bool(cfg.use_gpu),
                                           loss_name=loss.name)

    # Either attach the reader to the model's py_reader, or keep a plain
    # reader plus a DataFeeder for explicit feeding.  `feeder` is defined
    # only in the second case, `py_reader` only in the first.
    if cfg.use_pyreader:
        train_reader = reader.train(batch_size=cfg.TRAIN.im_per_batch,
                                    total_batch_size=total_batch_size,
                                    padding_total=cfg.TRAIN.padding_minibatch,
                                    shuffle=False)
        py_reader = model.py_reader
        py_reader.decorate_paddle_reader(train_reader)
    else:
        train_reader = reader.train(batch_size=total_batch_size, shuffle=False)
        feeder = fluid.DataFeeder(place=place, feed_list=model.feeds())

    def run(iterations):
        """Run ``iterations`` steps with explicit feeding.

        Returns:
            (reader_time, run_time, total_images): per-step reader timings,
            per-step executor timings, and the summed ``len(data)``.
        """
        reader_time = []
        run_time = []
        total_images = 0

        for batch_id in range(iterations):
            start_time = time.time()
            # train_reader() is called anew on every iteration and only the
            # first item of the fresh generator is consumed, so the measured
            # reader time includes creating the reader.
            # NOTE(review): with shuffle=False this presumably re-reads the
            # same first batch each step -- confirm that is intended.
            data = next(train_reader())
            end_time = time.time()
            reader_time.append(end_time - start_time)
            start_time = time.time()
            if cfg.parallel:
                outs = train_exe.run(fetch_list=[v.name for v in fetch_list],
                                     feed=feeder.feed(data))
            else:
                outs = exe.run(fluid.default_main_program(),
                               fetch_list=[v.name for v in fetch_list],
                               feed=feeder.feed(data))
            end_time = time.time()
            run_time.append(end_time - start_time)
            total_images += len(data)
            print("Batch {:d}, loss {:.6f} ".format(batch_id,
                                                    np.mean(outs[0])))
        return reader_time, run_time, total_images

    def run_pyreader(iterations):
        """Run ``iterations`` steps with data supplied by ``py_reader``.

        Reader time is not measured separately here, so ``reader_time`` is
        the placeholder ``[0]``.  ``total_images`` grows by ``devices_num``
        per step.

        Returns:
            (reader_time, run_time, total_images)
        """
        reader_time = [0]
        run_time = []
        total_images = 0

        # NOTE(review): start() is called on every invocation, while reset()
        # only happens on EOFException; the warm-up call below returns
        # without a reset before the timed call starts the reader again --
        # confirm py_reader tolerates this.
        py_reader.start()
        try:
            for batch_id in range(iterations):
                start_time = time.time()
                if cfg.parallel:
                    outs = train_exe.run(
                        fetch_list=[v.name for v in fetch_list])
                else:
                    outs = exe.run(fluid.default_main_program(),
                                   fetch_list=[v.name for v in fetch_list])
                end_time = time.time()
                run_time.append(end_time - start_time)
                total_images += devices_num
                print("Batch {:d}, loss {:.6f} ".format(
                    batch_id, np.mean(outs[0])))
        except fluid.core.EOFException:
            # The reader ran out of data before `iterations` steps.
            py_reader.reset()

        return reader_time, run_time, total_images

    run_func = run if not cfg.use_pyreader else run_pyreader

    # warm-up: two untimed iterations whose results are discarded
    run_func(2)
    # profiling: the wall-clock measurement covers the whole timed run,
    # including profiler start-up/teardown when cfg.use_profile is set
    start = time.time()
    if cfg.use_profile:
        with profiler.profiler('GPU', 'total', '/tmp/profile_file'):
            reader_time, run_time, total_images = run_func(num_iterations)
    else:
        reader_time, run_time, total_images = run_func(num_iterations)

    end = time.time()
    total_time = end - start
    print(
        "Total time: {0}, reader time: {1} s, run time: {2} s, images/s: {3}".
        format(total_time, np.sum(reader_time), np.sum(run_time),
               total_images / total_time))
Beispiel #22
0
def profile_context(profile=True):
    """Yield exactly once, optionally inside a profiler session.

    When ``profile`` is truthy the single ``yield`` happens inside
    ``profiler.profiler('All', 'total', args.profiler_path)``, where ``args``
    is looked up in the enclosing/module scope; otherwise the generator just
    yields and finishes.

    NOTE(review): the generator shape suggests this is meant to be wrapped
    with ``contextlib.contextmanager``; no decorator is visible here.
    """
    if not profile:
        # Profiling disabled: hand control to the caller and stop.
        yield
        return
    with profiler.profiler('All', 'total', args.profiler_path):
        yield
Beispiel #23
0
def profile(args):
    """Build the transformer training program and time a short training run.

    The network and an Adam optimizer are built on the default program,
    parameters are either loaded from ``TrainTaskConfig.ckpt_path`` or
    initialised by the startup program, ``train_loop`` is called once as a
    warm-up and once more for the measured run.  The measured run is wrapped
    in the profiler only when a single device is used.

    Args:
        args: parsed command-line options.  Attributes read here:
            ``device``, ``src_vocab_fpath``, ``trg_vocab_fpath``,
            ``train_file_pattern``, ``use_token_batch``, ``batch_size``,
            ``pool_size``, ``special_token`` and ``num_iters``.
    """
    # Call form instead of the Python 2 print statement: identical output
    # for a single argument, and it also parses under Python 3.
    print(args)

    if args.device == 'CPU':
        TrainTaskConfig.use_gpu = False

    if not TrainTaskConfig.use_gpu:
        place = fluid.CPUPlace()
        dev_count = multiprocessing.cpu_count()
    else:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()

    exe = fluid.Executor(place)

    sum_cost, avg_cost, predict, token_num = transformer(
        ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
        ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
        ModelHyperParams.n_head, ModelHyperParams.d_key,
        ModelHyperParams.d_value, ModelHyperParams.d_model,
        ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
        ModelHyperParams.weight_sharing, TrainTaskConfig.label_smooth_eps)
    lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model,
                                         TrainTaskConfig.warmup_steps,
                                         TrainTaskConfig.learning_rate)

    optimizer = fluid.optimizer.Adam(learning_rate=lr_scheduler.learning_rate,
                                     beta1=TrainTaskConfig.beta1,
                                     beta2=TrainTaskConfig.beta2,
                                     epsilon=TrainTaskConfig.eps)
    optimizer.minimize(sum_cost)

    # Initialize the parameters.
    if TrainTaskConfig.ckpt_path:
        fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path)
        lr_scheduler.current_steps = TrainTaskConfig.start_step
    else:
        exe.run(fluid.framework.default_startup_program())

    # Disable all sorts for they will be done in the 1st batch.
    train_data = reader.DataReader(
        src_vocab_fpath=args.src_vocab_fpath,
        trg_vocab_fpath=args.trg_vocab_fpath,
        fpattern=args.train_file_pattern,
        use_token_batch=args.use_token_batch,
        # With sample-count batching the batch is scaled by the device
        # count; with token batching it is left as is and read_multiple()
        # below pulls dev_count batches instead.
        batch_size=args.batch_size *
        (1 if args.use_token_batch else dev_count),
        pool_size=args.pool_size,
        sort_type='none',
        shuffle=False,
        shuffle_batch=False,
        start_mark=args.special_token[0],
        end_mark=args.special_token[1],
        unk_mark=args.special_token[2],
        # count start and end tokens out
        max_length=ModelHyperParams.max_length - 2,
        clip_last_batch=False)
    train_data = read_multiple(reader=train_data.batch_generator,
                               count=dev_count if args.use_token_batch else 1)

    # train_exe exists only for multi-device runs; every later use is
    # guarded by the matching dev_count check.
    if dev_count > 1:
        build_strategy = fluid.BuildStrategy()
        build_strategy.gradient_scale_strategy = fluid.BuildStrategy.GradientScaleStrategy.Customized
        train_exe = fluid.ParallelExecutor(
            use_cuda=TrainTaskConfig.use_gpu,
            loss_name=sum_cost.name,
            main_program=fluid.default_main_program(),
            build_strategy=build_strategy)

    print("Warming up ...")
    # Warm-up call; its return value is discarded.
    # NOTE(review): the `False, 3` arguments presumably mean "not profiling,
    # 3 iterations" -- confirm against train_loop's signature.
    train_loop(exe if dev_count == 1 else train_exe,
               fluid.default_main_program(), False, 3, train_data, dev_count,
               sum_cost, avg_cost, lr_scheduler, token_num, predict)

    print("\nProfiling ...")
    if dev_count == 1:
        # Single device: run the measured loop inside the profiler.
        with profiler.profiler('All', 'total', '/tmp/profile_file'):
            total_time, exec_time = train_loop(exe,
                                               fluid.default_main_program(),
                                               True, args.num_iters,
                                               train_data, dev_count, sum_cost,
                                               avg_cost, lr_scheduler,
                                               token_num, predict)
    else:
        # Multiple devices: same measured loop, but without the profiler.
        total_time, exec_time = train_loop(train_exe,
                                           fluid.default_main_program(), True,
                                           args.num_iters, train_data,
                                           dev_count, sum_cost, avg_cost,
                                           lr_scheduler, token_num, predict)
    print("Elapsed time: total %f s, in executor %f s" %
          (total_time, exec_time))
Beispiel #24
0
    latencies = batch_times[args.skip_pass_num:]
    latency_avg = np.average(latencies)
    latency_std = np.std(latencies)
    latency_pc99 = np.percentile(latencies, 99)
    wps_avg = np.average(wpses)
    wps_std = np.std(wpses)
    wps_pc01 = np.percentile(wpses, 1)

    # Benchmark output
    print('\nTotal passes (incl. warm-up): %d' % (total_passes))
    print('Total iterations (incl. warm-up): %d' % (all_iters))
    print('Total examples (incl. warm-up): %d' % (all_iters * args.batch_size))
    print('avg latency: %.5f, std latency: %.5f, 99pc latency: %.5f' %
          (latency_avg, latency_std, latency_pc99))
    print('avg wps: %.5f, std wps: %.5f, wps for 99pc latency: %.5f' %
          (wps_avg, wps_std, wps_pc01))


if __name__ == "__main__":
    # Entry point: parse and echo the options, then run infer() either
    # plainly or under the profiler that matches the selected device.
    args = parse_args()
    print_arguments(args)
    if not args.profile:
        infer(args)
    elif args.device == 'GPU':
        # GPU run: CUDA profiler writing "cuda_profiler.txt" in csv mode.
        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
            infer(args)
    else:
        # Any other device: CPU profiler sorted by 'total'.
        with profiler.profiler('CPU', sorted_key='total') as cpuprof:
            infer(args)
Beispiel #25
0
    def net_profiler(self, state, use_parallel_executor=False):
        """Train a small network under the profiler and check the dump.

        A small fully-connected network containing a ``While`` block is
        built, run for 10 iterations inside ``profiler.profiler`` (the
        profiler is reset at iteration 2), and the written profile file is
        parsed as a ``profiler_pb2.Profile`` and checked for events.

        Args:
            state: profiler state passed to ``profiler.profiler``; ``'CPU'``
                selects ``CPUPlace``, anything else ``CUDAPlace(0)``.
            use_parallel_executor: run the steps through a
                ``ParallelExecutor`` instead of the plain executor.

        Raises:
            Exception: if a GPU kernel event has no ``detail_info`` and its
                name does not start with ``"MEM"``.
        """
        profile_path = os.path.join(tempfile.gettempdir(), "profile")
        # Create/truncate the output file; the handle is closed right away
        # instead of being left to the garbage collector.
        with open(profile_path, "w") as profile_file:
            profile_file.write("")
        startup_program = fluid.Program()
        main_program = fluid.Program()

        with fluid.program_guard(main_program, startup_program):
            image = fluid.layers.data(name='x', shape=[784], dtype='float32')
            hidden1 = fluid.layers.fc(input=image, size=64, act='relu')
            i = layers.zeros(shape=[1], dtype='int64')
            counter = fluid.layers.zeros(
                shape=[1], dtype='int64', force_cpu=True)
            until = layers.fill_constant([1], dtype='int64', value=10)
            data_arr = layers.array_write(hidden1, i)
            cond = fluid.layers.less_than(x=counter, y=until)
            while_op = fluid.layers.While(cond=cond)
            with while_op.block():
                hidden_n = fluid.layers.fc(input=hidden1, size=64, act='relu')
                layers.array_write(hidden_n, i, data_arr)
                fluid.layers.increment(x=counter, value=1, in_place=True)
                # Re-evaluate the loop condition into `cond`.
                layers.less_than(x=counter, y=until, cond=cond)

            hidden_n = layers.array_read(data_arr, i)
            hidden2 = fluid.layers.fc(input=hidden_n, size=64, act='relu')
            predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
            label = fluid.layers.data(name='y', shape=[1], dtype='int64')
            cost = fluid.layers.cross_entropy(input=predict, label=label)
            avg_cost = fluid.layers.mean(cost)
            batch_size = fluid.layers.create_tensor(dtype='int64')
            batch_acc = fluid.layers.accuracy(
                input=predict, label=label, total=batch_size)

        optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
        optimizer.minimize(avg_cost, startup_program=startup_program)

        place = fluid.CPUPlace() if state == 'CPU' else fluid.CUDAPlace(0)
        exe = fluid.Executor(place)
        exe.run(startup_program)
        if use_parallel_executor:
            pe = fluid.ParallelExecutor(
                state != 'CPU',
                loss_name=avg_cost.name,
                main_program=main_program)

        pass_acc_calculator = fluid.average.WeightedAverage()
        with profiler.profiler(state, 'total', profile_path) as prof:
            # `step` rather than `iter`, so the builtin is not shadowed.
            for step in range(10):
                if step == 2:
                    profiler.reset_profiler()
                x = np.random.random((32, 784)).astype("float32")
                y = np.random.randint(0, 10, (32, 1)).astype("int64")

                if use_parallel_executor:
                    pe.run(feed={'x': x, 'y': y}, fetch_list=[avg_cost.name])
                    continue
                outs = exe.run(main_program,
                               feed={'x': x,
                                     'y': y},
                               fetch_list=[avg_cost, batch_acc, batch_size])
                acc = np.array(outs[1])
                b_size = np.array(outs[2])
                pass_acc_calculator.add(value=acc, weight=b_size)
                # Result intentionally unused; the call is kept so eval()
                # is still exercised on every step.
                pass_acc_calculator.eval()
        # Read the dump back through a context manager so the handle is
        # closed deterministically.
        with open(profile_path, 'rb') as profile_file:
            data = profile_file.read()
        self.assertGreater(len(data), 0)
        profile_pb = profiler_pb2.Profile()
        profile_pb.ParseFromString(data)
        self.assertGreater(len(profile_pb.events), 0)
        for event in profile_pb.events:
            if event.type == profiler_pb2.Event.GPUKernel:
                if not event.detail_info and not event.name.startswith("MEM"):
                    raise Exception(
                        "Kernel %s missing event. Has this kernel been recorded by RecordEvent?"
                        % event.name)
            elif event.type == profiler_pb2.Event.CPU and (
                    event.name.startswith("Driver API") or
                    event.name.startswith("Runtime API")):
                print("Warning: unregister", event.name)
Beispiel #26
0
def profile_context(profile=True):
    """Yield exactly once, optionally inside a profiler session.

    When ``profile`` is truthy the single ``yield`` happens inside
    ``profiler.profiler('All', 'total', 'seq2seq.profile')``; otherwise the
    generator just yields and finishes without touching the profiler.

    NOTE(review): the generator shape suggests this is meant to be wrapped
    with ``contextlib.contextmanager``; no decorator is visible here.
    """
    if not profile:
        # Profiling disabled: hand control to the caller and stop.
        yield
        return
    with profiler.profiler('All', 'total', 'seq2seq.profile'):
        yield
Beispiel #27
0
def train_parallel_do(args):
    """Benchmark SE_ResNeXt training built with ``ParallelDo``.

    Runs ``args.number_iteration`` steps, printing cost and throughput every
    ``args.display_step`` steps, and finally prints the average interval
    time and images per second after discarding the first
    ``args.skip_first_steps // args.display_step`` recorded intervals.
    When ``args.do_profile`` is set, steps 5, 6 and 7 are run inside the
    profiler and skip the regular bookkeeping.

    Args:
        args: parsed command-line options.  Attributes read here:
            ``use_fake_reader``, ``batch_size``, ``fix_data_in_gpu``,
            ``number_iteration``, ``do_profile``, ``display_step``,
            ``skip_first_steps`` and ``show_record_time``.
    """

    class_dim = 1000
    image_shape = [3, 224, 224]

    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    places = fluid.layers.get_places()
    pd = fluid.layers.ParallelDo(places, use_nccl=True)

    with pd.do():
        image_ = pd.read_input(image)
        label_ = pd.read_input(label)
        out = SE_ResNeXt(input=image_, class_dim=class_dim)
        cost = fluid.layers.cross_entropy(input=out, label=label_)
        avg_cost = fluid.layers.mean(x=cost)
        accuracy = fluid.layers.accuracy(input=out, label=label_)
        pd.write_output(avg_cost)
        pd.write_output(accuracy)

    avg_cost, accuracy = pd()
    # Reduce the ParallelDo cost output to a single mean value.
    avg_cost = fluid.layers.mean(x=avg_cost)
    # accuracy = fluid.layers.mean(x=accuracy)

    add_optimizer(args, avg_cost)

    place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    train_reader = paddle.batch(
        train() if args.use_fake_reader else flowers.train(),
        batch_size=args.batch_size)

    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
    train_reader_iter = train_reader()
    if args.fix_data_in_gpu:
        # Convert a single batch once and feed it on every step.
        # next(it) replaces the Python 2-only it.next() method.
        data = next(train_reader_iter)
        feed_data = feeder.feed(data)

    time_record = []
    img_count = 0
    train_start = time.time()

    for batch_id in range(args.number_iteration):
        if args.do_profile and batch_id >= 5 and batch_id < 8:
            # Profiled steps: nothing is fetched, and the `continue` skips
            # the image counting and the display bookkeeping below.
            with profiler.profiler('All', 'total',
                                   '/tmp/profile_parallel_do') as prof:
                exe.run(fluid.default_main_program(),
                        feed=feed_data if args.fix_data_in_gpu else
                        feeder.feed(next(train_reader_iter)),
                        fetch_list=[],
                        use_program_cache=True)
            continue

        # The cost is fetched only on the steps that print it.
        cost_val = exe.run(fluid.default_main_program(),
                           feed=feed_data if args.fix_data_in_gpu else
                           feeder.feed(next(train_reader_iter)),
                           fetch_list=[avg_cost.name]
                           if (batch_id + 1) % args.display_step == 0 else [],
                           use_program_cache=True)

        img_count += args.batch_size

        if (batch_id + 1) % args.display_step == 0:
            train_stop = time.time()
            step_time = train_stop - train_start
            time_record.append(step_time)

            print("iter=%d, cost=%s, elapse=%f, img/sec=%f" %
                  ((batch_id + 1), np.array(cost_val[0]), step_time,
                   img_count / step_time))

            # Start a new measurement interval.
            img_count = 0
            train_start = time.time()

    # Floor division keeps the slice bound an integer under true division
    # as well; for ints on Python 2 the result is unchanged.
    skip_time_record = args.skip_first_steps // args.display_step
    time_record[0:skip_time_record] = []

    if args.show_record_time:
        for i, ele in enumerate(time_record):
            print("iter:{0}, time consume:{1}".format(i, ele))

    img_count = (
        args.number_iteration - args.skip_first_steps) * args.batch_size

    print("average time:{0}, img/sec:{1}".format(
        np.mean(time_record), img_count / np.sum(time_record)))
Beispiel #28
0
def profile_context(profile=True):
    """Yield exactly once, optionally inside a profiler session.

    When ``profile`` is truthy the single ``yield`` happens inside
    ``profiler.profiler('All', 'total', '/tmp/profile_file2')``; otherwise
    the generator just yields and finishes without touching the profiler.

    NOTE(review): the generator shape suggests this is meant to be wrapped
    with ``contextlib.contextmanager``; no decorator is visible here.
    """
    if not profile:
        # Profiling disabled: hand control to the caller and stop.
        yield
        return
    with profiler.profiler('All', 'total', '/tmp/profile_file2'):
        yield
Beispiel #29
0
def train(args, config, train_file_list, optimizer_method):
    """Build the PyramidBox / VGG-SSD training program and time it.

    The network is built inside fresh startup/train programs fed by a
    ``py_reader``, parameters are initialised and optionally overwritten
    from a pretrained model, two start-up iterations are run, and then
    ``args.num_iteration`` iterations are timed (inside the profiler unless
    ``args.parallel`` is set).  A one-line timing summary is printed.

    Args:
        args: parsed command-line options.  Attributes read here:
            ``learning_rate``, ``batch_size``, ``resize_h``, ``resize_w``,
            ``use_gpu``, ``use_pyramidbox``, ``model_save_dir``,
            ``pretrained_model``, ``num_iteration`` and ``parallel``.
        config: passed through unchanged to ``reader.train``.
        train_file_list: passed through unchanged to ``reader.train``.
        optimizer_method: ``"momentum"`` selects Momentum; any other value
            selects RMSProp.

    Raises:
        ValueError: if the resolved pretrained-model path does not exist.
    """
    learning_rate = args.learning_rate
    batch_size = args.batch_size
    height = args.resize_h
    width = args.resize_w
    use_gpu = args.use_gpu
    use_pyramidbox = args.use_pyramidbox
    model_save_dir = args.model_save_dir
    pretrained_model = args.pretrained_model
    num_iterations = args.num_iteration
    parallel = args.parallel

    # num_classes is not referenced again in this function.
    num_classes = 2
    image_shape = [3, height, width]

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        # Four reader outputs: image, face boxes, head boxes, labels.
        py_reader = fluid.layers.py_reader(
            capacity=8,
            shapes=[[-1] + image_shape, [-1, 4], [-1, 4], [-1, 1]],
            lod_levels=[0, 1, 1, 1],
            dtypes=["float32", "float32", "float32", "int32"],
            use_double_buffer=True)
        with fluid.unique_name.guard():
            image, face_box, head_box, gt_label = fluid.layers.read_file(
                py_reader)
            fetches = []
            network = PyramidBox(image=image,
                                 face_box=face_box,
                                 head_box=head_box,
                                 gt_label=gt_label,
                                 sub_network=use_pyramidbox)
            # `loss` is what gets optimised; `fetches` is what gets printed.
            if use_pyramidbox:
                face_loss, head_loss, loss = network.train()
                fetches = [face_loss, head_loss]
            else:
                loss = network.vgg_ssd_loss()
                fetches = [loss]
            # Count the comma-separated entries of CUDA_VISIBLE_DEVICES;
            # when unset or empty, "".split(",") is [''] so the count is 1.
            devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
            devices_num = len(devices.split(","))
            batch_size_per_device = batch_size // devices_num
            # NOTE(review): 12880 is presumably the number of training
            # images -- confirm against the dataset.
            steps_per_pass = 12880 // batch_size
            # Learning-rate schedule: boundaries at passes 50/80/120/140
            # (expressed in steps), with one value per resulting interval.
            boundaries = [
                steps_per_pass * 50, steps_per_pass * 80, steps_per_pass * 120,
                steps_per_pass * 140
            ]
            values = [
                learning_rate, learning_rate * 0.5, learning_rate * 0.25,
                learning_rate * 0.1, learning_rate * 0.01
            ]
            if optimizer_method == "momentum":
                optimizer = fluid.optimizer.Momentum(
                    learning_rate=fluid.layers.piecewise_decay(
                        boundaries=boundaries, values=values),
                    momentum=0.9,
                    regularization=fluid.regularizer.L2Decay(0.0005),
                )
            else:
                optimizer = fluid.optimizer.RMSProp(
                    learning_rate=fluid.layers.piecewise_decay(
                        boundaries, values),
                    regularization=fluid.regularizer.L2Decay(0.0005),
                )
            optimizer.minimize(loss)

    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    # start_pass is computed below but not referenced again in this
    # function.
    start_pass = 0
    if pretrained_model:
        # An all-digit value is treated as a pass number whose directory
        # lives under model_save_dir.
        if pretrained_model.isdigit():
            start_pass = int(pretrained_model) + 1
            pretrained_model = os.path.join(model_save_dir, pretrained_model)
            print("Resume from %s " % (pretrained_model))

        if not os.path.exists(pretrained_model):
            raise ValueError(
                "The pre-trained model path [%s] does not exist." %
                (pretrained_model))

        def if_exist(var):
            """Return True when a file named after ``var`` exists in the
            pretrained-model directory."""
            return os.path.exists(os.path.join(pretrained_model, var.name))

        # Load only the variables that have a matching file on disk.
        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)

    # train_exe exists only in parallel mode; run() uses it under the same
    # `parallel` check.
    if parallel:
        train_exe = fluid.ParallelExecutor(use_cuda=use_gpu,
                                           loss_name=loss.name,
                                           main_program=train_prog)
    train_reader = reader.train(config,
                                train_file_list,
                                batch_size_per_device,
                                shuffle=False,
                                use_multiprocessing=True,
                                num_workers=8,
                                max_queue=24)
    py_reader.decorate_paddle_reader(train_reader)

    def run(iterations):
        """Run ``iterations`` training steps and return per-step run times.

        Each step's fetched values are averaged with ``np.mean`` and
        printed.
        """
        # global feed_data
        # NOTE(review): start() is called on every invocation and no reset
        # is visible here; run() is invoked twice below -- confirm
        # py_reader tolerates being started again.
        py_reader.start()
        run_time = []
        for batch_id in range(iterations):
            start_time = time.time()
            if parallel:
                fetch_vars = train_exe.run(
                    fetch_list=[v.name for v in fetches])
            else:
                fetch_vars = exe.run(train_prog, fetch_list=fetches)
            end_time = time.time()
            run_time.append(end_time - start_time)
            fetch_vars = [np.mean(np.array(v)) for v in fetch_vars]
            if not args.use_pyramidbox:
                print("Batch {0}, loss {1}".format(batch_id, fetch_vars[0]))
            else:
                print("Batch {0}, face loss {1}, head loss {2}".format(
                    batch_id, fetch_vars[0], fetch_vars[1]))
        return run_time

    # start-up: two untimed iterations whose timings are discarded
    run(2)

    # profiling: the profiler is used only for the non-parallel executor
    start = time.time()
    if not parallel:
        with profiler.profiler('All', 'total', '/tmp/profile_file'):
            run_time = run(num_iterations)
    else:
        run_time = run(num_iterations)
    end = time.time()
    total_time = end - start
    # "reader time" is reported as wall-clock time minus the summed
    # executor time, i.e. everything that was not spent inside run calls.
    print("Total time: {0}, reader time: {1} s, run time: {2} s".format(
        total_time, total_time - np.sum(run_time), np.sum(run_time)))