コード例 #1
0
ファイル: emlloss.py プロジェクト: wangye707/paddle-tutorial
 def __init__(self, train_batch_size=40, samples_each_class=2):
     """Record the batch layout and derive the per-GPU loss batch size.

     Args:
         train_batch_size: total batch size across all GPUs; must be a
             multiple of the GPU count reported by ``get_gpu_num()``.
         samples_each_class: number of samples per class in a batch; the
             per-GPU batch size must be a multiple of it.

     Raises:
         AssertionError: if either divisibility requirement is violated.
     """
     self.samples_each_class = samples_each_class
     self.train_batch_size = train_batch_size

     gpu_count = get_gpu_num()
     # The global batch has to split evenly across the GPUs.
     assert train_batch_size % gpu_count == 0
     per_gpu_batch = train_batch_size // gpu_count
     self.cal_loss_batch_size = per_gpu_batch
     # Each GPU must receive whole groups of `samples_each_class` samples.
     assert per_gpu_batch % samples_each_class == 0
コード例 #2
0
 def __init__(self, train_batch_size=160, samples_each_class=2,
              reg_lambda=0.01):
     """Record the batch layout, per-GPU loss batch size and regularizer weight.

     Args:
         train_batch_size: total batch size across all GPUs; must be a
             multiple of the GPU count reported by ``get_gpu_num()``.
         samples_each_class: number of samples per class in a batch; this
             implementation asserts that it equals 2.
         reg_lambda: regularization coefficient, stored as ``self.reg_lambda``.

     Raises:
         AssertionError: if ``samples_each_class`` is not 2 or a
             divisibility requirement is violated.
     """
     self.samples_each_class = samples_each_class
     # Only pairs (two samples per class) are accepted here.
     assert self.samples_each_class == 2
     self.train_batch_size = train_batch_size

     gpu_count = get_gpu_num()
     # The global batch has to split evenly across the GPUs.
     assert train_batch_size % gpu_count == 0
     per_gpu_batch = train_batch_size // gpu_count
     self.cal_loss_batch_size = per_gpu_batch
     # Each GPU must receive whole groups of `samples_each_class` samples.
     assert per_gpu_batch % samples_each_class == 0

     self.reg_lambda = reg_lambda
コード例 #3
0
def train_async(args):
    """Train the classification network, feeding batches through a data loader.

    Builds the training program with ``build_program``, runs the startup
    program, optionally restores a checkpoint and/or pretrained weights, and
    then trains with a ``fluid.ParallelExecutor``. Running averages of
    loss/acc1/acc5 are printed every ``args.display_iter_step`` iterations and
    the program is saved every ``args.save_iter_step`` iterations.

    Args:
        args: parsed options. Attributes read here: ``model``, ``checkpoint``,
            ``pretrained_model``, ``model_save_dir``, ``use_gpu``,
            ``train_batch_size``, ``total_iter_num``, ``display_iter_step``
            and ``save_iter_step``. ``args`` is also forwarded to
            ``build_program`` and ``reader.train``.

    Raises:
        AssertionError: if ``args.train_batch_size`` is not divisible by the
            number of devices.
    """
    # parameters from arguments

    logging.debug('enter train')
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    model_save_dir = args.model_save_dir

    startup_prog = fluid.Program()
    train_prog = fluid.Program()

    train_loader, train_cost, train_acc1, train_acc5, global_lr = build_program(
        main_prog=train_prog, startup_prog=startup_prog, args=args)

    train_fetch_list = [
        global_lr.name, train_cost.name, train_acc1.name, train_acc5.name
    ]

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
    # Single-trainer GPU runs feed every visible CUDA device; otherwise the
    # loader feeds only the single `place` chosen above.
    if num_trainers <= 1 and args.use_gpu:
        places = fluid.framework.cuda_places()
    else:
        places = place
    exe.run(startup_prog)

    logging.debug('after run startup program')

    if checkpoint is not None:
        fluid.load(program=train_prog, model_path=checkpoint, executor=exe)

    if pretrained_model:
        load_pretrain(train_prog, pretrained_model)

    if args.use_gpu:
        devicenum = get_gpu_num()
    else:
        devicenum = 1
    assert (args.train_batch_size % devicenum) == 0
    # The loader is given the per-device batch size.
    train_batch_size = args.train_batch_size // devicenum

    train_loader.set_sample_generator(
        reader.train(args),
        batch_size=train_batch_size,
        drop_last=True,
        places=places)

    train_exe = fluid.ParallelExecutor(
        main_program=train_prog,
        use_cuda=args.use_gpu,
        loss_name=train_cost.name)

    totalruntime = 0
    iter_no = 0
    # Accumulators: [sum of loss, sum of acc1, sum of acc5, batch count].
    train_info = [0, 0, 0, 0]
    # NOTE(review): if the loader yields no batch at all, iter_no never
    # advances and this outer loop does not terminate.
    while iter_no <= args.total_iter_num:
        for train_batch in train_loader():
            # Stop in the middle of an epoch once the iteration budget is
            # used up; without this check training would always run on to
            # the end of the current pass over the data.
            if iter_no > args.total_iter_num:
                break
            t1 = time.time()
            lr, loss, acc1, acc5 = train_exe.run(feed=train_batch,
                                                 fetch_list=train_fetch_list)
            t2 = time.time()
            period = t2 - t1
            lr = np.mean(np.array(lr))
            train_info[0] += np.mean(np.array(loss))
            train_info[1] += np.mean(np.array(acc1))
            train_info[2] += np.mean(np.array(acc5))
            train_info[3] += 1
            if iter_no % args.display_iter_step == 0:
                # totalruntime holds the iterations since the last report;
                # the current iteration's time is added only afterwards.
                avgruntime = totalruntime / args.display_iter_step
                avg_loss = train_info[0] / train_info[3]
                avg_acc1 = train_info[1] / train_info[3]
                avg_acc5 = train_info[2] / train_info[3]
                print("[%s] trainbatch %d, lr %.6f, loss %.6f, "\
                    "acc1 %.4f, acc5 %.4f, time %2.2f sec" % \
                    (fmt_time(), iter_no, lr, avg_loss, avg_acc1, avg_acc5, avgruntime))
                sys.stdout.flush()
                totalruntime = 0
                # Start a fresh averaging window after every report.
                train_info = [0, 0, 0, 0]

            totalruntime += period

            if iter_no % args.save_iter_step == 0 and iter_no != 0:
                model_path = os.path.join(model_save_dir + '/' + model_name,
                                          str(iter_no))
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                fluid.save(program=train_prog, model_path=model_path)

            iter_no += 1
コード例 #4
0
def train_async(args):
    """Train the metric-learning network and periodically evaluate recall@1.

    Builds a training and a test program with ``build_program``, runs the
    startup program, optionally restores a checkpoint and/or pretrained
    parameters, then trains with a ``fluid.ParallelExecutor``. Every
    ``args.display_iter_step`` iterations the running loss and training
    recall are printed; every ``args.test_iter_step`` iterations the test set
    is run and its recall@1 printed; every ``args.save_iter_step`` iterations
    the training program is saved.

    Args:
        args: parsed options. Attributes read here: ``model``, ``checkpoint``,
            ``pretrained_model``, ``model_save_dir``, ``use_gpu``,
            ``train_batch_size``, ``test_batch_size``, ``total_iter_num``,
            ``display_iter_step``, ``test_iter_step`` and ``save_iter_step``.
            ``args`` is also forwarded to ``build_program``, ``reader.train``
            and ``reader.test``.

    Raises:
        AssertionError: if ``args.train_batch_size`` is not divisible by the
            number of devices.
    """
    # parameters from arguments

    logging.debug('enter train')
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    model_save_dir = args.model_save_dir
    if not os.path.exists(model_save_dir):
        # makedirs (rather than mkdir) so a nested save directory whose
        # parents do not exist yet can be created as well.
        os.makedirs(model_save_dir)
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    tmp_prog = fluid.Program()

    train_loader, train_cost, global_lr, train_feas, train_label = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)
    test_loader, test_feas = build_program(is_train=False,
                                           main_prog=tmp_prog,
                                           startup_prog=startup_prog,
                                           args=args)
    test_prog = tmp_prog.clone(for_test=True)

    train_fetch_list = [
        global_lr.name, train_cost.name, train_feas.name, train_label.name
    ]
    test_fetch_list = [test_feas.name]

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
    # Single-trainer GPU runs feed every visible CUDA device; otherwise the
    # train loader feeds only the single `place` chosen above.
    if num_trainers <= 1 and args.use_gpu:
        places = fluid.framework.cuda_places()
    else:
        places = place

    exe.run(startup_prog)

    if checkpoint is not None:
        fluid.load(program=train_prog, model_path=checkpoint, executor=exe)

    if pretrained_model:
        load_params(exe, train_prog, pretrained_model)

    if args.use_gpu:
        devicenum = get_gpu_num()
    else:
        devicenum = int(os.environ.get('CPU_NUM', 1))
    assert (args.train_batch_size % devicenum) == 0
    # Floor division keeps the per-device batch size an int; true division
    # would hand a float batch_size to the loader on Python 3.
    train_batch_size = args.train_batch_size // devicenum
    test_batch_size = args.test_batch_size

    train_loader.set_sample_generator(reader.train(args),
                                      batch_size=train_batch_size,
                                      drop_last=True,
                                      places=places)

    test_loader.set_sample_generator(reader.test(args),
                                     batch_size=test_batch_size,
                                     drop_last=False,
                                     places=place)

    train_exe = fluid.ParallelExecutor(main_program=train_prog,
                                       use_cuda=args.use_gpu,
                                       loss_name=train_cost.name)

    totalruntime = 0
    iter_no = 0
    # Accumulators: [sum of loss, sum of recall@1, batch count].
    train_info = [0, 0, 0]
    # NOTE(review): if the loader yields no batch at all, iter_no never
    # advances and this outer loop does not terminate.
    while iter_no <= args.total_iter_num:
        for train_batch in train_loader():
            # Stop in the middle of an epoch once the iteration budget is
            # used up; without this check training would always run on to
            # the end of the current pass over the data.
            if iter_no > args.total_iter_num:
                break
            t1 = time.time()
            lr, loss, feas, label = train_exe.run(feed=train_batch,
                                                  fetch_list=train_fetch_list)
            t2 = time.time()
            period = t2 - t1
            lr = np.mean(np.array(lr))
            train_info[0] += np.mean(np.array(loss))
            train_info[1] += recall_topk(feas, label, k=1)
            train_info[2] += 1
            if iter_no % args.display_iter_step == 0:
                # totalruntime holds the iterations since the last report;
                # the current iteration's time is added only afterwards.
                avgruntime = totalruntime / args.display_iter_step
                avg_loss = train_info[0] / train_info[2]
                avg_recall = train_info[1] / train_info[2]
                print("[%s] trainbatch %d, lr %.6f, loss %.6f, "\
                    "recall %.4f, time %2.2f sec" % \
                    (fmt_time(), iter_no, lr, avg_loss, avg_recall, avgruntime))
                sys.stdout.flush()
                totalruntime = 0
            # The averaging window is restarted every 1000 iterations,
            # independently of the display interval.
            if iter_no % 1000 == 0:
                train_info = [0, 0, 0]

            totalruntime += period

            if iter_no % args.test_iter_step == 0 and iter_no != 0:
                # Collect features and labels of every test batch, then
                # compute recall@1 over the whole set.
                f, l = [], []
                for batch_id, test_batch in enumerate(test_loader()):
                    t1 = time.time()
                    [feas] = exe.run(test_prog,
                                     feed=test_batch,
                                     fetch_list=test_fetch_list)

                    # Labels are taken from the 'label' entry of the first
                    # feed dict of the batch.
                    label = np.asarray(test_batch[0]['label'])
                    label = np.squeeze(label)
                    f.append(feas)
                    l.append(label)

                    t2 = time.time()
                    period = t2 - t1
                    if batch_id % 20 == 0:
                        print("[%s] testbatch %d, time %2.2f sec" % \
                            (fmt_time(), batch_id, period))

                f = np.vstack(f)
                l = np.hstack(l)
                recall = recall_topk(f, l, k=1)
                print("[%s] test_img_num %d, trainbatch %d, test_recall %.5f" % \
                    (fmt_time(), len(f), iter_no, recall))
                sys.stdout.flush()

            if iter_no % args.save_iter_step == 0 and iter_no != 0:
                model_path = os.path.join(model_save_dir, model_name,
                                          str(iter_no))
                fluid.save(program=train_prog, model_path=model_path)

            iter_no += 1
コード例 #5
0
def train_async(args):
    """Train the classification network via py_reader and evaluate recall@1.

    Builds a training and a test program with ``build_program``, runs the
    startup program, optionally restores persistables from a checkpoint
    and/or loads pretrained variables, then trains with a
    ``fluid.ParallelExecutor`` fed by the py_reader. Running loss/acc1/acc5
    are printed every ``args.display_iter_step`` iterations, the test set is
    evaluated every ``args.test_iter_step`` iterations, and persistables are
    saved every ``args.save_iter_step`` iterations. With ``args.enable_ce``
    the seeds are fixed and KPI lines are printed at the end.

    Args:
        args: parsed options. Attributes read here: ``model``, ``checkpoint``,
            ``pretrained_model``, ``model_save_dir``, ``enable_ce``,
            ``loss_name``, ``use_gpu``, ``train_batch_size``,
            ``test_batch_size``, ``total_iter_num``, ``display_iter_step``,
            ``test_iter_step`` and ``save_iter_step``. ``args`` is also
            forwarded to ``build_program``, ``reader.train`` and
            ``reader.test``.

    Raises:
        AssertionError: if ``args.train_batch_size`` is not divisible by the
            number of devices, or if ``enable_ce`` is set with a model other
            than "ResNet50" or a loss other than "arcmargin".
    """
    # parameters from arguments

    logging.debug('enter train')
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    model_save_dir = args.model_save_dir

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    tmp_prog = fluid.Program()

    # Continuous-evaluation runs pin the configuration and all random seeds.
    if args.enable_ce:
        assert args.model == "ResNet50"
        assert args.loss_name == "arcmargin"
        np.random.seed(0)
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000
        tmp_prog.random_seed = 1000

    train_py_reader, train_cost, train_acc1, train_acc5, global_lr = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)
    test_feas, image, label = build_program(is_train=False,
                                            main_prog=tmp_prog,
                                            startup_prog=startup_prog,
                                            args=args)
    test_prog = tmp_prog.clone(for_test=True)

    train_fetch_list = [
        global_lr.name, train_cost.name, train_acc1.name, train_acc5.name
    ]
    test_fetch_list = [test_feas.name]

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    exe.run(startup_prog)

    logging.debug('after run startup program')

    if checkpoint is not None:
        fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)

    if pretrained_model:

        # Only load variables that have a file of the same name in the
        # pretrained-model directory.
        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(exe,
                           pretrained_model,
                           main_program=train_prog,
                           predicate=if_exist)

    if args.use_gpu:
        devicenum = get_gpu_num()
    else:
        devicenum = int(os.environ.get('CPU_NUM', 1))
    assert (args.train_batch_size % devicenum) == 0
    # The reader is batched with the per-device batch size.
    train_batch_size = args.train_batch_size // devicenum
    test_batch_size = args.test_batch_size

    train_reader = paddle.batch(reader.train(args),
                                batch_size=train_batch_size,
                                drop_last=True)
    test_reader = paddle.batch(reader.test(args),
                               batch_size=test_batch_size,
                               drop_last=False)
    test_feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
    train_py_reader.decorate_paddle_reader(train_reader)

    train_exe = fluid.ParallelExecutor(main_program=train_prog,
                                       use_cuda=args.use_gpu,
                                       loss_name=train_cost.name)

    totalruntime = 0
    train_py_reader.start()
    iter_no = 0
    # Accumulators: [sum of loss, sum of acc1, sum of acc5, batch count].
    train_info = [0, 0, 0, 0]
    # Pre-bind the KPI values: `recall` is only computed once a test pass has
    # run, and `avg_loss` only once the loop body has run, so the final KPI
    # report must not assume either exists.
    avg_loss = None
    recall = None
    while iter_no <= args.total_iter_num:
        t1 = time.time()
        # Input comes from the py_reader, so no feed is passed here.
        lr, loss, acc1, acc5 = train_exe.run(fetch_list=train_fetch_list)
        t2 = time.time()
        period = t2 - t1
        lr = np.mean(np.array(lr))
        train_info[0] += np.mean(np.array(loss))
        train_info[1] += np.mean(np.array(acc1))
        train_info[2] += np.mean(np.array(acc5))
        train_info[3] += 1
        if iter_no % args.display_iter_step == 0:
            # totalruntime holds the iterations since the last report; the
            # current iteration's time is added only afterwards.
            avgruntime = totalruntime / args.display_iter_step
            avg_loss = train_info[0] / train_info[3]
            avg_acc1 = train_info[1] / train_info[3]
            avg_acc5 = train_info[2] / train_info[3]
            print("[%s] trainbatch %d, lr %.6f, loss %.6f, "\
                    "acc1 %.4f, acc5 %.4f, time %2.2f sec" % \
                    (fmt_time(), iter_no, lr, avg_loss, avg_acc1, avg_acc5, avgruntime))
            sys.stdout.flush()
            totalruntime = 0
        # The averaging window is restarted every 1000 iterations,
        # independently of the display interval.
        if iter_no % 1000 == 0:
            train_info = [0, 0, 0, 0]

        totalruntime += period

        if iter_no % args.test_iter_step == 0 and iter_no != 0:
            # Collect features and labels of every test batch, then compute
            # recall@1 over the whole set.
            f, l = [], []
            for batch_id, data in enumerate(test_reader()):
                t1 = time.time()
                [feas] = exe.run(test_prog,
                                 fetch_list=test_fetch_list,
                                 feed=test_feeder.feed(data))
                # NOTE: rebinds `label` (the graph variable handed to the
                # DataFeeder above) to this batch's labels, taken from the
                # second field of each sample.
                label = np.asarray([x[1] for x in data])
                f.append(feas)
                l.append(label)

                t2 = time.time()
                period = t2 - t1
                if batch_id % 20 == 0:
                    print("[%s] testbatch %d, time %2.2f sec" % \
                            (fmt_time(), batch_id, period))

            f = np.vstack(f)
            l = np.hstack(l)
            recall = recall_topk(f, l, k=1)
            print("[%s] test_img_num %d, trainbatch %d, test_recall %.5f" % \
                    (fmt_time(), len(f), iter_no, recall))
            sys.stdout.flush()

        if iter_no % args.save_iter_step == 0 and iter_no != 0:
            model_path = os.path.join(model_save_dir + '/' + model_name,
                                      str(iter_no))
            if not os.path.isdir(model_path):
                os.makedirs(model_path)
            fluid.io.save_persistables(exe,
                                       model_path,
                                       main_program=train_prog)

        iter_no += 1

    # This is for continuous evaluation only
    if args.enable_ce:
        # Use the mean cost/acc for training. Each KPI is reported only when
        # it was actually computed, instead of failing with a NameError
        # after the whole training run.
        if avg_loss is not None:
            print("kpis\ttrain_cost\t{}".format(avg_loss))
        else:
            logging.warning('no training iteration ran; train_cost kpi skipped')
        if recall is not None:
            print("kpis\ttest_recall\t{}".format(recall))
        else:
            logging.warning('no test pass ran; test_recall kpi skipped')
コード例 #6
0
def train_async(args):
    """Train with a classification or metric loss via py_reader and evaluate.

    Builds a training and a test program with ``build_program``, runs the
    startup program, optionally restores persistables from a checkpoint
    and/or loads pretrained variables, then trains with a
    ``fluid.ParallelExecutor`` fed by the py_reader. Training statistics are
    gathered by an ``EvalTrain_Classify`` or ``EvalTrain_Metric`` object
    (chosen from ``args.loss_name``) and printed every
    ``args.display_iter_step`` iterations; a bounded test pass runs every
    ``args.test_iter_step`` iterations; persistables are saved every
    ``args.save_iter_step`` iterations. With ``args.enable_ce`` the seeds are
    fixed and KPI lines are printed at the end.

    Args:
        args: parsed options. Attributes read here: ``model``, ``checkpoint``,
            ``pretrained_model``, ``model_save_dir``, ``enable_ce``,
            ``loss_name``, ``with_mem_opt``, ``use_gpu``,
            ``train_batch_size``, ``test_batch_size``, ``total_iter_num``,
            ``display_iter_step``, ``test_iter_step`` and ``save_iter_step``.
            ``args`` is also forwarded to ``build_program``, ``reader.train``
            and ``reader.test``.

    Raises:
        AssertionError: if ``args.train_batch_size`` is not divisible by the
            number of devices, or if ``enable_ce`` is set with a model other
            than "ResNet50" or a loss other than "arcmargin".
    """
    # parameters from arguments

    logging.debug('enter train')
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    model_save_dir = args.model_save_dir

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    tmp_prog = fluid.Program()

    # For testing: fix the random seeds.
    if args.enable_ce:
        assert args.model == "ResNet50"
        assert args.loss_name == "arcmargin"
        np.random.seed(0)
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000
        tmp_prog.random_seed = 1000

    trainclassify = args.loss_name in ["softmax", "arcmargin"]
    # NOTE(review): net_config_classify is passed for both loss families,
    # although the non-classification branch below unpacks features/labels
    # -- confirm build_program handles this.
    train_py_reader, outputvars = build_program(is_train=True,
                                                net_config=net_config_classify,
                                                main_prog=train_prog,
                                                startup_prog=startup_prog,
                                                args=args)

    # In both branches the fetch list starts with [learning rate, cost].
    if trainclassify:
        train_cost, train_acc1, train_acc5, global_lr = outputvars
        train_fetch_list = [
            global_lr.name, train_cost.name, train_acc1.name, train_acc5.name
        ]
        evaltrain = EvalTrain_Classify()

    else:
        train_cost, train_feas, train_label, global_lr = outputvars
        train_fetch_list = [
            global_lr.name, train_cost.name, train_feas.name, train_label.name
        ]
        evaltrain = EvalTrain_Metric()

    _, outputvars = build_program(is_train=False,
                                  net_config=net_config_test,
                                  main_prog=tmp_prog,
                                  startup_prog=startup_prog,
                                  args=args)
    test_feas, image, label = outputvars
    test_prog = tmp_prog.clone(for_test=True)

    test_fetch_list = [test_feas.name]

    # Enable memory optimization to save GPU memory. Fetched variables must
    # be listed in skip_opt_set, otherwise the optimization may overwrite
    # them.
    if args.with_mem_opt:
        fluid.memory_optimize(train_prog, skip_opt_set=set(train_fetch_list))

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    # Initialize variables.
    exe.run(startup_prog)

    logging.debug('after run startup program')

    # Resume from a checkpoint.
    if checkpoint is not None:
        fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)

    # Load pretrained parameters into the network. When a pretrained model is
    # used, the final fc layer has to be renamed, or the fc weight files have
    # to be removed from the pretrained model.
    if pretrained_model:

        # Only load variables that have a file of the same name in the
        # pretrained-model directory.
        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(exe,
                           pretrained_model,
                           main_program=train_prog,
                           predicate=if_exist)

    # Number of devices (GPU cards, or CPUs) the batch is split across.
    if args.use_gpu:
        devicenum = get_gpu_num()
    else:
        devicenum = get_cpu_num()
    assert (args.train_batch_size % devicenum) == 0
    # Note: the batch size given to the py_reader is the per-card batch
    # size, hence the division.
    train_batch_size = args.train_batch_size // devicenum
    test_batch_size = args.test_batch_size

    logging.debug('device number is %d, batch on each card:%d', devicenum,
                  train_batch_size)

    # Batch the samples of the input readers, and attach train_reader to the
    # py_reader; the py_reader's own thread pulls the data, so it is not
    # called from the main thread.
    train_reader = paddle.batch(reader.train(args),
                                batch_size=train_batch_size,
                                drop_last=True)
    test_reader = paddle.batch(reader.test(args),
                               batch_size=test_batch_size,
                               drop_last=False)
    test_feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
    train_py_reader.decorate_paddle_reader(train_reader)

    # Use ParallelExecutor for multi-card training.
    train_exe = fluid.ParallelExecutor(main_program=train_prog,
                                       use_cuda=args.use_gpu,
                                       loss_name=train_cost.name)

    totalruntime = 0
    # Running mean of the training cost, kept for the CE report at the end.
    # It is reset together with `evaltrain`.
    loss_sum = 0
    loss_count = 0
    avg_loss = None
    # Stays None until a test pass has produced a result.
    test_recall = None
    # Start the py_reader's reading thread.
    train_py_reader.start()
    iter_no = 0
    while iter_no <= args.total_iter_num:
        t1 = time.time()
        # With asynchronous py_reader input no feed argument is needed.
        outputlist = train_exe.run(fetch_list=train_fetch_list)
        t2 = time.time()
        period = t2 - t1

        evaltrain.pushdata(outputlist)
        # Index 1 of the fetch list is train_cost in both branches above.
        loss_sum += np.mean(np.array(outputlist[1]))
        loss_count += 1

        # Report the statistics averaged over multiple batches.
        if iter_no % args.display_iter_step == 0:
            # totalruntime holds the iterations since the last report; the
            # current iteration's time is added only afterwards.
            avgruntime = totalruntime / args.display_iter_step
            train_accuracy = evaltrain.getaccuracy()
            avg_loss = loss_sum / loss_count

            print("[%s] trainbatch %d, "\
                    "accuracy[%s], time %2.2f sec" % \
                    (fmt_time(), iter_no, train_accuracy, avgruntime))
            sys.stdout.flush()
            totalruntime = 0
        # The averaging window is restarted every 1000 iterations,
        # independently of the display interval.
        if iter_no % 1000 == 0:
            evaltrain.reset()
            loss_sum = 0
            loss_count = 0

        totalruntime += period

        # Iteration 0 is evaluated too when starting from existing weights.
        if iter_no % args.test_iter_step == 0 and (pretrained_model or
                                                   checkpoint or iter_no != 0):
            # Collect the features and labels of multiple batches.
            evaltest = EvalTest()
            max_test_count = 100
            for batch_id, data in enumerate(test_reader()):
                t1 = time.time()
                test_outputlist = exe.run(test_prog,
                                          fetch_list=test_fetch_list,
                                          feed=test_feeder.feed(data))

                # NOTE: rebinds `label` (the graph variable handed to the
                # DataFeeder above) to this batch's labels, taken from the
                # second field of each sample.
                label = np.asarray([x[1] for x in data])

                evaltest.pushdata((test_outputlist[0], label))
                t2 = time.time()
                period = t2 - t1
                if batch_id % 20 == 0:
                    print("[%s] testbatch %d, time %2.2f sec" % \
                            (fmt_time(), batch_id, period))
                # Bound the evaluation cost by stopping after a fixed number
                # of batches.
                if batch_id > max_test_count:
                    break
            # Evaluate retrieval accuracy: a retrieval is correct when the
            # query and the retrieved result share a class. (The test classes
            # do not overlap the training classes, so the network's class
            # outputs carry no meaning here.)
            test_recall = evaltest.getaccuracy()
            # NOTE: test_img_num is the nominal cap, not the number of images
            # actually evaluated.
            print("[%s] test_img_num %d, trainbatch %d, testaccarcy %s" % \
                    (fmt_time(), max_test_count * args.test_batch_size, iter_no, test_recall))
            sys.stdout.flush()

        if iter_no % args.save_iter_step == 0 and iter_no != 0:
            model_path = os.path.join(model_save_dir + '/' + model_name,
                                      str(iter_no))
            if not os.path.isdir(model_path):
                os.makedirs(model_path)
            # Save the model; it can be used to resume training.
            fluid.io.save_persistables(exe,
                                       model_path,
                                       main_program=train_prog)

        iter_no += 1

    # This is for continuous evaluation only
    if args.enable_ce:
        # Report the running mean training cost and the latest test result.
        # The previous code printed `avg_loss` and `recall`, neither of which
        # was ever assigned in this function, so enabling CE always ended in
        # a NameError. Each KPI is printed only when it was computed.
        if avg_loss is not None:
            print("kpis train_cost      %s" % (avg_loss))
        else:
            logging.warning('no training iteration ran; train_cost kpi skipped')
        if test_recall is not None:
            print("kpis test_recall     %s" % (test_recall))
        else:
            logging.warning('no test pass ran; test_recall kpi skipped')