Example #1
    def test_save_load_same_result(self):
        x = np.random.randn(30, 10, 32).astype('float32')
        weight = np.random.randn(32, 64).astype('float32')
        with fluid.dygraph.guard(place):
            dygraph_result = simple_func(x, weight)

        main_program, startup_program, inputs, outputs = decorated_simple_func(
            x, weight)
        exe = fluid.Executor(place)
        exe.run(startup_program)
        fluid.save(main_program, "./test_dy2stat_save_load")

        # set vars to zero so that we can test load in same file
        for var in main_program.list_vars():
            if isinstance(var, framework.Parameter) or var.persistable:
                tensor = fluid.global_scope().find_var(var.name).get_tensor()
                tensor.set(np.zeros_like(np.array(tensor)), place)

                # make sure all the parameter and optimizer vars have been set to zero
                tensor_np = np.array(fluid.global_scope().find_var(
                    var.name).get_tensor())
                self.assertEqual(0, np.sum(np.abs(tensor_np)))

        fluid.load(main_program, "./test_dy2stat_save_load")
        static_result = exe.run(main_program,
                                feed={inputs[0].name: x},
                                fetch_list=outputs)
        self.assertTrue(np.allclose(dygraph_result.numpy(), static_result))
Example #2
def checkpoints(epoch, cfg, trainer, name):
    output_path = os.path.join(cfg.output, 'checkpoints', str(epoch))
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    fluid.save(trainer.program, os.path.join(output_path, name))
    print('save checkpoints {} to {}'.format(name, output_path))
    sys.stdout.flush()
Example #3
    def save_model(self, save_dir):
        if not osp.isdir(save_dir):
            if osp.exists(save_dir):
                os.remove(save_dir)
            os.makedirs(save_dir)
        if self.train_prog is not None:
            fluid.save(self.train_prog, osp.join(save_dir, 'model'))
        else:
            fluid.save(self.test_prog, osp.join(save_dir, 'model'))
        model_info = self.get_model_info()
        model_info['status'] = self.status
        with open(osp.join(save_dir, 'model.yml'), encoding='utf-8',
                  mode='w') as f:
            yaml.dump(model_info, f)
        # save evaluation results
        if hasattr(self, 'eval_details'):
            with open(osp.join(save_dir, 'eval_details.json'), 'w') as f:
                json.dump(self.eval_details, f)

        if self.status == 'Prune':
            # save the pruned parameter shapes
            shapes = {}
            for block in self.train_prog.blocks:
                for param in block.all_parameters():
                    pd_var = fluid.global_scope().find_var(param.name)
                    pd_param = pd_var.get_tensor()
                    shapes[param.name] = np.array(pd_param).shape
            with open(osp.join(save_dir, 'prune.yml'),
                      encoding='utf-8',
                      mode='w') as f:
                yaml.dump(shapes, f)

        # flag indicating the model was saved successfully
        open(osp.join(save_dir, '.success'), 'w').close()
        logging.info("Model saved in {}.".format(save_dir))
Example #4
def train_loop(args, train_program, reader, data_loader, loss, trainer_id,
               weight):

    data_loader.set_batch_generator(
        convert_python_to_tensor(weight, args.batch_size, reader.train()))

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    print("CPU_NUM:" + str(os.getenv("CPU_NUM")))

    train_exe = exe

    for pass_id in range(args.num_passes):
        data_loader.start()
        time.sleep(10)
        epoch_start = time.time()
        batch_id = 0
        start = time.time()
        try:
            while True:

                loss_val = train_exe.run(fetch_list=[loss.name])
                loss_val = np.mean(loss_val)

                if batch_id % args.print_batch == 0:
                    logger.info(
                        "TRAIN --> pass: {} batch: {} loss: {} reader queue:{}"
                        .format(pass_id, batch_id, loss_val.mean(),
                                data_loader.queue.size()))
                if args.with_speed:
                    if batch_id % 500 == 0 and batch_id != 0:
                        elapsed = (time.time() - start)
                        start = time.time()
                        samples = 1001 * args.batch_size * int(
                            os.getenv("CPU_NUM"))
                        logger.info("Time used: {}, Samples/Sec: {}".format(
                            elapsed, samples / elapsed))

                if batch_id % args.save_step == 0 and batch_id != 0:
                    model_dir = args.model_output_dir + '/pass-' + str(
                        pass_id) + ('/batch-' + str(batch_id))
                    if trainer_id == 0:
                        fluid.save(fluid.default_main_program(),
                                   model_path=model_dir)
                        print("model saved in %s" % model_dir)
                batch_id += 1

        except fluid.core.EOFException:
            data_loader.reset()
            epoch_end = time.time()
            logger.info("Epoch: {0}, Train total expend: {1} ".format(
                pass_id, epoch_end - epoch_start))
            model_dir = args.model_output_dir + '/pass-' + str(pass_id)
            if trainer_id == 0:
                fluid.save(fluid.default_main_program(), model_path=model_dir)
                print("model saved in %s" % model_dir)
Example #5
def save_model(exe, program, save_dir, model_name, postfix=''):
    """save paramters and optimizer related varaibles"""
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    saved_model_name = model_name + postfix
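    # note: fluid.save below typically writes <saved_model_name>.pdparams, .pdopt
    # and .pdmodel files under save_dir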

    fluid.save(program, os.path.join(save_dir, saved_model_name))

    return
Example #6
def save_model(program, model_path, epoch_id, prefix='ppcls'):
    """
    save model to the target path
    """
    model_path = os.path.join(model_path, str(epoch_id))
    _mkdir_if_not_exist(model_path)
    model_prefix = os.path.join(model_path, prefix)
    fluid.save(program, model_prefix)
    logger.info("Already save model in {}".format(model_path))
Example #7
def train_loop(main_program, avg_cost, acc, train_input_data, place, args,
               train_reader):
    data_list = [var.name for var in train_input_data]
    feeder = fluid.DataFeeder(feed_list=data_list, place=place)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    train_exe = exe

    total_time = 0.0
    ce_info = []
    for pass_id in range(args.epochs):
        epoch_idx = pass_id + 1
        print("epoch_%d start" % epoch_idx)
        t0 = time.time()
        i = 0
        for batch_id, data in enumerate(train_reader()):
            i += 1
            loss_val, correct_val = train_exe.run(
                feed=feeder.feed(data), fetch_list=[avg_cost.name, acc.name])
            ce_info.append(float(np.mean(correct_val)) / args.batch_size)
            if i % args.print_batch == 0:
                logger.info(
                    "Train --> pass: {} batch_id: {} avg_cost: {}, acc: {}".
                    format(pass_id, batch_id, np.mean(loss_val),
                           float(np.mean(correct_val)) / args.batch_size))
            if args.enable_ce and i > args.step_num:
                break
        t1 = time.time()
        total_time += t1 - t0
        print("epoch:%d num_steps:%d time_cost(s):%f" %
              (epoch_idx, i, total_time / epoch_idx))
        save_dir = "%s/epoch_%d" % (args.model_dir, epoch_idx)
        fluid.save(fluid.default_main_program(), save_dir)
        print("model saved in %s" % save_dir)

    # only for ce
    if args.enable_ce:
        ce_acc = 0
        try:
            ce_acc = ce_info[-2]
        except:
            print("ce info error")
        epoch_idx = args.epochs
        device = get_device(args)
        if args.use_cuda:
            gpu_num = device[1]
            print("kpis\teach_pass_duration_gpu%s\t%s" %
                  (gpu_num, total_time / epoch_idx))
            print("kpis\ttrain_acc_gpu%s\t%s" % (gpu_num, ce_acc))
        else:
            cpu_num = device[1]
            threads_num = device[2]
            print("kpis\teach_pass_duration_cpu%s_thread%s\t%s" %
                  (cpu_num, threads_num, total_time / epoch_idx))
            print("kpis\ttrain_acc_cpu%s_thread%s\t%s" %
                  (cpu_num, threads_num, ce_acc))
Example #8
def save(exe, prog, path):
    """
    Save model to the given path.
    Args:
        exe (fluid.Executor): The fluid.Executor object.
        prog (fluid.Program): the Program whose weights will be saved.
        path (string): the path to save the model to.
    """
    if os.path.isdir(path):
        shutil.rmtree(path)
    logger.info('Save model to {}.'.format(path))
    fluid.save(prog, path)
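
A minimal usage sketch for the helper above; the toy network, the demo_save name and the "./simple_net_ckpt" path are illustrative, not taken from the original source.

import paddle.fluid as fluid

def demo_save():
    # build a tiny static program whose parameters we can persist
    main_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(main_prog, startup_prog):
        x = fluid.data(name='x', shape=[None, 4], dtype='float32')
        pred = fluid.layers.fc(input=x, size=2)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(startup_prog)

    # save() above removes any existing directory at the path, then delegates to fluid.save
    save(exe, main_prog, './simple_net_ckpt')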
Example #9
def save_checkpoint(program, ckpt_name):
    """
    Save checkpoint for evaluation or resume training
    """
    ckpt_dir = os.path.join(cfg.TRAIN.MODEL_SAVE_DIR, str(ckpt_name))
    print("Save model checkpoint to {}".format(ckpt_dir))
    if not os.path.isdir(ckpt_dir):
        os.makedirs(ckpt_dir)

    fluid.save(program, os.path.join(ckpt_dir, 'model'))

    return ckpt_dir
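
For resuming training, the counterpart of the checkpoint above would be fluid.load on the same "model" prefix. A minimal sketch, assuming the same imports (os, fluid) and cfg object as the example above and that the program passed in matches the one that was saved; resume_checkpoint is a hypothetical helper name, not part of the original code.

def resume_checkpoint(exe, program, ckpt_name):
    """Restore the persistable variables written by save_checkpoint."""
    ckpt_dir = os.path.join(cfg.TRAIN.MODEL_SAVE_DIR, str(ckpt_name))
    # fluid.load reads the .pdparams/.pdopt files produced by fluid.save
    fluid.load(program, os.path.join(ckpt_dir, 'model'), exe)
    return ckpt_dir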
Example #10
def train():
    args = parse_args()
    # add ce
    if args.enable_ce:
        SEED = 102
        fluid.default_main_program().random_seed = SEED
        fluid.default_startup_program().random_seed = SEED

    print(args)
    if not os.path.isdir(args.model_output_dir):
        os.mkdir(args.model_output_dir)

    loss, auc, data_list, auc_states = eval('network_conf.' + args.model_name)(
        args.embedding_size, args.num_field, args.num_feat,
        args.layer_sizes_dnn, args.act, args.reg, args.layer_sizes_cin)
    optimizer = fluid.optimizer.SGD(
        learning_rate=args.lr,
        regularization=fluid.regularizer.L2DecayRegularizer(args.reg))
    optimizer.minimize(loss)

    dataset = fluid.DatasetFactory().create_dataset()
    dataset.set_use_var(data_list)
    dataset.set_pipe_command('python criteo_reader.py')
    dataset.set_batch_size(args.batch_size)
    dataset.set_filelist([
        os.path.join(args.train_data_dir, x)
        for x in os.listdir(args.train_data_dir)
    ])

    if args.use_gpu == 1:
        exe = fluid.Executor(fluid.CUDAPlace(0))
        dataset.set_thread(1)
    else:
        exe = fluid.Executor(fluid.CPUPlace())
        dataset.set_thread(args.num_thread)
    exe.run(fluid.default_startup_program())

    for epoch_id in range(args.num_epoch):
        start = time.time()
        sys.stderr.write('\nepoch%d start ...\n' % (epoch_id + 1))
        exe.train_from_dataset(program=fluid.default_main_program(),
                               dataset=dataset,
                               fetch_list=[loss, auc],
                               fetch_info=['loss', 'auc'],
                               debug=False,
                               print_period=args.print_steps)
        model_dir = os.path.join(args.model_output_dir,
                                 'epoch_' + str(epoch_id + 1), "checkpoint")
        sys.stderr.write('epoch%d is finished and takes %f s\n' %
                         ((epoch_id + 1), time.time() - start))
        fluid.save(fluid.default_main_program(), model_dir)
Example #11
def save_checkpoint(program, ckpt_name, epoch=0):
    """
    Save checkpoint for evaluation or resume training
    """
    ckpt_dir = os.path.join(cfg.TRAIN.MODEL_SAVE_DIR, str(ckpt_name))
    print("Save model checkpoint to {}".format(ckpt_dir))
    if not os.path.isdir(ckpt_dir):
        os.makedirs(ckpt_dir)
    epoch_file = os.path.join(ckpt_dir, 'epoch.txt')
    with open(epoch_file, 'w') as f:
        f.write('{}'.format(epoch))
    fluid.save(program, os.path.join(ckpt_dir, 'model'))

    return ckpt_dir
Example #12
    def train_loop(main_program):
        """ train network """
        start_time = time.time()
        dataset = fluid.DatasetFactory().create_dataset()
        dataset.set_use_var(dcn_model.data_list)
        pipe_command = 'python reader.py {}'.format(args.vocab_dir)
        dataset.set_pipe_command(pipe_command)
        dataset.set_batch_size(args.batch_size)
        dataset.set_thread(args.num_thread)
        train_filelist = [
            os.path.join(args.train_data_dir, fname)
            for fname in next(os.walk(args.train_data_dir))[2]
        ]
        dataset.set_filelist(train_filelist)

        if args.use_gpu == 1:
            exe = fluid.Executor(fluid.CUDAPlace(0))
            dataset.set_thread(1)
        else:
            exe = fluid.Executor(fluid.CPUPlace())
            dataset.set_thread(args.num_thread)
        exe.run(fluid.default_startup_program())

        for epoch_id in range(args.num_epoch):
            start = time.time()
            sys.stderr.write('\nepoch%d start ...\n' % (epoch_id + 1))
            exe.train_from_dataset(
                program=main_program,
                dataset=dataset,
                fetch_list=[
                    dcn_model.loss, dcn_model.avg_logloss, dcn_model.auc_var
                ],
                fetch_info=['total_loss', 'avg_logloss', 'auc'],
                debug=False,
                print_period=args.print_steps)
            model_dir = os.path.join(args.model_output_dir,
                                     'epoch_' + str(epoch_id + 1),
                                     "checkpoint")
            sys.stderr.write('epoch%d is finished and takes %f s\n' %
                             ((epoch_id + 1), time.time() - start))
            if args.trainer_id == 0:  # only trainer 0 save model
                print("save model in {}".format(model_dir))
                fluid.save(main_program, model_dir)

        print("train time cost {:.4f}".format(time.time() - start_time))
        print("finish training")
Example #13
def save_model(exe,
               program,
               save_dir,
               model_name,
               postfix='',
               save_type='.pdckpt'):
    """
    save_type: '.pdckpt' or '.pdparams', '.pdckpt' for all persistable variables,
               '.pdparams' for parameters only
    """
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    saved_model_name = model_name + postfix

    fluid.save(program, os.path.join(save_dir, saved_model_name))

    return
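
The docstring above distinguishes '.pdckpt' (all persistable variables) from '.pdparams' (parameters only), but the body ignores save_type. A hedged sketch of how that branch might look, assuming the same os and fluid imports as the example above, using fluid.save for the full checkpoint and fluid.io.save_params for the parameters-only case; save_model_by_type is an illustrative name, not from the original source.

def save_model_by_type(exe, program, save_dir, model_name,
                       postfix='', save_type='.pdckpt'):
    """Variant sketch that actually honours save_type."""
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    saved_model_name = model_name + postfix
    if save_type == '.pdckpt':
        # parameters plus optimizer state (all persistable variables)
        fluid.save(program, os.path.join(save_dir, saved_model_name))
    elif save_type == '.pdparams':
        # parameters only
        fluid.io.save_params(exe,
                             dirname=os.path.join(save_dir, saved_model_name),
                             main_program=program)
    else:
        raise ValueError("unknown save_type: {}".format(save_type))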
Example #14
def save_model(exe, program, save_dir, model_name, postfix=''):
    """save_model"""
    #model_path = os.path.join(save_dir, model_name + postfix)
    #if os.path.isdir(model_path):
    #    shutil.rmtree(model_path)
    ##fluid.io.save_persistables(exe, model_path, main_program=program)
    #save_vars = [x for x in program.list_vars() \
    #             if isinstance(x, fluid.framework.Parameter)]
    #fluid.io.save_vars(exe, dirname=model_path, main_program=program, vars=save_vars, filename="param")

    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    saved_model_name = model_name + postfix

    fluid.save(program, os.path.join(save_dir, saved_model_name))

    return
Example #15
def local_train(args):
    # build the CTR model network
    ctr_model = CTR()
    inputs = ctr_model.input_data(args)
    avg_cost, auc_var = ctr_model.net(inputs, args)

    # choose the optimizer for the backward update
    optimizer = fluid.optimizer.Adam(args.learning_rate)
    optimizer.minimize(avg_cost)

    # create the training executor on CPU and initialize parameters
    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())

    # load the training data reader and the list of training files
    dataset, file_list = get_dataset(inputs, args)

    logger.info("Training Begin")
    for epoch in range(args.epochs):
        # shuffle at file granularity
        random.shuffle(file_list)
        dataset.set_filelist(file_list)

        # use train_from_dataset for multi-threaded concurrent training
        start_time = time.time()
        exe.train_from_dataset(program=fluid.default_main_program(),
                               dataset=dataset,
                               fetch_list=[auc_var],
                               fetch_info=["Epoch {} auc ".format(epoch)],
                               print_period=100,
                               debug=False)
        end_time = time.time()
        logger.info("epoch %d finished, use time=%d\n" %
                    ((epoch), end_time - start_time))

        if args.save_model:
            model_path = os.path.join(str(args.model_path),
                                      "epoch_" + str(epoch))
            if not os.path.isdir(model_path):
                os.mkdir(model_path)
            fluid.save(fluid.default_main_program(),
                       os.path.join(model_path, "checkpoint"))

    logger.info("Train Success!")
Example #16
    def train_loop(main_program):
        """ train network """
        start_time = time.time()
        dataset = fluid.DatasetFactory().create_dataset()
        dataset.set_use_var(data_list)
        pipe_command = 'python criteo_reader.py {}'.format(args.feat_dict)
        dataset.set_pipe_command(pipe_command)
        dataset.set_batch_size(args.batch_size)
        dataset.set_thread(args.num_thread)
        train_filelist = [
            os.path.join(args.train_data_dir, x)
            for x in os.listdir(args.train_data_dir)
        ]

        if args.use_gpu == 1:
            exe = fluid.Executor(fluid.CUDAPlace(0))
            dataset.set_thread(1)
        else:
            exe = fluid.Executor(fluid.CPUPlace())
            dataset.set_thread(args.num_thread)
        exe.run(fluid.default_startup_program())

        for epoch_id in range(args.num_epoch):
            start = time.time()
            sys.stderr.write('\nepoch%d start ...\n' % (epoch_id + 1))
            dataset.set_filelist(train_filelist)
            exe.train_from_dataset(
                program=main_program,
                dataset=dataset,
                fetch_list=[loss, auc],
                fetch_info=['epoch %d batch loss' % (epoch_id + 1), "auc"],
                print_period=5,
                debug=False)
            model_dir = os.path.join(args.model_output_dir,
                                     'epoch_' + str(epoch_id + 1))
            sys.stderr.write('epoch%d is finished and takes %f s\n' % (
                (epoch_id + 1), time.time() - start))
            if args.trainer_id == 0:  # only trainer 0 save model
                print("save model in {}".format(model_dir))
                fluid.save(main_program, model_dir)

        print("train time cost {:.4f}".format(time.time() - start_time))
        print("finish training")
Example #17
    def save_model(self, save_dir):
        if not osp.isdir(save_dir):
            if osp.exists(save_dir):
                os.remove(save_dir)
            os.makedirs(save_dir)
        model_info = self.get_model_info()

        if self.status == 'Normal':
            fluid.save(self.train_prog, osp.join(save_dir, 'model'))
            save_infer_program(self.test_prog, save_dir)

        model_info['status'] = self.status
        with open(osp.join(save_dir, 'model.yml'), encoding='utf-8',
                  mode='w') as f:
            yaml.dump(model_info, f)

        # The flag of model for saving successfully
        open(osp.join(save_dir, '.success'), 'w').close()
        logging.info("Model saved in {}.".format(save_dir))
Example #18
def train():
    if cfg.train_model == 'dnn':
        model = DNN()

    inputs = model.input_data()
    avg_cost, auc_var = model.net(inputs)

    optimizer = fluid.optimizer.Adam(cfg.learning_rate)
    optimizer.minimize(avg_cost)

    place = fluid.CUDAPlace(0) if cfg.use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    dataset, file_list = get_dataset(inputs)

    logger.info("Training Begin")
    for epoch in range(cfg.epoches):
        random.shuffle(file_list)
        dataset.set_filelist(file_list)

        start_time = time.time()
        exe.train_from_dataset(
            program=fluid.default_main_program(),
            dataset=dataset,
            fetch_list=[avg_cost, auc_var],
            fetch_info=['Epoch {} cost: '.format(epoch + 1), ' - auc: '],
            print_period=cfg.log_interval,
            debug=False)
        end_time = time.time()
        logger.info("epoch %d finished, use time = %ds \n" %
                    ((epoch + 1), end_time - start_time))

        if (epoch + 1) % cfg.save_interval == 0:
            model_path = os.path.join(str(cfg.save_path), model.name,
                                      model.name + "_epoch_" + str(epoch + 1))
            if not os.path.isdir(model_path):
                os.makedirs(model_path)
            logger.info("saving model to %s \n" % (model_path))
            fluid.save(fluid.default_main_program(),
                       os.path.join(model_path, "checkpoint"))
    logger.info("Done.")
Example #19
    def save_model(self, save_dir):
        if not osp.isdir(save_dir):
            if osp.exists(save_dir):
                os.remove(save_dir)
            os.makedirs(save_dir)
        model_info = self.get_model_info()

        if self.status == 'Normal':
            fluid.save(self.train_prog, osp.join(save_dir, 'model'))
        elif self.status == 'Quant':
            float_prog, _ = slim.quant.convert(self.test_prog,
                                               self.exe.place,
                                               save_int8=True)
            test_input_names = [
                var.name for var in list(self.test_inputs.values())
            ]
            test_outputs = list(self.test_outputs.values())
            fluid.io.save_inference_model(dirname=save_dir,
                                          executor=self.exe,
                                          params_filename='__params__',
                                          feeded_var_names=test_input_names,
                                          target_vars=test_outputs,
                                          main_program=float_prog)

            model_info['_ModelInputsOutputs'] = dict()
            model_info['_ModelInputsOutputs']['test_inputs'] = [
                [k, v.name] for k, v in self.test_inputs.items()
            ]
            model_info['_ModelInputsOutputs']['test_outputs'] = [
                [k, v.name] for k, v in self.test_outputs.items()
            ]

        model_info['status'] = self.status
        with open(osp.join(save_dir, 'model.yml'), encoding='utf-8',
                  mode='w') as f:
            yaml.dump(model_info, f)

        # The flag of model for saving successfully
        open(osp.join(save_dir, '.success'), 'w').close()
        logging.info("Model saved in {}.".format(save_dir))
Example #20
    def test_ptb_rnn_cpu_bfloat16(self):
        seed = 90
        hidden_size = 10
        vocab_size = 500
        num_layers = 1
        num_steps = 3
        init_scale = 0.1
        batch_size = 4
        batch_num = 100

        with new_program_scope():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            ptb_model = PtbModel("ptb_model",
                                 hidden_size=hidden_size,
                                 vocab_size=vocab_size,
                                 num_layers=num_layers,
                                 num_steps=num_steps,
                                 init_scale=init_scale)

            place = self.set_place()
            exe = fluid.Executor(place)
            sgd = SGDOptimizer(learning_rate=1e-3)
            x = fluid.layers.data(name="x",
                                  shape=[-1, num_steps],
                                  dtype='int64')
            y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
            init_hidden = fluid.layers.data(name="init_hidden",
                                            shape=[1],
                                            dtype='float32')
            init_cell = fluid.layers.data(name="init_cell",
                                          shape=[1],
                                          dtype='float32')

            static_loss, static_last_hidden, static_last_cell = ptb_model(
                x, y, init_hidden, init_cell)

            sgd = paddle.static.amp.bf16.decorate_bf16(
                sgd,
                amp_lists=paddle.static.amp.bf16.AutoMixedPrecisionListsBF16(
                    custom_fp32_list={'transpose2', 'concat'}),
                use_bf16_guard=False,
                use_pure_bf16=True)

            sgd.minimize(static_loss, framework.default_startup_program())
            out = exe.run(framework.default_startup_program())

            for i in range(batch_num):
                x_data = np.arange(12).reshape(4, 3).astype('int64')
                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, 1))
                # TODO: investigate initializing the model with "float32" instead of "uint16",
                # as it was before the slice_op PR (datatypes in the model graph differ from
                # the runtime datatypes because of that)
                init_hidden_data = np.zeros(
                    (num_layers, batch_size, hidden_size), dtype='uint16')
                init_cell_data = np.zeros(
                    (num_layers, batch_size, hidden_size), dtype='uint16')

                fetch_list = [
                    static_loss, static_last_hidden, static_last_cell
                ]
                out = exe.run(fluid.default_main_program(),
                              feed={
                                  "x": x_data,
                                  "y": y_data,
                                  "init_hidden": init_hidden_data,
                                  "init_cell": init_cell_data
                              },
                              fetch_list=fetch_list)

            # get value before save
            main_program = framework.default_main_program()
            base_map = {}
            for var in main_program.list_vars():
                if isinstance(var, framework.Parameter) or var.persistable:
                    t = np.array(fluid.global_scope().find_var(
                        var.name).get_tensor())
                    # make sure all the parameter and optimizer vars have been updated
                    self.assertTrue(np.sum(np.abs(t)) != 0)
                    base_map[var.name] = t

            fluid.save(main_program, "./test_1")

            # set var to zero
            for var in main_program.list_vars():
                if isinstance(var, framework.Parameter) or var.persistable:
                    ten = fluid.global_scope().find_var(var.name).get_tensor()
                    ten.set(np.zeros_like(np.array(ten)), place)

                    new_t = np.array(fluid.global_scope().find_var(
                        var.name).get_tensor())
                    # make sure all the parameter and optimizer vars have been set to zero
                    self.assertTrue(np.sum(np.abs(new_t)) == 0)

            fluid.load(main_program, "./test_1.pdparams", exe)

            for var in main_program.list_vars():
                if isinstance(var, framework.Parameter) or var.persistable:
                    new_t = np.array(fluid.global_scope().find_var(
                        var.name).get_tensor())
                    base_t = base_map[var.name]
                    self.assertTrue(np.array_equal(new_t, base_t))
Example #21
def save_model(program, model_path):
    """
    save model to the target path
    """
    fluid.save(program, model_path)
    logger.info("Already save model in {}".format(model_path))
Example #22
def main(args):
    """
    Main Function
    """
    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
    else:
        place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    task_name = args.task_name.lower()
    processor = reader.EmoTectProcessor(data_dir=args.data_dir,
                                        vocab_path=args.vocab_path,
                                        random_seed=args.random_seed)
    #num_labels = len(processor.get_labels())
    num_labels = args.num_labels

    if not (args.do_train or args.do_val or args.do_infer):
        raise ValueError("For args `do_train`, `do_val` and `do_infer`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = processor.data_generator(
            batch_size=args.batch_size, phase='train', epoch=args.epoch)

        num_train_examples = processor.get_num_examples(phase="train")
        max_train_steps = args.epoch * num_train_examples // args.batch_size + 1

        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)

        train_program = fluid.Program()
        if args.random_seed is not None:
            train_program.random_seed = args.random_seed

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_loader, loss, accuracy, num_seqs = create_model(
                    args, num_labels=num_labels, is_prediction=False)

                sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=args.lr)
                sgd_optimizer.minimize(loss)

        if args.verbose:
            lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val:
        if args.do_train:
            test_data_generator = processor.data_generator(
                batch_size=args.batch_size, phase='dev', epoch=1)
        else:
            test_data_generator = processor.data_generator(
                batch_size=args.batch_size, phase='test', epoch=1)

        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_loader, loss, accuracy, num_seqs = create_model(
                    args, num_labels=num_labels, is_prediction=False)
        test_prog = test_prog.clone(for_test=True)

    if args.do_infer:
        infer_data_generator = processor.data_generator(
            batch_size=args.batch_size, phase='infer', epoch=1)

        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                infer_loader, probs, _ = create_model(args,
                                                      num_labels=num_labels,
                                                      is_prediction=True)
        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint:
            utils.init_checkpoint(exe,
                                  args.init_checkpoint,
                                  main_program=startup_prog)
    elif args.do_val or args.do_infer:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or infer!")
        utils.init_checkpoint(exe,
                              args.init_checkpoint,
                              main_program=test_prog)

    if args.do_train:
        train_exe = exe
        train_loader.set_sample_list_generator(train_data_generator)
    else:
        train_exe = None
    if args.do_val:
        test_exe = exe
        test_loader.set_sample_list_generator(test_data_generator)
    if args.do_infer:
        test_exe = exe
        infer_loader.set_sample_list_generator(infer_data_generator)

    if args.do_train:
        train_loader.start()
        steps = 0
        total_cost, total_acc, total_num_seqs = [], [], []
        time_begin = time.time()
        ce_info = []
        while True:
            try:
                steps += 1
                if steps % args.skip_steps == 0:
                    fetch_list = [loss.name, accuracy.name, num_seqs.name]
                else:
                    fetch_list = []

                outputs = train_exe.run(program=train_program,
                                        fetch_list=fetch_list,
                                        return_numpy=False)
                if steps % args.skip_steps == 0:
                    np_loss, np_acc, np_num_seqs = outputs
                    np_loss = np.array(np_loss)
                    np_acc = np.array(np_acc)
                    np_num_seqs = np.array(np_num_seqs)
                    total_cost.extend(np_loss * np_num_seqs)
                    total_acc.extend(np_acc * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train loader queue size: %d, " % train_loader.queue.size(
                        )
                        print(verbose)

                    time_end = time.time()
                    used_time = time_end - time_begin
                    print("step: %d, avg loss: %f, "
                          "avg acc: %f, speed: %f steps/s" %
                          (steps, np.sum(total_cost) / np.sum(total_num_seqs),
                           np.sum(total_acc) / np.sum(total_num_seqs),
                           args.skip_steps / used_time))
                    ce_info.append([
                        np.sum(total_cost) / np.sum(total_num_seqs),
                        np.sum(total_acc) / np.sum(total_num_seqs), used_time
                    ])
                    total_cost, total_acc, total_num_seqs = [], [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.save_checkpoint_dir,
                                             "step_" + str(steps))
                    fluid.save(train_program, save_path)

                if steps % args.validation_steps == 0:
                    # evaluate on dev set
                    if args.do_val:
                        evaluate(test_exe, test_prog, test_loader,
                                 [loss.name, accuracy.name, num_seqs.name],
                                 "dev")

            except fluid.core.EOFException:
                print("final step: %d " % steps)
                if args.do_val:
                    evaluate(test_exe, test_prog, test_loader,
                             [loss.name, accuracy.name, num_seqs.name], "dev")

                save_path = os.path.join(args.save_checkpoint_dir,
                                         "step_" + str(steps))
                fluid.save(train_program, save_path)
                train_loader.reset()
                break

    if args.do_train and args.enable_ce:
        card_num = get_cards()
        ce_loss = 0
        ce_acc = 0
        ce_time = 0
        try:
            ce_loss = ce_info[-2][0]
            ce_acc = ce_info[-2][1]
            ce_time = ce_info[-2][2]
        except:
            print("ce info error")
        print("kpis\teach_step_duration_%s_card%s\t%s" %
              (task_name, card_num, ce_time))
        print("kpis\ttrain_loss_%s_card%s\t%f" %
              (task_name, card_num, ce_loss))
        print("kpis\ttrain_acc_%s_card%s\t%f" % (task_name, card_num, ce_acc))

    # evaluate on test set
    if not args.do_train and args.do_val:
        print("Final test result:")
        evaluate(test_exe, test_prog, test_loader,
                 [loss.name, accuracy.name, num_seqs.name], "test")

    # infer
    if args.do_infer:
        print("Final infer result:")
        infer(test_exe, test_prog, infer_loader, [probs.name], "infer")
Example #23
    def train():
        ce_time = []
        ce_ppl = []
        max_epoch = args.max_epoch
        kl_w = args.kl_start
        lr_w = args.learning_rate
        best_valid_nll = 1e100  # +inf
        best_epoch_id = -1
        decay_cnt = 0
        max_decay = args.max_decay
        decay_factor = 0.5
        decay_ts = 2
        steps_not_improved = 0
        for epoch_id in range(max_epoch):
            start_time = time.time()
            if args.enable_ce:
                train_data_iter = reader.get_data_iter(train_data,
                                                       batch_size,
                                                       args.sort_cache,
                                                       args.cache_num,
                                                       enable_ce=True)
            else:
                train_data_iter = reader.get_data_iter(train_data, batch_size,
                                                       args.sort_cache,
                                                       args.cache_num)

            total_loss = 0
            total_rec_loss = 0
            total_kl_loss = 0
            word_count = 0.0
            batch_count = 0.0
            batch_times = []
            for batch_id, batch in enumerate(train_data_iter):
                batch_start_time = time.time()
                kl_w = min(1.0, kl_w + anneal_r)
                kl_weight = kl_w
                input_data_feed, src_word_num, dec_word_sum = prepare_input(
                    batch, kl_weight, lr_w)
                fetch_outs = exe.run(
                    program=train_program,
                    feed=input_data_feed,
                    fetch_list=[loss.name, kl_loss.name, rec_loss.name],
                    use_program_cache=False)

                cost_train = np.array(fetch_outs[0])
                kl_cost_train = np.array(fetch_outs[1])
                rec_cost_train = np.array(fetch_outs[2])

                total_loss += cost_train * batch_size
                total_rec_loss += rec_cost_train * batch_size
                total_kl_loss += kl_cost_train * batch_size
                word_count += dec_word_sum
                batch_count += batch_size
                batch_end_time = time.time()
                batch_time = batch_end_time - batch_start_time
                batch_times.append(batch_time)

                if batch_id > 0 and batch_id % 200 == 0:
                    print("-- Epoch:[%d]; Batch:[%d]; Time: %.4f s; "
                          "kl_weight: %.4f; kl_loss: %.4f; rec_loss: %.4f; "
                          "nll: %.4f; ppl: %.4f" %
                          (epoch_id, batch_id, batch_time, kl_w,
                           total_kl_loss / batch_count, total_rec_loss /
                           batch_count, total_loss / batch_count,
                           np.exp(total_loss / word_count)))
                    ce_ppl.append(np.exp(total_loss / word_count))

            end_time = time.time()
            epoch_time = end_time - start_time
            ce_time.append(epoch_time)
            print(
                "\nTrain epoch:[%d]; Epoch Time: %.4f; avg_time: %.4f s/step\n"
                % (epoch_id, epoch_time, sum(batch_times) / len(batch_times)))

            val_nll, val_ppl = eval(valid_data)
            print("dev ppl", val_ppl)
            test_nll, test_ppl = eval(test_data)
            print("test ppl", test_ppl)

            if val_nll < best_valid_nll:
                best_valid_nll = val_nll
                steps_not_improved = 0
                best_nll = test_nll
                best_ppl = test_ppl
                best_epoch_id = epoch_id
                save_path = os.path.join(args.model_path,
                                         "epoch_" + str(best_epoch_id),
                                         "checkpoint")
                print("save model {}".format(save_path))
                fluid.save(main_program, save_path)
            else:
                steps_not_improved += 1
                if steps_not_improved == decay_ts:
                    old_lr = lr_w
                    lr_w *= decay_factor
                    steps_not_improved = 0
                    new_lr = lr_w

                    print('-----\nchange lr, old lr: %f, new lr: %f\n-----' %
                          (old_lr, new_lr))

                    dir_name = args.model_path + "/epoch_" + str(best_epoch_id)
                    fluid.load(main_program, dir_name, exe)

                    decay_cnt += 1
                    if decay_cnt == max_decay:
                        break

        print('\nbest testing nll: %.4f, best testing ppl %.4f\n' %
              (best_nll, best_ppl))

        if args.enable_ce:
            card_num = get_cards()
            _ppl = 0
            _time = 0
            try:
                _time = ce_time[-1]
                _ppl = ce_ppl[-1]
            except:
                print("ce info error")
            print("kpis\ttrain_duration_card%s\t%s" % (card_num, _time))
            print("kpis\ttrain_ppl_card%s\t%f" % (card_num, _ppl))
Example #24
    def testLoadStaticModel(self):
        # static mode
        a = fluid.data(name="a", shape=[10, 10])
        conv_in = fluid.data(name="conv_in", shape=[None, 10, 10, 10])

        fc_out1 = fluid.layers.fc(a, 10)
        fc_out2 = fluid.layers.fc(a, 20)

        conv_out_1 = fluid.layers.conv2d(conv_in,
                                         num_filters=10,
                                         filter_size=5,
                                         act="relu")
        conv_out_2 = fluid.layers.conv2d(conv_in,
                                         num_filters=10,
                                         filter_size=5,
                                         act="relu")

        conv3d_in = fluid.data(name='conv3d_in',
                               shape=[None, 3, 12, 32, 32],
                               dtype='float32')
        conv3d_out_1 = fluid.layers.conv3d(input=conv3d_in,
                                           num_filters=2,
                                           filter_size=3,
                                           act="relu")
        conv3d_out_2 = fluid.layers.conv3d(input=conv3d_in,
                                           num_filters=2,
                                           filter_size=3,
                                           act="relu")

        batchnorm_in = fluid.data(name="batchnorm_in",
                                  shape=[None, 10],
                                  dtype='float32')
        batchnorm_out_1 = fluid.layers.batch_norm(batchnorm_in)
        batchnorm_out_2 = fluid.layers.batch_norm(batchnorm_in)

        emb_in = fluid.data(name='emb_in', shape=[None, 10], dtype='int64')
        emb_out_1 = fluid.embedding(emb_in, [1000, 100])
        emb_out_2 = fluid.embedding(emb_in, [2000, 200])

        layernorm = fluid.data(name="ln", shape=[None, 10], dtype='float32')
        layernorm_1 = fluid.layers.layer_norm(layernorm)
        layernorm_2 = fluid.layers.layer_norm(layernorm)

        nce_in = fluid.data(name="nce_in", shape=[None, 100], dtype='float32')
        nce_label = fluid.data(name="nce_label",
                               shape=[None, 10],
                               dtype='int64')
        nce_out_1 = fluid.layers.nce(nce_in, nce_label, 10000)
        nce_out_2 = fluid.layers.nce(nce_in, nce_label, 10000)

        prelu_in = fluid.data(name="prelu_in",
                              shape=[None, 5, 10, 10],
                              dtype='float32')
        prelu_out_1 = fluid.layers.prelu(prelu_in, "channel")
        prelu_out_2 = fluid.layers.prelu(prelu_in, "channel")

        bilinear_tensor_pro_x = fluid.data("t1",
                                           shape=[None, 5],
                                           dtype="float32")
        bilinear_tensor_pro_y = fluid.data("t2",
                                           shape=[None, 4],
                                           dtype="float32")

        bilinear_tensor_pro_out_1 = fluid.layers.bilinear_tensor_product(
            x=bilinear_tensor_pro_x, y=bilinear_tensor_pro_y, size=1000)
        bilinear_tensor_pro_out_2 = fluid.layers.bilinear_tensor_product(
            x=bilinear_tensor_pro_x, y=bilinear_tensor_pro_y, size=1000)

        conv2d_trans_in = fluid.data(name="conv2d_trans_in",
                                     shape=[None, 10, 10, 10])

        conv2d_trans_out_1 = fluid.layers.conv2d_transpose(conv2d_trans_in,
                                                           num_filters=10,
                                                           filter_size=5,
                                                           act="relu")
        conv2d_trans_out_2 = fluid.layers.conv2d_transpose(conv2d_trans_in,
                                                           num_filters=10,
                                                           filter_size=5,
                                                           act="relu")

        conv3d_trans_in = fluid.data(name='conv3d_trans_in',
                                     shape=[None, 3, 12, 32, 32],
                                     dtype='float32')
        conv3d_trans_out_1 = fluid.layers.conv3d_transpose(
            input=conv3d_trans_in, num_filters=2, filter_size=3, act="relu")
        conv3d_trans_out_2 = fluid.layers.conv3d_transpose(
            input=conv3d_trans_in, num_filters=2, filter_size=3, act="relu")

        groupnorm_in = fluid.data(name='groupnorm_in',
                                  shape=[None, 8, 32, 32],
                                  dtype='float32')
        groupnorm_out1 = fluid.layers.group_norm(input=groupnorm_in, groups=4)
        groupnorm_out2 = fluid.layers.group_norm(input=groupnorm_in, groups=4)
        '''
        spec_norm = fluid.data(name='spec_norm', shape=[2, 8, 32, 32], dtype='float32')
        spe_norm_out_1 = fluid.layers.spectral_norm(weight=spec_norm, dim=1, power_iters=2)
        spe_norm_out_2 = fluid.layers.spectral_norm(weight=spec_norm, dim=1, power_iters=2)
        '''

        nodes_vector = fluid.data(name='vectors',
                                  shape=[None, 10, 5],
                                  dtype='float32')
        edge_set = fluid.data(name='edge_set',
                              shape=[None, 10, 2],
                              dtype='float32')
        tree_conv_out1 = fluid.contrib.layers.tree_conv(
            nodes_vector, edge_set, 6, 1, 2)
        tree_conv_out2 = fluid.contrib.layers.tree_conv(
            nodes_vector, edge_set, 6, 1, 2)

        para1 = fluid.layers.create_parameter([100, 100],
                                              'float32',
                                              name="weight_test_1")
        para2 = fluid.layers.create_parameter([20, 200],
                                              'float32',
                                              name="weight_test_2")

        para_list = fluid.default_main_program().list_vars()

        exe = fluid.Executor(fluid.CPUPlace() if not fluid.is_compiled_with_cuda()
                             else fluid.CUDAPlace(0))
        out = exe.run(framework.default_startup_program())

        fluid.save(framework.default_main_program(), "./test_1")

        para_dict = fluid.load_program_state("./test_1")

        new_dict = {}
        for k, v in para_dict.items():
            #print( k, v.shape )
            if k.startswith("fc"):
                name = k.replace("fc", "linear", 1)
                new_dict[name] = v
            else:
                new_dict[k] = v

        with fluid.dygraph.guard():

            class MyTest(fluid.dygraph.Layer):
                def __init__(self):
                    super(MyTest, self).__init__()

                    self.linear1 = Linear(10, 10)
                    self.lienar2 = Linear(10, 20)

                    self.conv2d_1 = Conv2D(num_channels=10,
                                           num_filters=10,
                                           filter_size=5,
                                           act="relu")
                    self.conv2d_2 = Conv2D(num_channels=10,
                                           num_filters=10,
                                           filter_size=5,
                                           act="relu")

                    self.conv3d_1 = Conv3D(num_channels=3,
                                           num_filters=2,
                                           filter_size=3,
                                           act="relu")
                    self.conv3d_2 = Conv3D(num_channels=3,
                                           num_filters=2,
                                           filter_size=3,
                                           act="relu")

                    self.batch_norm_1 = BatchNorm(10)
                    self.batch_norm_2 = BatchNorm(10)

                    self.emb1 = Embedding([1000, 100])
                    self.emb2 = Embedding([2000, 200])

                    self.layer_norm_1 = LayerNorm([10])
                    self.layer_norm_2 = LayerNorm(10)

                    self.nce1 = NCE(10000, 100)
                    self.nce2 = NCE(10000, 100)

                    self.prelu1 = PRelu("channel", channel=5)
                    self.prelu2 = PRelu("channel", channel=5)

                    self.group_norm1 = GroupNorm(8, 4)
                    self.gourp_norm2 = GroupNorm(8, 4)

                    self.w_1 = self.create_parameter([100, 100],
                                                     dtype='float32',
                                                     attr="weight_test_1")
                    self.w_2 = self.create_parameter([20, 200],
                                                     dtype='float32',
                                                     attr="weight_test_2")

            my_test = MyTest()
            my_test.set_dict(new_dict, use_structured_name=False)
            for k, v in my_test.state_dict().items():
                self.assertTrue(np.array_equal(v.numpy(), new_dict[v.name]))
Example #25
def train_static(args, batch_generator):
    paddle.manual_seed(SEED)
    paddle.framework.random._manual_program_seed(SEED)
    train_prog = fluid.Program()
    startup_prog = fluid.Program()

    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            # define input and reader
            input_field_names = util.encoder_data_input_fields + \
                                util.decoder_data_input_fields[:-1] + util.label_data_input_fields
            input_descs = util.get_input_descs(args)
            input_slots = [{
                "name": name,
                "shape": input_descs[name][0],
                "dtype": input_descs[name][1]
            } for name in input_field_names]
            input_field = util.InputField(input_slots)
            # Define DataLoader
            data_loader = fluid.io.DataLoader.from_generator(
                input_field.feed_list, capacity=60)
            data_loader.set_batch_generator(batch_generator, places=place)
            # define model
            transformer = Transformer(
                args.src_vocab_size, args.trg_vocab_size, args.max_length + 1,
                args.n_layer, args.n_head, args.d_key, args.d_value,
                args.d_model, args.d_inner_hid, args.prepostprocess_dropout,
                args.attention_dropout, args.relu_dropout, args.preprocess_cmd,
                args.postprocess_cmd, args.weight_sharing, args.bos_idx,
                args.eos_idx)
            logits = transformer(*input_field.feed_list[:7])
            # define loss
            criterion = CrossEntropyCriterion(args.label_smooth_eps)
            lbl_word, lbl_weight = input_field.feed_list[7:]
            sum_cost, avg_cost, token_num = criterion(logits, lbl_word,
                                                      lbl_weight)
            # define optimizer
            learning_rate = fluid.layers.learning_rate_scheduler.noam_decay(
                args.d_model, args.warmup_steps, args.learning_rate)
            optimizer = fluid.optimizer.Adam(
                learning_rate=learning_rate,
                beta1=args.beta1,
                beta2=args.beta2,
                epsilon=float(args.eps))
            optimizer.minimize(avg_cost)
            # the best cross-entropy value with label smoothing
            loss_normalizer = -((1. - args.label_smooth_eps) * np.log(
                (1. - args.label_smooth_eps)) + args.label_smooth_eps * np.log(
                    args.label_smooth_eps / (args.trg_vocab_size - 1) + 1e-20))
    step_idx = 0
    total_batch_num = 0
    avg_loss = []
    exe = fluid.Executor(place)
    exe.run(startup_prog)
    for pass_id in range(args.epoch):
        batch_id = 0
        for feed_dict in data_loader:
            outs = exe.run(program=train_prog,
                           feed=feed_dict,
                           fetch_list=[sum_cost.name, token_num.name])
            if step_idx % args.print_step == 0:
                sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
                total_sum_cost = sum_cost_val.sum()
                total_token_num = token_num_val.sum()
                total_avg_cost = total_sum_cost / total_token_num
                avg_loss.append(total_avg_cost)
                if step_idx == 0:
                    logging.info(
                        "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
                        "normalized loss: %f, ppl: %f" %
                        (step_idx, pass_id, batch_id, total_avg_cost,
                         total_avg_cost - loss_normalizer,
                         np.exp([min(total_avg_cost, 100)])))
                    avg_batch_time = time.time()
                else:
                    logging.info(
                        "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, "
                        "normalized loss: %f, ppl: %f, speed: %.2f steps/s" %
                        (step_idx, pass_id, batch_id, total_avg_cost,
                         total_avg_cost - loss_normalizer,
                         np.exp([min(total_avg_cost, 100)]),
                         args.print_step / (time.time() - avg_batch_time)))
                    avg_batch_time = time.time()
            batch_id += 1
            step_idx += 1
            total_batch_num = total_batch_num + 1
            if step_idx == STEP_NUM:
                if args.save_dygraph_model_path:
                    model_path = os.path.join(args.save_static_model_path,
                                              "transformer")
                    fluid.save(train_prog, model_path)
                break
    return np.array(avg_loss)
Example #26
end_level = 3
for i in range(start_level, end_level + 1):
    if i == 0:
        w = mask_feat_head_dic['mask_feat_head.convs_all_levels.%d.conv0.conv.weight' % (i,)]
        scale = mask_feat_head_dic['mask_feat_head.convs_all_levels.%d.conv0.gn.weight' % (i,)]
        offset = mask_feat_head_dic['mask_feat_head.convs_all_levels.%d.conv0.gn.bias' % (i,)]
        copy_conv_gn('mask_feat_head.convs_all_levels.%d.conv0' % (i,), w, None, scale, offset)
        continue

    for j in range(i):
        w = mask_feat_head_dic['mask_feat_head.convs_all_levels.%d.conv%d.conv.weight' % (i, j)]
        scale = mask_feat_head_dic['mask_feat_head.convs_all_levels.%d.conv%d.gn.weight' % (i, j)]
        offset = mask_feat_head_dic['mask_feat_head.convs_all_levels.%d.conv%d.gn.bias' % (i, j)]
        copy_conv_gn('mask_feat_head.convs_all_levels.%d.conv%d' % (i, j), w, None, scale, offset)


w = mask_feat_head_dic['mask_feat_head.conv_pred.0.conv.weight']
scale = mask_feat_head_dic['mask_feat_head.conv_pred.0.gn.weight']
offset = mask_feat_head_dic['mask_feat_head.conv_pred.0.gn.bias']
copy_conv_gn('mask_feat_head.conv_pred.0', w, None, scale, offset)



import os
if not os.path.exists('output/'): os.mkdir('output/')
if not os.path.exists('output/solov2_light_448_r50_fpn_8gpu_3x/'): os.mkdir('output/solov2_light_448_r50_fpn_8gpu_3x/')
fluid.save(fluid.default_startup_program(), 'output/solov2_light_448_r50_fpn_8gpu_3x/model_final')
print('\nDone.')


Example #27
def save_model():
    if not os.path.exists(cfg.model_path):
        os.makedirs(cfg.model_path)
    fluid.save(program=fluid.default_main_program(),
               model_path=os.path.join(cfg.model_path, "model"))
    print("Saved model to: %s" % cfg.model_path)
Example #28
def train(args):
    if args.enable_ce:
        SEED = 102
        fluid.default_startup_program().random_seed = SEED
        fluid.default_main_program().random_seed = SEED
    use_cuda = bool(args.use_cuda)
    parallel = bool(args.parallel)
    print("use_cuda:", use_cuda, "parallel:", parallel)
    train_reader, vocab_size = utils.construct_train_data(
        args.train_dir, args.vocab_path, args.batch_size * get_cards(args))
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    ssr = SequenceSemanticRetrieval(vocab_size, args.embedding_dim,
                                    args.hidden_size)
    # Train program
    train_input_data, cos_pos, avg_cost, acc = ssr.train()

    # Optimization to minimize lost
    optimizer = fluid.optimizer.Adagrad(learning_rate=args.base_lr)
    optimizer.minimize(avg_cost)

    data_list = [var.name for var in train_input_data]
    print(data_list)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    loader = fluid.io.DataLoader.from_generator(feed_list=train_input_data,
                                                capacity=10000,
                                                iterable=True)
    loader.set_sample_list_generator(train_reader, places=place)
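    # The iterable DataLoader yields ready-to-feed mini-batches bound to `place`.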
    if parallel:
        train_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
                                           loss_name=avg_cost.name)
    else:
        train_exe = exe

    total_time = 0.0
    ce_info = []
    for pass_id in range(args.epochs):
        epoch_idx = pass_id + 1
        print("epoch_%d start" % epoch_idx)
        t0 = time.time()
        i = 0
        for batch_id, data in enumerate(loader()):
            i += 1
            loss_val, correct_val = train_exe.run(
                feed=data, fetch_list=[avg_cost.name, acc.name])
            ce_info.append(float(np.mean(correct_val)) / args.batch_size)
            if i % args.print_batch == 0:
                logger.info(
                    "Train --> pass: {} batch_id: {} avg_cost: {}, acc: {}".
                    format(pass_id, batch_id, np.mean(loss_val),
                           float(np.mean(correct_val)) / args.batch_size))
            if args.enable_ce and i > args.step_num:
                break
        t1 = time.time()
        total_time += t1 - t0
        print("epoch:%d num_steps:%d time_cost(s):%f" %
              (epoch_idx, i, total_time / epoch_idx))
        save_dir = "%s/epoch_%d" % (args.model_dir, epoch_idx)
        fluid.save(fluid.default_main_program(), model_path=save_dir)
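        # fluid.save stores the main program's persistable variables (parameters plus the
        # Adagrad accumulators) as "<save_dir>.pdparams" and "<save_dir>.pdopt".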
        print("model saved in %s" % save_dir)

    # only for ce
    if args.enable_ce:
        ce_acc = 0
        try:
            ce_acc = ce_info[-2]
        except IndexError:
            print("ce info error")
        epoch_idx = args.epochs
        device = get_device(args)
        if args.use_cuda:
            gpu_num = device[1]
            print("kpis\teach_pass_duration_gpu%s\t%s" %
                  (gpu_num, total_time / epoch_idx))
            print("kpis\ttrain_acc_gpu%s\t%s" % (gpu_num, ce_acc))
        else:
            cpu_num = device[1]
            threads_num = device[2]
            print("kpis\teach_pass_duration_cpu%s_thread%s\t%s" %
                  (cpu_num, threads_num, total_time / epoch_idx))
            print("kpis\ttrain_acc_cpu%s_thread%s\t%s" %
                  (cpu_num, threads_num, ce_acc))
Beispiel #29
0
def main(args):
    """
    Main Function
    """
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = task_reader.ClassifyReader(vocab_path=args.vocab_path,
                                        label_map_config=args.label_map_config,
                                        max_seq_len=args.max_seq_len,
                                        do_lower_case=args.do_lower_case,
                                        random_seed=args.random_seed)

    if not (args.do_train or args.do_val or args.do_infer):
        raise ValueError("For args `do_train`, `do_val` and `do_infer`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count
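        # Each device consumes one batch per step, so the step budget is
        # epochs * num_examples / (batch_size * dev_count).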

        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                # create ernie_pyreader
                train_pyreader, ernie_inputs, labels = ernie_pyreader(
                    args, pyreader_name='train_pyreader')

                # get ernie_embeddings
                if args.use_paddle_hub:
                    embeddings = ernie_encoder_with_paddle_hub(
                        ernie_inputs, args.max_seq_len)
                else:
                    embeddings = ernie_encoder(ernie_inputs,
                                               ernie_config=ernie_config)

                # user defined model based on ernie embeddings
                loss, accuracy, num_seqs = create_model(args,
                                                        embeddings,
                                                        labels=labels,
                                                        is_prediction=False)

                optimizer = fluid.optimizer.Adam(learning_rate=args.lr)
                optimizer.minimize(loss)

        if args.verbose:
            lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val:
        test_data_generator = reader.data_generator(input_file=args.dev_set,
                                                    batch_size=args.batch_size,
                                                    phase='dev',
                                                    epoch=1,
                                                    shuffle=False)
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                # create ernie_pyreader
                test_pyreader, ernie_inputs, labels = ernie_pyreader(
                    args, pyreader_name='eval_reader')

                # get ernie_embeddings
                if args.use_paddle_hub:
                    embeddings = ernie_encoder_with_paddle_hub(
                        ernie_inputs, args.max_seq_len)
                else:
                    embeddings = ernie_encoder(ernie_inputs,
                                               ernie_config=ernie_config)

                # user defined model based on ernie embeddings
                loss, accuracy, num_seqs = create_model(args,
                                                        embeddings,
                                                        labels=labels,
                                                        is_prediction=False)

        test_prog = test_prog.clone(for_test=True)
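        # clone(for_test=True) prunes backward/optimizer ops so the program only
        # runs the forward pass during evaluation.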

    if args.do_infer:
        infer_data_generator = reader.data_generator(
            input_file=args.test_set,
            batch_size=args.batch_size,
            phase='infer',
            epoch=1,
            shuffle=False)
        infer_prog = fluid.Program()
        with fluid.program_guard(infer_prog, startup_prog):
            with fluid.unique_name.guard():
                infer_pyreader, ernie_inputs, labels = ernie_pyreader(
                    args, pyreader_name="infer_pyreader")

                # get ernie_embeddings
                if args.use_paddle_hub:
                    embeddings = ernie_encoder_with_paddle_hub(
                        ernie_inputs, args.max_seq_len)
                else:
                    embeddings = ernie_encoder(ernie_inputs,
                                               ernie_config=ernie_config)

                probs = create_model(args,
                                     embeddings,
                                     labels=labels,
                                     is_prediction=True)

        infer_prog = infer_prog.clone(for_test=True)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=train_program)
    elif args.do_val:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe, args.init_checkpoint, main_program=test_prog)
    elif args.do_infer:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe, args.init_checkpoint, main_program=infer_prog)

    if args.do_train:
        train_exe = exe
        train_pyreader.set_batch_generator(train_data_generator)
    else:
        train_exe = None
    if args.do_val:
        test_exe = exe
        test_pyreader.set_batch_generator(test_data_generator)
    if args.do_infer:
        test_exe = exe
        infer_pyreader.set_batch_generator(infer_data_generator)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        total_cost, total_acc, total_num_seqs = [], [], []
        time_begin = time.time()
        while True:
            try:
                steps += 1
                if steps % args.skip_steps == 0:
                    fetch_list = [loss.name, accuracy.name, num_seqs.name]
                else:
                    fetch_list = []
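                    # An empty fetch list skips fetching results on non-logging steps.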

                outputs = train_exe.run(program=train_program,
                                        fetch_list=fetch_list,
                                        return_numpy=False)
                if steps % args.skip_steps == 0:
                    np_loss, np_acc, np_num_seqs = outputs
                    np_loss = np.array(np_loss)
                    np_acc = np.array(np_acc)
                    np_num_seqs = np.array(np_num_seqs)
                    total_cost.extend(np_loss * np_num_seqs)
                    total_acc.extend(np_acc * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        print(verbose)

                    time_end = time.time()
                    used_time = time_end - time_begin
                    print("step: %d, ave loss: %f, "
                          "ave acc: %f, speed: %f steps/s" %
                          (steps, np.sum(total_cost) / np.sum(total_num_seqs),
                           np.sum(total_acc) / np.sum(total_num_seqs),
                           args.skip_steps / used_time))
                    total_cost, total_acc, total_num_seqs = [], [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps),
                                             "checkpoint")
                    fluid.save(train_program, save_path)
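                    # Periodic checkpoint: parameters and optimizer state are written to
                    # "<checkpoints>/step_<N>/checkpoint.pdparams" and ".pdopt".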

                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        evaluate(exe, test_prog, test_pyreader,
                                 [loss.name, accuracy.name, num_seqs.name],
                                 "dev")

            except fluid.core.EOFException:
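                # The reader is exhausted: write a final checkpoint and reset the
                # pyreader before leaving the training loop.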
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps), "checkpoint")
                fluid.save(train_program, save_path)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        print("Final validation result:")
        evaluate(exe, test_prog, test_pyreader,
                 [loss.name, accuracy.name, num_seqs.name], "dev")

    # final eval on test set
    if args.do_infer:
        print("Final test result:")
        infer(exe, infer_prog, infer_pyreader, [probs.name], "infer")
Beispiel #30
0
def save_controller(self, program, output_dir):
    fluid.save(program, output_dir)
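
A matching restore method could be sketched like this (hypothetical, assuming the controller program was previously written to the same path with fluid.save):

def load_controller(self, program, output_dir):
    # fluid.load restores the variables saved above from "<output_dir>.pdparams" / ".pdopt".
    fluid.load(program, output_dir)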