예제 #1
0
def SaveModel(train_model, s_iter):
    """Export ``train_model`` as a Caffe2 predictor and save it to a minidb file.

    Args:
        train_model: model helper whose predict net and checkpoint parameters
            are exported.
        s_iter: iteration number embedded in the saved model's file name.
    """
    # prefix = "gpu_{}".format(train_model._devices[0])
    # Compute the net proto and checkpoint params once; the original
    # re-evaluated both for the debug prints and again for the export meta.
    predict_net = train_model.net.Proto()
    checkpoint_params = data_parallel_model.GetCheckpointParams(train_model)
    print(predict_net)
    print("==============")
    print(checkpoint_params)
    predictor_export_meta = pred_exp.PredictorExportMeta(
        predict_net=predict_net,
        parameters=checkpoint_params,
        inputs=["data"],
        outputs=["softmax"],
        shapes={
            # assumes a single-image (no batch dim) data blob — TODO confirm
            "softmax": (1, num_labels),
            "data": (3, img_size, img_size)
        })

    # save the train_model for the current epoch
    model_path = "%s/%s_%d.mdl" % (
        file_store_path,
        save_model_name,
        s_iter,
    )
    print(model_path)

    # set db_type to be "minidb" instead of "log_file_db", which breaks
    # the serialization in save_to_db. Need to switch back to log_file_db
    # after migration
    pred_exp.save_to_db(
        db_type="minidb",
        db_destination=model_path,
        predictor_export_meta=predictor_export_meta,
    )
예제 #2
0
def SaveModel(args, train_model, epoch):
    """Export ``train_model`` as a Caffe2 predictor and save it to a minidb file.

    Args:
        args: namespace providing num_labels, num_channels, image_size,
            file_store_path, and save_model_name.
        train_model: parallelized model helper; its first device's blobs
            (e.g. "gpu_0/data") are exported.
        epoch: epoch number embedded in the saved model's file name.
    """
    # BUG FIX: the format string was "[]_{}", which consumed only one of the
    # two arguments and produced a literal "[]_<prefix>" blob prefix instead
    # of e.g. "gpu_0". Use "{}_{}" so device prefix and device id both apply.
    prefix = "{}_{}".format(train_model._device_prefix, train_model._devices[0])
    predictor_export_meta = pred_exp.PredictorExportMeta(
        predict_net=train_model.net.Proto(),
        parameters=data_parallel_model.GetCheckpointParams(train_model),
        inputs=[prefix + "/data"],
        outputs=[prefix + "/softmax"],
        shapes={
            prefix + "/softmax": (1, args.num_labels),
            prefix + "/data": (args.num_channels, args.image_size, args.image_size)
        }
    )

    # save the train_model for the current epoch
    model_path = "%s/%s_%d.mdl" % (
        args.file_store_path,
        args.save_model_name,
        epoch,
    )

    # set db_type to be "minidb" instead of "log_file_db", which breaks
    # the serialization in save_to_db. Need to switch back to log_file_db
    # after migration
    pred_exp.save_to_db(
        db_type="minidb",
        db_destination=model_path,
        predictor_export_meta=predictor_export_meta,
    )
예제 #3
0
    def test_checkpoint_params(self):
        """Verify GetCheckpointParams returns only the first device's blobs.

        Builds a small conv net parallelized over CPU devices [1, 2, 3] and
        checks that the checkpoint set contains cpu_1's params, their SGD
        momentum blobs, and cpu_1's computed params; excludes cpu_2's blobs
        and the input data blob; and includes the shared optimizer iteration
        counter.
        """
        def add_input_ops(model):
            # No input ops needed: the test never feeds data through the net.
            pass

        def add_model_ops(model, loss_scale):
            # Small conv -> BN -> pool -> FC -> softmax graph; its only role
            # is to register params/computed-params for checkpoint selection.
            model.NHWC2NCHW("data", "data_nchw")
            model.Conv("data_nchw",
                       'conv1',
                       3,
                       64,
                       weight_init=("MSRAFill", {}),
                       kernel=7,
                       stride=2,
                       pad=3,
                       no_bias=0)
            model.SpatialBN('conv1', 'conv1_spatbn_relu', 64, epsilon=1e-3)
            model.Relu('conv1_spatbn_relu', 'conv1_spatbn_relu')
            model.MaxPool('conv1_spatbn_relu', 'pool1', kernel=3, stride=2)
            model.FC('pool1', 'fc', dim_in=(64 * 56 * 56), dim_out=100)
            model.Sigmoid('fc', 'fc_sigm')
            model.Softmax('fc_sigm', 'softmax')
            model.LabelCrossEntropy(['softmax', 'label'], 'xent')
            loss = model.AveragedLoss('xent', 'loss')

            # Add a duplicate param init to ensure it does not cause issues
            model.param_init_net.ConstantFill([], ["fc_w"],
                                              shape=((64 * 56 * 56), 1000))
            return [loss]

        def add_optimizer(model):
            # Momentum SGD so that each param also gets a "_momentum" blob,
            # which must appear in the checkpoint alongside the param itself.
            optimizer.build_sgd(model, 0.1, policy="fixed", momentum=0.9)

        model = cnn.CNNModelHelper(
            order="NHWC",
            name="test",
        )
        data_parallel_model.Parallelize_CPU(
            model,
            input_builder_fun=add_input_ops,
            forward_pass_builder_fun=add_model_ops,
            optimizer_builder_fun=add_optimizer,
            devices=[1, 2, 3],
        )

        # Only cpu_1 params should be returned (cpu_1 is the first device)
        checkpoint_params = data_parallel_model.GetCheckpointParams(model)
        for p in model.GetParams("cpu_1/"):
            self.assertTrue(p in checkpoint_params)
            self.assertTrue(p + "_momentum" in checkpoint_params)
        for p in model.GetParams("cpu_2/"):
            self.assertFalse(p in checkpoint_params)
        self.assertTrue(
            core.BlobReference("cpu_1/fc_w_momentum") in checkpoint_params)
        for c in model.GetComputedParams("cpu_1/"):
            self.assertTrue(c in checkpoint_params)
        for c in model.GetComputedParams("cpu_2/"):
            self.assertFalse(c in checkpoint_params)
        # Inputs are not params and must not be checkpointed.
        self.assertFalse(core.BlobReference("cpu_1/data") in checkpoint_params)
        # The optimizer's shared iteration counter must be checkpointed.
        self.assertTrue(
            core.BlobReference("optimizer_iteration") in checkpoint_params)
예제 #4
0
        # NOTE(review): everything from the `model = cnn.CNNModelHelper(...)`
        # line downward appears to be the enclosing test method's body
        # mis-indented INSIDE this helper — as written, building the model and
        # calling Parallelize_GPU (which invokes this very function) happens
        # on every invocation of the helper. The enclosing method is not
        # visible here, so the indentation is left as-is; confirm against the
        # original test and dedent the tail by one level if so.
        def add_parameter_update_ops(model):
            # Iteration counter blob, checkpointed per device as "<dev>/ITER".
            model.Iter("ITER")
            # Constant learning rate blob fed to MomentumSGDUpdate.
            LR = model.param_init_net.ConstantFill([],
                                                   'LR',
                                                   shape=[1],
                                                   value=0.1)
            for param in model.GetParams():
                param_grad = model.param_to_grad[param]
                # Per-param momentum blob, zero-initialized with param's shape.
                param_momentum = model.param_init_net.ConstantFill([param],
                                                                   param +
                                                                   '_momentum',
                                                                   value=0.0)
                model.net.MomentumSGDUpdate(
                    [param_grad, param_momentum, LR, param],
                    [param_grad, param_momentum, param],
                )
            model = cnn.CNNModelHelper(
                order="NHWC",
                name="test",
            )
            data_parallel_model.Parallelize_GPU(
                model,
                input_builder_fun=add_input_ops,
                forward_pass_builder_fun=add_model_ops,
                param_update_builder_fun=add_parameter_update_ops,
                devices=[1, 2, 3],
            )

            # Only gpu_1 params should be returned (gpu_1 is the first gpu)
            checkpoint_params = data_parallel_model.GetCheckpointParams(model)
            for p in model.GetParams("gpu_1/"):
                self.assertTrue(p in checkpoint_params)
                self.assertTrue(p + "_momentum" in checkpoint_params)
            for p in model.GetParams("gpu_2/"):
                self.assertTrue(p in checkpoint_params)
            for c in model.GetComputedParams("gpu_1/"):
                self.assertFalse(c in checkpoint_params)
            for c in model.GetComputedParams("gpu_2/"):
                self.assertFalse(c in checkpoint_params)
            self.assertFalse(
                core.BlobReference("gpu_1/data") in checkpoint_params)
            self.assertTrue(
                core.BlobReference("gpu_1/ITER") in checkpoint_params)
예제 #5
0
            data_parallel_model.GetLearningRateBlobNames(train_model)[0])

        values = [
            e + 1,
            lr,
            loss_sum / batch_num,
            correct / batch_num,
            test_res['loss'],
            test_res['accuracy'],
            time_ep,
        ]
        table = tabulate.tabulate([values],
                                  columns,
                                  tablefmt='simple',
                                  floatfmt='8.4f')
        if e % 25 == 0:
            table = table.split('\n')
            table = '\n'.join([table[1]] + table)
        else:
            table = table.split('\n')[2]
        print(table)

    checkpoint_params = data_parallel_model.GetCheckpointParams(train_model)

    init_net, _ = mobile_exporter.Export(workspace, deploy_model.net,
                                         checkpoint_params)
    with open("predict_net.pb", 'wb') as f:
        f.write(deploy_model.net._net.SerializeToString())
    with open("init_net.pb", 'wb') as f:
        f.write(init_net.SerializeToString())