def SaveModel(train_model, s_iter):
    """Serialize ``train_model`` as a predictor snapshot for iteration ``s_iter``.

    Builds a PredictorExportMeta from the training net and its checkpoint
    parameters, then writes it to
    ``<file_store_path>/<save_model_name>_<s_iter>.mdl``.

    Relies on module-level globals: ``num_labels``, ``img_size``,
    ``file_store_path``, ``save_model_name`` — TODO confirm these are
    defined at module scope; they are not visible in this chunk.

    FIX: removed leftover debug residue — three ``print`` calls dumping the
    full net proto and checkpoint-parameter list to stdout on every save,
    a ``print(model_path)``, and a dead commented-out ``prefix = ...`` line.
    """
    predictor_export_meta = pred_exp.PredictorExportMeta(
        predict_net=train_model.net.Proto(),
        parameters=data_parallel_model.GetCheckpointParams(train_model),
        inputs=["data"],
        outputs=["softmax"],
        shapes={
            "softmax": (1, num_labels),
            "data": (3, img_size, img_size),
        },
    )

    # save the train_model for the current iteration
    model_path = "%s/%s_%d.mdl" % (
        file_store_path,
        save_model_name,
        s_iter,
    )

    # set db_type to be "minidb" instead of "log_file_db", which breaks
    # the serialization in save_to_db. Need to switch back to log_file_db
    # after migration
    pred_exp.save_to_db(
        db_type="minidb",
        db_destination=model_path,
        predictor_export_meta=predictor_export_meta,
    )
def SaveModel(args, train_model, epoch):
    """Serialize ``train_model`` as a predictor snapshot for ``epoch``.

    Exports the training net together with its checkpoint parameters and
    writes them to
    ``<args.file_store_path>/<args.save_model_name>_<epoch>.mdl``.

    Args:
        args: parsed CLI options; must provide num_labels, num_channels,
            image_size, file_store_path and save_model_name.
        train_model: a data_parallel_model-built model helper (provides
            ``_device_prefix`` and ``_devices``).
        epoch: epoch number, embedded in the output filename.
    """
    # Blob names are namespaced by device, e.g. "gpu_0/data".
    # FIX: the format string was "[]_{}", which emitted a literal "[]"
    # instead of the device prefix and silently dropped one argument.
    prefix = "{}_{}".format(train_model._device_prefix, train_model._devices[0])
    predictor_export_meta = pred_exp.PredictorExportMeta(
        predict_net=train_model.net.Proto(),
        parameters=data_parallel_model.GetCheckpointParams(train_model),
        inputs=[prefix + "/data"],
        outputs=[prefix + "/softmax"],
        shapes={
            prefix + "/softmax": (1, args.num_labels),
            prefix + "/data": (args.num_channels,
                               args.image_size,
                               args.image_size),
        },
    )

    # save the train_model for the current epoch
    model_path = "%s/%s_%d.mdl" % (
        args.file_store_path,
        args.save_model_name,
        epoch,
    )

    # set db_type to be "minidb" instead of "log_file_db", which breaks
    # the serialization in save_to_db. Need to switch back to log_file_db
    # after migration
    pred_exp.save_to_db(
        db_type="minidb",
        db_destination=model_path,
        predictor_export_meta=predictor_export_meta,
    )
def test_checkpoint_params(self):
    """GetCheckpointParams must return only the first device's params,
    their momentum blobs, and global optimizer state — nothing else."""

    def build_inputs(model):
        # No real input pipeline is needed for this test.
        pass

    def build_forward(model, loss_scale):
        # Minimal conv -> fc -> softmax net with a cross-entropy loss.
        model.NHWC2NCHW("data", "data_nchw")
        model.Conv("data_nchw", 'conv1', 3, 64,
                   weight_init=("MSRAFill", {}),
                   kernel=7, stride=2, pad=3, no_bias=0)
        model.SpatialBN('conv1', 'conv1_spatbn_relu', 64, epsilon=1e-3)
        model.Relu('conv1_spatbn_relu', 'conv1_spatbn_relu')
        model.MaxPool('conv1_spatbn_relu', 'pool1', kernel=3, stride=2)
        model.FC('pool1', 'fc', dim_in=(64 * 56 * 56), dim_out=100)
        model.Sigmoid('fc', 'fc_sigm')
        model.Softmax('fc_sigm', 'softmax')
        model.LabelCrossEntropy(['softmax', 'label'], 'xent')
        loss = model.AveragedLoss('xent', 'loss')
        # Deliberate duplicate param initializer: must not break
        # checkpoint-parameter collection.
        model.param_init_net.ConstantFill(
            [], ["fc_w"], shape=((64 * 56 * 56), 1000))
        return [loss]

    def build_optimizer(model):
        optimizer.build_sgd(model, 0.1, policy="fixed", momentum=0.9)

    model = cnn.CNNModelHelper(
        order="NHWC",
        name="test",
    )
    data_parallel_model.Parallelize_CPU(
        model,
        input_builder_fun=build_inputs,
        forward_pass_builder_fun=build_forward,
        optimizer_builder_fun=build_optimizer,
        devices=[1, 2, 3],
    )

    checkpoint_params = data_parallel_model.GetCheckpointParams(model)

    # Params of the first device (cpu_1) plus their momentum blobs belong
    # in the checkpoint; replicas on the other devices do not.
    for param in model.GetParams("cpu_1/"):
        self.assertIn(param, checkpoint_params)
        self.assertIn(param + "_momentum", checkpoint_params)
    for param in model.GetParams("cpu_2/"):
        self.assertNotIn(param, checkpoint_params)
    self.assertIn(
        core.BlobReference("cpu_1/fc_w_momentum"), checkpoint_params)

    # Computed (non-trained) params follow the same first-device rule.
    for computed in model.GetComputedParams("cpu_1/"):
        self.assertIn(computed, checkpoint_params)
    for computed in model.GetComputedParams("cpu_2/"):
        self.assertNotIn(computed, checkpoint_params)

    # Activations are excluded; global optimizer state is included.
    self.assertNotIn(core.BlobReference("cpu_1/data"), checkpoint_params)
    self.assertIn(
        core.BlobReference("optimizer_iteration"), checkpoint_params)
def add_parameter_update_ops(model):
    # Hand-built momentum-SGD update with a fixed LR of 0.1 per parameter.
    model.Iter("ITER")
    LR = model.param_init_net.ConstantFill(
        [], 'LR', shape=[1], value=0.1)
    for param in model.GetParams():
        param_grad = model.param_to_grad[param]
        # Momentum blob is zero-filled, shaped from the param blob itself.
        param_momentum = model.param_init_net.ConstantFill(
            [param], param + '_momentum', value=0.0)
        model.net.MomentumSGDUpdate(
            [param_grad, param_momentum, LR, param],
            [param_grad, param_momentum, param],
        )

# NOTE(review): the statements below reference `self`, `add_input_ops` and
# `add_model_ops`, so this chunk is the interior of an enclosing test method
# whose header lies outside this view.
model = cnn.CNNModelHelper(
    order="NHWC",
    name="test",
)
data_parallel_model.Parallelize_GPU(
    model,
    input_builder_fun=add_input_ops,
    forward_pass_builder_fun=add_model_ops,
    param_update_builder_fun=add_parameter_update_ops,
    devices=[1, 2, 3],
)
# gpu_1 is the first device, so its params and momentum blobs must appear
# in the checkpoint.
checkpoint_params = data_parallel_model.GetCheckpointParams(model)
for p in model.GetParams("gpu_1/"):
    self.assertTrue(p in checkpoint_params)
    self.assertTrue(p + "_momentum" in checkpoint_params)
# NOTE(review): gpu_2 params are asserted PRESENT here, unlike the CPU
# variant elsewhere in this file which asserts the second device's params
# absent — confirm whether this inclusion is intended.
for p in model.GetParams("gpu_2/"):
    self.assertTrue(p in checkpoint_params)
# Computed params are asserted absent from the checkpoint in this variant.
for c in model.GetComputedParams("gpu_1/"):
    self.assertFalse(c in checkpoint_params)
for c in model.GetComputedParams("gpu_2/"):
    self.assertFalse(c in checkpoint_params)
# Activations are never checkpointed; the iteration counter blob is.
self.assertFalse(
    core.BlobReference("gpu_1/data") in checkpoint_params)
self.assertTrue(
    core.BlobReference("gpu_1/ITER") in checkpoint_params)
# NOTE(review): this chunk begins mid-expression — the call closed by the
# dangling ")" below (presumably a workspace fetch of the learning-rate
# blob for train_model) is opened outside this view; confirm against the
# full file.
data_parallel_model.GetLearningRateBlobNames(train_model)[0])

# Assemble one row of per-epoch statistics for tabulated logging.
values = [
    e + 1,
    lr,
    loss_sum / batch_num,
    correct / batch_num,
    test_res['loss'],
    test_res['accuracy'],
    time_ep,
]
table = tabulate.tabulate([values], columns, tablefmt='simple',
                          floatfmt='8.4f')
# Re-emit the header row every 25 epochs; otherwise print only the data row.
if e % 25 == 0:
    table = table.split('\n')
    table = '\n'.join([table[1]] + table)
else:
    table = table.split('\n')[2]
print(table)

# Export the deploy net plus its checkpoint params as mobile protobuf files
# in the current working directory.
checkpoint_params = data_parallel_model.GetCheckpointParams(train_model)
init_net, _ = mobile_exporter.Export(
    workspace, deploy_model.net, checkpoint_params)
with open("predict_net.pb", 'wb') as f:
    f.write(deploy_model.net._net.SerializeToString())
with open("init_net.pb", 'wb') as f:
    f.write(init_net.SerializeToString())