def bmuf_process(filestore_dir, process_id, shared_results,
                 cpu_device=False, nesterov=False):
    """Run BMUF data-parallel training inside one shard process.

    Builds a tiny FC model, parallelizes it with BMUF over two local
    devices and a 2-shard GLOO file-store rendezvous, runs a single
    iteration plus the global model update, and records the fetched
    momentum/parameter blobs in ``shared_results[process_id]`` so the
    parent process can check them.
    """
    # caffe2 is imported inside the process so each process initializes
    # CUDA independently.
    from caffe2.python import core, cnn, data_parallel_model, dyndep, workspace
    from caffe2.proto import caffe2_pb2
    dyndep.InitOpsLibrary("@/caffe2/caffe2/distributed:file_store_handler_ops")

    if not cpu_device:
        if not workspace.has_gpu_support and not workspace.has_hip_support:
            log.info('No GPU support test is Ignored.')
            return
        if workspace.NumGpuDevices() < 4:
            log.info('Not enough GPU support, test IGNORED')
            return

    model = cnn.CNNModelHelper(order="NHWC", name="test")
    if cpu_device:
        device_type = caffe2_pb2.CPU
        device_prefix = "cpu"
    else:
        device_type = workspace.GpuDeviceType
        device_prefix = "gpu"

    # Shard 0 owns local devices 0/1, shard 1 owns devices 2/3.
    devices = [0, 1] if process_id == 0 else [2, 3]

    def _model_build_fun(model, loss_scale):
        # data -> FC(16 -> 1) -> sigmoid -> squared L2 vs. label -> loss.
        fc = model.FC("data", "fc", 16, 1,
                      ("ConstantFill", {}), ("ConstantFill", {}))
        fc_fl = model.FlattenToVec(fc, "fc_fl")
        sigm = model.Sigmoid(fc_fl, "sigm")
        sq = model.SquaredL2Distance([sigm, "label"], "sq")
        loss = model.AveragedLoss(sq, "loss")
        loss = model.Scale(loss, scale=loss_scale)

        # Extra blob used to exercise explicit blob syncing.
        model.param_init_net.UniformFill([], ["sync_num"], shape=[1])
        return [loss]

    def _input_builder_fun(model):
        return None

    def _param_update_fun(model):
        # Plain SGD via WeightedSum: param += base_lr * grad (base_lr < 0).
        ITER = model.Iter("ITER")
        LR = model.net.LearningRate(
            [ITER], "LR", base_lr=(-0.1), policy="fixed")
        ONE = model.param_init_net.ConstantFill(
            [], "ONE", shape=[1], value=1.0)
        for param in model.GetParams():
            grad = model.param_to_grad[param]
            model.WeightedSum([param, ONE, grad, LR], param)

    def _generate_data(devices, process_id, device_type, device_prefix):
        # Per-shard seed: each run sees the same input regardless of the
        # number of devices it is split over.
        np.random.seed(26 + process_id * 10)
        batch_size = 64
        batch_per_device = batch_size // len(devices)
        for _ in range(10):
            full_data = np.random.rand(batch_size, 16)
            full_labels = np.round(full_data[:, 0])
            for j, g in enumerate(devices):
                st = j * batch_per_device
                en = st + batch_per_device
                data = full_data[st:en, :].astype(np.float32)
                labels = full_labels[st:en].astype(np.float32)
                with core.DeviceScope(core.DeviceOption(device_type, g)):
                    workspace.FeedBlob(
                        "{}_{}/data".format(device_prefix, g), data)
                    workspace.FeedBlob(
                        "{}_{}/label".format(device_prefix, g), labels)

    _generate_data(devices, process_id, device_type, device_prefix)

    # File-store rendezvous shared by both shard processes.
    workspace.RunOperatorOnce(
        core.CreateOperator(
            "FileStoreHandlerCreate", [], ["store_handler"],
            path=filestore_dir))
    rendezvous = dict(
        kv_handler="store_handler",
        shard_id=process_id,
        num_shards=2,
        engine="GLOO",
        exit_nets=None)

    data_parallel_model.Parallelize_BMUF(
        model,
        _input_builder_fun,
        _model_build_fun,
        _param_update_fun,
        devices=devices,
        rendezvous=rendezvous,
        nesterov=nesterov,
        add_blobs_to_sync=["sync_num"],
        cpu_device=cpu_device)

    data_parallel_model.RunInitNet(model)

    def _device_pid(device, pid):
        # Shard 1's local device k is global device k + 2.
        if pid == 1:
            return device + 2
        return device

    # Momentum starts at zero before any iteration has run.
    np.testing.assert_equal(
        workspace.FetchBlob("{}_{}/fc_w_v".format(
            device_prefix, _device_pid(0, process_id))),
        np.zeros(16).astype(np.float32).reshape(1, 16))

    # Run the algorithm for one iteration to have non-zero params.
    data_parallel_model.RunNet(model, 1)

    # Snapshot iteration momentum and post-local-update params.
    results = {}
    v_b_ = workspace.FetchBlob("{}_{}/fc_b_v".format(
        device_prefix, _device_pid(0, process_id)))
    v_w_ = workspace.FetchBlob("{}_{}/fc_w_v".format(
        device_prefix, _device_pid(0, process_id)))
    results['v_b_'] = v_b_
    results['v_w_'] = v_w_

    workspace.RunNetOnce(model.net)

    b_0_ = workspace.FetchBlob("{}_{}/fc_b".format(
        device_prefix, _device_pid(0, process_id)))
    w_0_ = workspace.FetchBlob("{}_{}/fc_w".format(
        device_prefix, _device_pid(0, process_id)))
    b_1_ = workspace.FetchBlob("{}_{}/fc_b".format(
        device_prefix, _device_pid(1, process_id)))
    w_1_ = workspace.FetchBlob("{}_{}/fc_w".format(
        device_prefix, _device_pid(1, process_id)))
    results['b_0_'] = b_0_
    results['w_0_'] = w_0_
    results['b_1_'] = b_1_
    results['w_1_'] = w_1_

    # Shard 0 overwrites sync_num so add_blobs_to_sync can be verified.
    if process_id == 0:
        workspace.FeedBlob(
            device_prefix + "_0/sync_num",
            np.array([2603]).astype(np.float32),
            device_option=core.DeviceOption(device_type, 0))

    # Block params before the global BMUF update.
    b_g_ = workspace.FetchBlob("{}_{}/fc_b_g".format(
        device_prefix, _device_pid(0, process_id)))
    w_g_ = workspace.FetchBlob("{}_{}/fc_w_g".format(
        device_prefix, _device_pid(0, process_id)))
    results['b_g_'] = b_g_
    results['w_g_'] = w_g_

    workspace.RunNetOnce(model._global_model_param_updates_net)

    v_b = workspace.FetchBlob("{}_{}/fc_b_v".format(
        device_prefix, _device_pid(0, process_id)))
    v_w = workspace.FetchBlob("{}_{}/fc_w_v".format(
        device_prefix, _device_pid(0, process_id)))
    w_g = workspace.FetchBlob("{}_{}/fc_w_g".format(
        device_prefix, _device_pid(0, process_id)))
    b_g = workspace.FetchBlob("{}_{}/fc_b_g".format(
        device_prefix, _device_pid(0, process_id)))
    w_0 = workspace.FetchBlob("{}_{}/fc_w".format(
        device_prefix, _device_pid(0, process_id)))
    b_0 = workspace.FetchBlob("{}_{}/fc_b".format(
        device_prefix, _device_pid(0, process_id)))
    w_1 = workspace.FetchBlob("{}_{}/fc_w".format(
        device_prefix, _device_pid(1, process_id)))
    b_1 = workspace.FetchBlob("{}_{}/fc_b".format(
        device_prefix, _device_pid(1, process_id)))
    results['v_b'] = v_b
    results['v_w'] = v_w
    results['w_g'] = w_g
    results['b_g'] = b_g
    results['w_0'] = w_0
    results['b_0'] = b_0
    results['w_1'] = w_1
    results['b_1'] = b_1

    # Every local device should have received the synced blob.
    for j in devices:
        sync = workspace.FetchBlob(
            device_prefix + "_{}/sync_num".format(j))[0]
        results['sync_{}'.format(j)] = sync

    shared_results[process_id] = results
def test_parallelize_bmuf(self, cpu_device):
    """Single-process BMUF over two local devices.

    Verifies the momentum and parameter equations of the global BMUF
    model update step against values computed by hand from blob
    snapshots taken before and after the update nets run.
    """
    assume(cpu_device or workspace.has_gpu_support)

    workspace.ResetWorkspace()

    model = cnn.CNNModelHelper(order="NHWC", name="test")
    devices = [0, 1]

    def input_builder_fun(model):
        return None

    if cpu_device:
        device_type = caffe2_pb2.CPU
        device_prefix = "cpu"
    else:
        device_type = caffe2_pb2.CUDA
        device_prefix = "gpu"

    self._generate_data(devices, device_type, device_prefix)

    data_parallel_model.Parallelize_BMUF(
        model,
        input_builder_fun,
        self._model_build_fun,
        self._param_update_fun,
        devices=devices,
        cpu_device=cpu_device)

    data_parallel_model.RunInitNet(model)

    # Momentum blobs must start out as zeros.
    self.assertEqual(
        list(viewkeys(model._device_grouped_blobs)), ['fc_w', 'fc_b'])
    self.assertEqual(
        workspace.FetchBlob('{}_0/fc_b_v'.format(device_prefix)), 0)
    np.testing.assert_equal(
        workspace.FetchBlob('{}_0/fc_w_v'.format(device_prefix)),
        np.zeros(16).astype(np.float32).reshape(1, 16))

    # Run the algorithm for one iteration to have non-zero params.
    data_parallel_model.RunNet(model, 1)

    # Snapshot momentum, then run a local update and snapshot the
    # resulting per-device params.
    v_b_ = workspace.FetchBlob('{}_0/fc_b_v'.format(device_prefix))
    v_w_ = workspace.FetchBlob('{}_0/fc_w_v'.format(device_prefix))

    workspace.RunNetOnce(model.net)

    b_0_ = workspace.FetchBlob('{}_0/fc_b'.format(device_prefix))
    w_0_ = workspace.FetchBlob('{}_0/fc_w'.format(device_prefix))
    b_1_ = workspace.FetchBlob('{}_1/fc_b'.format(device_prefix))
    w_1_ = workspace.FetchBlob('{}_1/fc_w'.format(device_prefix))

    # Block params before the global BMUF update.
    b_g_ = workspace.FetchBlob('{}_0/fc_b_g'.format(device_prefix))
    w_g_ = workspace.FetchBlob('{}_0/fc_w_g'.format(device_prefix))
    workspace.RunNetOnce(model._global_model_param_updates_net)

    # Block gradient: average of device params minus the block param.
    g_b = (b_0_ + b_1_) / 2 - b_g_
    g_w = (w_0_ + w_1_) / 2 - w_g_

    v_b = workspace.FetchBlob('{}_0/fc_b_v'.format(device_prefix))
    v_w = workspace.FetchBlob('{}_0/fc_w_v'.format(device_prefix))
    w_g = workspace.FetchBlob('{}_0/fc_w_g'.format(device_prefix))
    b_g = workspace.FetchBlob('{}_0/fc_b_g'.format(device_prefix))
    w_0 = workspace.FetchBlob('{}_0/fc_w'.format(device_prefix))
    b_0 = workspace.FetchBlob('{}_0/fc_b'.format(device_prefix))
    w_1 = workspace.FetchBlob('{}_1/fc_w'.format(device_prefix))
    b_1 = workspace.FetchBlob('{}_1/fc_b'.format(device_prefix))

    # Momentum update step: v <- 0.5 * v + g.
    np.testing.assert_equal(v_b, 0.5 * v_b_ + g_b)
    np.testing.assert_equal(v_w, 0.5 * v_w_ + g_w)
    # After the update every device agrees with the block params.
    np.testing.assert_equal(w_g, w_0)
    np.testing.assert_equal(w_g, w_1)
    np.testing.assert_equal(b_g, b_0)
    np.testing.assert_equal(b_g, b_1)
    # Param update step: param <- block_param + v.
    np.testing.assert_equal(w_0, w_g_ + v_w)
    np.testing.assert_equal(b_0, b_g_ + v_b)