Beispiel #1
0
predict_net_proto = caffe2_pb2.NetDef()
with open(PREDICT_NET, "r") as f:
    predict_net_proto.ParseFromString(f.read())
test_model.net = test_model.net.AppendNet(core.Net(predict_net_proto))

# Add an accuracy feature to the model for convenient reporting during testing
accuracy = brew.accuracy(test_model, ['softmax', 'label'], 'accuracy')

# ### Run Testing
#
# At this point, our model is initialized as the saved model from Part 1. We can now run the testing loop and check the accuracy.

# In[6]:

# Run the param init net to put the trained model info into the workspace
workspace.RunNetOnce(test_model.param_init_net)
workspace.CreateNet(test_model.net, overwrite=True)

# Stat keeper
avg_accuracy = 0.0

# Number of test iterations to run here, since the full test set is 10k images and the
#  batch size is 1, we will run 10000 test batches to cover the entire test set
test_iters = 10000

# Main testing loop
for i in range(test_iters):
    workspace.RunNet(test_model.net)
    acc = workspace.FetchBlob('accuracy')
    avg_accuracy += acc
    if (i % 500 == 0) and (i > 0):
def bmuf_process(filestore_dir, process_id, shared_results, nesterov=False):
    # We need to import caffe2 in every process to initialize CUDA independently.
    from caffe2.python import core, cnn, data_parallel_model, workspace, dyndep
    from caffe2.proto import caffe2_pb2
    dyndep.InitOpsLibrary("@/caffe2/caffe2/distributed:file_store_handler_ops")

    if not workspace.has_gpu_support:
        log.info('No GPU support test is Ignored.')
        return

    if workspace.NumCudaDevices() < 4:
        log.info('Not enough GPU support, test IGNORED')
        return

    model = cnn.CNNModelHelper(order="NHWC", name="test")

    gpu_ids = [0, 1] if process_id == 0 else [2, 3]

    def _model_build_fun(model, loss_scale):
        fc = model.FC("data", "fc", 16, 1, ("ConstantFill", {}),
                      ("ConstantFill", {}))
        fc_fl = model.FlattenToVec(fc, "fc_fl")
        sigm = model.Sigmoid(fc_fl, "sigm")
        sq = model.SquaredL2Distance([sigm, "label"], "sq")
        loss = model.AveragedLoss(sq, "loss")
        loss = model.Scale(loss, scale=loss_scale)
        return [loss]

    def _input_builder_fun(model):
        return None

    def _param_update_fun(model):
        ITER = model.Iter("ITER")
        LR = model.net.LearningRate(
            [ITER],
            "LR",
            base_lr=(-0.1),
            policy="fixed",
        )
        ONE = model.param_init_net.ConstantFill(
            [],
            "ONE",
            shape=[1],
            value=1.0,
        )
        for param in model.GetParams():
            grad = model.param_to_grad[param]
            model.WeightedSum([param, ONE, grad, LR], param)

    def _generate_data(gpu_devices, process_id):
        np.random.seed(26 + process_id * 10)
        # Each run has same input, independent of number of gpus
        batch_size = 64
        for _ in range(0, 10):
            full_data = np.random.rand(batch_size, 16)
            full_labels = np.round(full_data[:, 0])
            batch_per_device = batch_size // len(gpu_devices)

            for (j, g) in enumerate(gpu_devices):
                st = j * batch_per_device
                en = st + batch_per_device
                data = full_data[st:en, :].astype(np.float32)
                labels = full_labels[st:en].astype(np.float32)
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                    workspace.FeedBlob("gpu_{}/data".format(g), data)
                    workspace.FeedBlob("gpu_{}/label".format(g), labels)

    _generate_data(gpu_ids, process_id)

    workspace.RunOperatorOnce(
        core.CreateOperator("FileStoreHandlerCreate", [], ["store_handler"],
                            path=filestore_dir))
    rendezvous = dict(kv_handler="store_handler",
                      shard_id=process_id,
                      num_shards=2,
                      engine="GLOO",
                      exit_nets=None)

    data_parallel_model.Parallelize_GPU_BMUF(model,
                                             _input_builder_fun,
                                             _model_build_fun,
                                             _param_update_fun,
                                             devices=gpu_ids,
                                             rendezvous=rendezvous,
                                             nesterov=nesterov)

    data_parallel_model.RunInitNet(model)

    def _gpu_pid(gpu_id, pid):
        if pid == 1:
            return gpu_id + 2
        return gpu_id

    np.testing.assert_equal(
        workspace.FetchBlob("gpu_{}/fc_w_v".format(_gpu_pid(0, process_id))),
        np.zeros(16).astype(np.float32).reshape(1, 16))

    # Run the algorithm for one iteration to have non-zero params.
    data_parallel_model.RunNet(model, 1)

    # Save iteration momentum and post local update params
    results = {}
    v_b_ = workspace.FetchBlob("gpu_{}/fc_b_v".format(_gpu_pid(0, process_id)))
    v_w_ = workspace.FetchBlob("gpu_{}/fc_w_v".format(_gpu_pid(0, process_id)))

    results['v_b_'] = v_b_
    results['v_w_'] = v_w_

    workspace.RunNetOnce(model.net)

    b_0_ = workspace.FetchBlob("gpu_{}/fc_b".format(_gpu_pid(0, process_id)))
    w_0_ = workspace.FetchBlob("gpu_{}/fc_w".format(_gpu_pid(0, process_id)))
    b_1_ = workspace.FetchBlob("gpu_{}/fc_b".format(_gpu_pid(1, process_id)))
    w_1_ = workspace.FetchBlob("gpu_{}/fc_w".format(_gpu_pid(1, process_id)))

    results['b_0_'] = b_0_
    results['w_0_'] = w_0_
    results['b_1_'] = b_1_
    results['w_1_'] = w_1_

    # Compute block gradients.
    b_g_ = workspace.FetchBlob("gpu_{}/fc_b_g".format(_gpu_pid(0, process_id)))
    w_g_ = workspace.FetchBlob("gpu_{}/fc_w_g".format(_gpu_pid(0, process_id)))
    results['b_g_'] = b_g_
    results['w_g_'] = w_g_
    workspace.RunNetOnce(model._global_model_param_updates_net)

    #  g_b = (b_0_ + b_1_) / 2 - b_g_
    #  g_w = (w_0_ + w_1_) / 2 - w_g_
    v_b = workspace.FetchBlob("gpu_{}/fc_b_v".format(_gpu_pid(0, process_id)))
    v_w = workspace.FetchBlob("gpu_{}/fc_w_v".format(_gpu_pid(0, process_id)))
    w_g = workspace.FetchBlob("gpu_{}/fc_w_g".format(_gpu_pid(0, process_id)))
    b_g = workspace.FetchBlob("gpu_{}/fc_b_g".format(_gpu_pid(0, process_id)))
    w_0 = workspace.FetchBlob("gpu_{}/fc_w".format(_gpu_pid(0, process_id)))
    b_0 = workspace.FetchBlob("gpu_{}/fc_b".format(_gpu_pid(0, process_id)))
    w_1 = workspace.FetchBlob("gpu_{}/fc_w".format(_gpu_pid(1, process_id)))
    b_1 = workspace.FetchBlob("gpu_{}/fc_b".format(_gpu_pid(1, process_id)))
    results['v_b'] = v_b
    results['v_w'] = v_w
    results['w_g'] = w_g
    results['b_g'] = b_g
    results['w_0'] = w_0
    results['b_0'] = b_0
    results['w_1'] = w_1
    results['b_1'] = b_1

    shared_results[process_id] = results
    def test_lstm_with_recurrent_attention(
        self,
        encoder_output_length,
        encoder_output_dim,
        decoder_input_length,
        decoder_state_dim,
        batch_size,
        lstm_mem_optim,
        attention_mem_optim,
        gc,
        dc,
    ):
        with core.DeviceScope(gc):
            model = CNNModelHelper(name="external")
            (
                encoder_outputs,
                decoder_inputs,
                decoder_input_lengths,
                initial_decoder_hidden_state,
                initial_decoder_cell_state,
                initial_attention_weighted_encoder_context,
            ) = model.net.AddExternalInputs(
                "encoder_outputs",
                "decoder_inputs",
                "decoder_input_lengths",
                "initial_decoder_hidden_state",
                "initial_decoder_cell_state",
                "initial_attention_weighted_encoder_context",
            )
            recurrent.LSTMWithAttention(
                model=model,
                decoder_inputs=decoder_inputs,
                decoder_input_lengths=decoder_input_lengths,
                initial_decoder_hidden_state=initial_decoder_hidden_state,
                initial_decoder_cell_state=initial_decoder_cell_state,
                initial_attention_weighted_encoder_context=(
                    initial_attention_weighted_encoder_context),
                encoder_output_dim=encoder_output_dim,
                encoder_outputs=encoder_outputs,
                decoder_input_dim=decoder_state_dim,
                decoder_state_dim=decoder_state_dim,
                scope='external/LSTMWithAttention',
                attention_type=AttentionType.Recurrent,
                lstm_memory_optimization=lstm_mem_optim,
                attention_memory_optimization=attention_mem_optim)
            op = model.net._net.op[-1]
        workspace.RunNetOnce(model.param_init_net)

        # This is original decoder_inputs after linear layer
        decoder_input_blob = op.input[0]

        workspace.FeedBlob(
            decoder_input_blob,
            np.random.randn(
                decoder_input_length,
                batch_size,
                decoder_state_dim * 4,
            ).astype(np.float32))
        workspace.FeedBlob(
            "external/LSTMWithAttention/encoder_outputs_transposed",
            np.random.randn(
                batch_size,
                encoder_output_dim,
                encoder_output_length,
            ).astype(np.float32),
        )
        workspace.FeedBlob(
            "external/LSTMWithAttention/weighted_encoder_outputs",
            np.random.randn(
                encoder_output_length,
                batch_size,
                encoder_output_dim,
            ).astype(np.float32),
        )
        workspace.FeedBlob(
            decoder_input_lengths,
            np.random.randint(0, decoder_input_length + 1,
                              size=(batch_size, )).astype(np.int32))
        workspace.FeedBlob(
            initial_decoder_hidden_state,
            np.random.randn(1, batch_size,
                            decoder_state_dim).astype(np.float32))
        workspace.FeedBlob(
            initial_decoder_cell_state,
            np.random.randn(1, batch_size,
                            decoder_state_dim).astype(np.float32))
        workspace.FeedBlob(
            initial_attention_weighted_encoder_context,
            np.random.randn(1, batch_size,
                            encoder_output_dim).astype(np.float32))
        inputs = [workspace.FetchBlob(name) for name in op.input]

        self.assertReferenceChecks(
            device_option=gc,
            op=op,
            inputs=inputs,
            reference=lstm_with_recurrent_attention_reference,
            grad_reference=None,
            output_to_grad=None,
            outputs_to_check=range(6),
        )
        gradients_to_check = [
            index for (index, input_name) in enumerate(op.input)
            if input_name != "decoder_input_lengths"
        ]
        for param in gradients_to_check:
            self.assertGradientChecks(
                device_option=gc,
                op=op,
                inputs=inputs,
                outputs_to_check=param,
                outputs_with_grads=[0, 4],
                threshold=0.01,
                stepsize=0.001,
            )
    def run_model(self, V, gpu_devices):
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            gpu_vecs_gathered = []
            gpu_vecs = []
            for num, vec in enumerate(self.vecs):
                gpu_vec = model.param_init_net.CopyCPUToGPU(
                    vec,
                    'gpuvec_{}'.format(num),
                )
                if num != 2:
                    model.params.append(gpu_vec)
                gpu_vecs.append(gpu_vec)
            for num, gpu_vec in enumerate(gpu_vecs):
                gpu_vec_gathered = model.net.Gather(
                    [gpu_vec, 'indices'], ['gpu_vec_gathered_{}'.format(num)])
                gpu_vecs_gathered.append(gpu_vec_gathered)

            assert len(gpu_vecs_gathered) == 3

            fc = model.net.FC(
                [
                    gpu_vecs_gathered[2],
                    gpu_vecs_gathered[0],
                    gpu_vecs_gathered[1],
                ],
                ['fc'],
            )
            _, loss = model.net.SoftmaxWithLoss(
                [fc, 'label'],
                ['ce_loss', 'avg_loss'],
                only_loss=True,
            )
            loss = model.Scale(loss, scale=loss_scale)
            model.net.Print(loss, [], limit=10)
            return [loss]

        def param_update_fun(model):
            ONE = model.param_init_net.ConstantFill(
                [],
                "ONE",
                shape=[1],
                value=1.0,
            )
            LR = model.CopyCPUToGPU(self.LR, "LR")
            for param in model.GetParams():
                param_grad = model.param_to_grad[param]
                if not isinstance(param_grad, core.GradientSlice):
                    model.WeightedSum([param, ONE, param_grad, LR], param)
                else:
                    model.net.ScatterWeightedSum(
                        [
                            param,
                            ONE,
                            param_grad.indices,
                            param_grad.values,
                            ONE,
                        ],
                        param,
                    )

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="sparse_test{}".format(gpu_devices),
        )
        batch_size = 32
        batch_per_device = batch_size // len(gpu_devices)

        with core.NameScope("cpu"):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                self.ITER = model.Iter("ITER")
                self.LR = model.net.LearningRate(
                    [self.ITER],
                    "LR",
                    base_lr=(-0.1),
                    policy="fixed",
                )
                '''
                self.vecs consists of 3 big blobs on which we call Gather:
                1) FC weights, shape=(V, 16)
                2) FC bias, shape=(V)
                3) FC input, shape=(batch_per_device, 16)
                '''
                self.vecs = [
                    model.param_init_net.UniformFill([],
                                                     "vec_{}".format(num),
                                                     shape=[V, 16])
                    for num in range(2)
                ]
                self.vecs.append(
                    model.param_init_net.UniformFill(
                        [], "vec_2", shape=[batch_per_device, 16]))
                self.ONE_CPU = model.param_init_net.ConstantFill(
                    [],
                    "ONE_CPU",
                    shape=[1],
                    value=1.0,
                )

        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            param_update_builder_fun=param_update_fun,
            devices=gpu_devices,
        )

        # Update the vecs
        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
            for num, vec in enumerate(self.vecs[:-1]):
                model.CopyGPUToCPU("gpu_0/gpuvec_{}".format(num), vec)

        # Each run has same input, independent of number of gpus
        for i in range(0, 10):
            np.random.seed(2603)
            full_indices = np.random.permutation(V)[:batch_size].reshape(
                batch_size)
            full_labels = full_indices[:] % batch_per_device

            for (j, g) in enumerate(gpu_devices):
                st = j * batch_per_device
                en = st + batch_per_device
                indices = full_indices[st:en].astype(np.int32)
                labels = full_labels[st:en].astype(np.int32)

                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                    workspace.FeedBlob("gpu_{}/indices".format(g), indices)
                    workspace.FeedBlob("gpu_{}/label".format(g), labels)

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                # Force vecs to be same on all runs
                orig_vecs = [
                    np.random.rand(V, 16).astype(np.float32),
                    np.random.rand(V).astype(np.float32),
                    np.random.rand(V, 16).astype(np.float32),
                ]
                for vec, orig_vec in zip(self.vecs, orig_vecs):
                    workspace.FeedBlob(vec, orig_vec)
                for g in gpu_devices:
                    for num, orig_vec in enumerate(orig_vecs):
                        workspace.FeedBlob(
                            "gpu_{}/gpuvec_{}".format(g, num),
                            orig_vec,
                            device_option=core.DeviceOption(
                                caffe2_pb2.CUDA, g),
                        )
                workspace.CreateNet(model.net)

            workspace.RunNet(model.net.Proto().name)

            idx = workspace.FetchBlob('gpu_0/indices')
            grad_slices = [
                workspace.FetchBlob('gpu_{}/gpu_vec_gathered_{}_grad'.format(
                    g, num)) for g in gpu_devices for num in range(2)
            ]
            for grad_slice in grad_slices:
                # print (len(idx), len(grad_slice))
                assert len(idx) == len(grad_slice), (
                    'Number of indices {} is not same as number of gradient '
                    'slices {}. This might lead to illegal memory access'.
                    format(len(idx), len(grad_slice)))
def RunWarmup(model):
    workspace.RunNet(model.net, model._warmup_iterations)
    workspace.RunNetOnce(model._warmup_broadcast)
    def run_model(self, devices, gpu):
        '''
        Helper function for test_equiv
        '''
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            workspace.FeedBlob(
                core.ScopedBlobReference("seq_lengths"),
                np.array([self.T] * self.batch_per_device, dtype=np.int32))
            model.param_init_net.ConstantFill(
                [],
                "hidden_init",
                value=0.0,
                shape=[1, self.batch_per_device, self.hidden_dim])
            model.param_init_net.ConstantFill(
                [],
                "cell_init",
                value=0.0,
                shape=[1, self.batch_per_device, self.hidden_dim])

            output, _last_hidden, _, _last_state, = rnn_cell.LSTM(
                model=model,
                input_blob="data",
                seq_lengths="seq_lengths",
                initial_states=("hidden_init", "cell_init"),
                dim_in=self.input_dim,
                dim_out=self.hidden_dim,
                scope="partest",
            )

            # A silly loss function
            loss = model.AveragedLoss(
                model.Sub([output, "target"], "dist"),
                "loss",
            )
            loss = model.Scale(loss, "loss_scaled", scale=loss_scale)
            return [loss]

        def param_update_fun(model):
            ITER = model.Iter("ITER")
            LR = model.net.LearningRate(
                [ITER],
                "LR",
                base_lr=(-0.1),
                policy="fixed",
            )
            ONE = model.param_init_net.ConstantFill(
                [],
                "ONE",
                shape=[1],
                value=1.0,
            )
            for param in model.GetParams():
                param_grad = model.param_to_grad[param]
                model.WeightedSum([param, ONE, param_grad, LR], param)

            assert len(
                model.GetParams()) == len(model.params) // len(model._devices)

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(name="recurrent_test{}".format(devices), )

        self.T = 8
        self.batch_size = 64
        self.input_dim = 8
        self.hidden_dim = 31
        self.batch_per_device = self.batch_size // len(devices)

        data_parallel_model.Parallelize(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            param_update_builder_fun=param_update_fun,
            devices=devices,
            optimize_gradient_memory=True,
            cpu_device=not gpu,
        )

        # Change all initialization to be ConstantFills so that
        # the everything is deterministic
        for op in model.param_init_net.Proto().op:
            if op.type.endswith('Fill'):
                op.type = 'ConstantFill'

        # Each run has same input, independent of number of gpus
        np.random.seed(20150210)
        for i in range(0, 10):
            full_data = np.random.rand(self.T, self.batch_size, self.input_dim)
            full_target = np.random.rand(self.T, self.batch_size,
                                         self.hidden_dim)

            for (j, g) in enumerate(devices):
                st = j * self.batch_per_device
                en = st + self.batch_per_device
                data = full_data[:, st:en, :].astype(np.float32)
                targets = full_target[:, st:en, :].astype(np.float32)
                with core.DeviceScope(core.DeviceOption(model._device_type,
                                                        g)):
                    workspace.FeedBlob(
                        "{}_{}/data".format(model._device_prefix, g), data)
                    workspace.FeedBlob(
                        "{}_{}/target".format(model._device_prefix, g),
                        targets)

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                workspace.CreateNet(model.net)

            workspace.RunNet(model.net.Proto().name)

        return workspace.FetchBlob("{}_0/partest/i2h_w".format(
            model._device_prefix))
    def run_model(self, V, gpu_devices, cpu_indices):
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            if cpu_indices:
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                    gathered_cpu = model.net.Gather([self.vecs, 'indices'],
                                                    'gathered_cpu')

                gathered = model.CopyCPUToGPU(gathered_cpu, "gathered")
            else:
                gpu_vecs = model.param_init_net.CopyCPUToGPU(
                    self.vecs,
                    "gpuvecs",
                )
                model.params.append(gpu_vecs)
                gathered = model.net.Gather([gpu_vecs, 'indices'], 'gathered')
            flattened = model.Flatten(gathered, "flattened")
            fc = model.FC(flattened, "fc", 16 * 16, 1, ("ConstantFill", {}),
                          ("ConstantFill", {}))
            fc_fl = model.FlattenToVec(fc, "fc_fl")
            sigm = model.Sigmoid(fc_fl, "sigm")
            sq = model.SquaredL2Distance([sigm, "label"], "sq")
            loss = model.AveragedLoss(sq, "loss")
            loss = model.Scale(loss, scale=loss_scale)
            return [loss]

        def param_update_fun(model):
            ONE = model.param_init_net.ConstantFill(
                [],
                "ONE",
                shape=[1],
                value=1.0,
            )
            LR = model.CopyCPUToGPU(self.LR, "LR")
            for param in model.GetParams():
                param_grad = model.param_to_grad[param]
                if not isinstance(param_grad, core.GradientSlice):
                    model.WeightedSum([param, ONE, param_grad, LR], param)
                else:
                    param_momentum = model.param_init_net.ConstantFill(
                        [param],
                        param + '_momentum',
                        value=0.0,
                    )
                    model.net.SparseMomentumSGDUpdate(
                        [
                            param_grad.values,
                            param_momentum,
                            LR,
                            param,
                            param_grad.indices,
                        ],
                        [param_grad.values, param_momentum, param],
                        momentum=0.1,
                        nesterov=0,
                    )

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="sparse_test{}".format(gpu_devices),
        )

        with core.NameScope("cpu"):
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                self.ITER = model.Iter("ITER")
                self.LR = model.net.LearningRate(
                    [self.ITER],
                    "LR",
                    base_lr=(-0.1),
                    policy="fixed",
                )
                self.vecs = model.param_init_net.UniformFill([],
                                                             "vecs",
                                                             shape=[V, 16])
                if cpu_indices:
                    model.params.append(self.vecs)
                self.ONE_CPU = model.param_init_net.ConstantFill(
                    [],
                    "ONE_CPU",
                    shape=[1],
                    value=1.0,
                )

        data_parallel_model.Parallelize_GPU(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            param_update_builder_fun=param_update_fun,
            devices=gpu_devices,
        )

        # Update the vecs
        if cpu_indices:
            with core.NameScope("cpu"):
                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
                    for param in model.GetParams():
                        param_grad = model.param_to_grad[param]
                        model.ScatterWeightedSum([
                            param, self.ONE_CPU, param_grad.indices,
                            param_grad.values, self.LR
                        ], self.vecs)
        else:
            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
                model.CopyGPUToCPU("gpu_0/gpuvecs", self.vecs)

        np.random.seed(2603)

        # Each run has same input, independent of number of gpus
        batch_size = 64
        for i in range(0, 10):
            full_indices = np.random.permutation(V)[:batch_size * 16].reshape(
                batch_size, 16)
            full_labels = full_indices[:, 0] % 2
            batch_per_device = batch_size // len(gpu_devices)

            for (j, g) in enumerate(gpu_devices):
                st = j * batch_per_device
                en = st + batch_per_device
                indices = full_indices[st:en, :].astype(np.int32)
                labels = full_labels[st:en].astype(np.float32)

                device_for_indices = core.DeviceOption(caffe2_pb2.CPU)
                if not cpu_indices:
                    device_for_indices = core.DeviceOption(caffe2_pb2.CUDA, g)

                with core.DeviceScope(device_for_indices):
                    workspace.FeedBlob("gpu_{}/indices".format(g), indices)

                with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, g)):
                    workspace.FeedBlob("gpu_{}/label".format(g), labels)

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                # Force vecs to be same on all runs
                orig_vecs = np.random.rand(V, 16).astype(np.float32)
                workspace.FeedBlob(self.vecs, orig_vecs)
                if not cpu_indices:
                    for g in gpu_devices:
                        workspace.FeedBlob(
                            "gpu_{}/gpuvecs".format(g),
                            orig_vecs,
                            device_option=core.DeviceOption(
                                caffe2_pb2.CUDA, g),
                        )
                workspace.CreateNet(model.net)

            workspace.RunNet(model.net.Proto().name)
            if len(gpu_devices) == 2:
                if not cpu_indices:
                    idx = workspace.FetchBlob("gpu_0/indices")
                    idx = list(idx.flatten())
                    n = len(idx)
                    nu = len(set(idx))
                    assert n == nu, "We cannot have duplicate indices"

        # Sanity check to see the vecs were updated
        self.assertFalse(np.allclose(workspace.FetchBlob(self.vecs),
                                     orig_vecs))
        return [
            workspace.FetchBlob(self.vecs if cpu_indices else "gpu_0/gpuvecs"),
            workspace.FetchBlob("gpu_0/fc_w")
        ]
Beispiel #8
0
    def test_imageinput(self, size_tuple, means, stds, gc, dc):
        # TODO: Does not test on GPU and does not test use_gpu_transform
        # WARNING: Using ModelHelper automatically does NHWC to NCHW
        # transformation if needed.
        width, height, minsize, crop = size_tuple
        means = [float(m) for m in means]
        stds = [float(s) for s in stds]
        out_dir = tempfile.mkdtemp()
        count_images = 2  # One with bounding box and one without
        expected_images = create_test(out_dir,
                                      width=width,
                                      height=height,
                                      default_bound=(3, 5, height - 3,
                                                     width - 5),
                                      minsize=minsize,
                                      crop=crop,
                                      means=means,
                                      stds=stds,
                                      count=count_images)
        for device_option in dc:
            with hu.temp_workspace():
                reader_net = core.Net('reader')
                reader_net.CreateDB([], 'DB', db=out_dir, db_type="lmdb")
                workspace.RunNetOnce(reader_net)
                imageop = core.CreateOperator(
                    'ImageInput', ['DB'], ["data", "label"],
                    batch_size=count_images,
                    color=3,
                    minsize=minsize,
                    crop=crop,
                    is_test=True,
                    bounding_ymin=3,
                    bounding_xmin=5,
                    bounding_height=height - 3,
                    bounding_width=width - 5,
                    mean_per_channel=means,
                    std_per_channel=stds,
                    use_gpu_transform=(device_option.device_type == 1))

                imageop.device_option.CopyFrom(device_option)
                main_net = core.Net('main')
                main_net.Proto().op.extend([imageop])
                workspace.RunNetOnce(main_net)
                l = workspace.FetchBlob('label')
                result = workspace.FetchBlob('data').astype(np.int32)
                # If we don't use_gpu_transform, the output is in NHWC
                # Our reference output is CHW so we swap
                if device_option.device_type != 1:
                    expected = [
                        img.swapaxes(0, 1).swapaxes(1, 2)
                        for img in expected_images
                    ]
                else:
                    expected = expected_images
                for i in range(count_images):
                    self.assertEqual(l[i], i)
                    self.assertEqual((expected[i] - result[i] > 1).sum(), 0)
                # End for
            # End with
        # End for
        shutil.rmtree(out_dir)
Beispiel #9
0
def _prepare_rnn(t,
                 n,
                 dim_in,
                 create_rnn,
                 outputs_with_grads,
                 forget_bias,
                 memory_optim=False,
                 forward_only=False,
                 drop_states=False,
                 T=None,
                 two_d_initial_states=None,
                 dim_out=None):
    if dim_out is None:
        dim_out = [dim_in]
    print("Dims: ", t, n, dim_in, dim_out)

    model = ModelHelper(name='external')

    if two_d_initial_states is None:
        two_d_initial_states = np.random.randint(2)

    def generate_input_state(n, d):
        if two_d_initial_states:
            return np.random.randn(n, d).astype(np.float32)
        else:
            return np.random.randn(1, n, d).astype(np.float32)

    states = []
    for layer_id, d in enumerate(dim_out):
        h, c = model.net.AddExternalInputs(
            "hidden_init_{}".format(layer_id),
            "cell_init_{}".format(layer_id),
        )
        states.extend([h, c])
        workspace.FeedBlob(h, generate_input_state(n, d).astype(np.float32))
        workspace.FeedBlob(c, generate_input_state(n, d).astype(np.float32))

    # Due to convoluted RNN scoping logic we make sure that things
    # work from a namescope
    with scope.NameScope("test_name_scope"):
        input_blob, seq_lengths = model.net.AddScopedExternalInputs(
            'input_blob', 'seq_lengths')

        outputs = create_rnn(
            model,
            input_blob,
            seq_lengths,
            states,
            dim_in=dim_in,
            dim_out=dim_out,
            scope="external/recurrent",
            outputs_with_grads=outputs_with_grads,
            memory_optimization=memory_optim,
            forget_bias=forget_bias,
            forward_only=forward_only,
            drop_states=drop_states,
            static_rnn_unroll_size=T,
        )

    workspace.RunNetOnce(model.param_init_net)

    workspace.FeedBlob(
        seq_lengths,
        np.random.randint(1, t + 1, size=(n, )).astype(np.int32))
    return outputs, model.net, states + [input_blob]
Beispiel #10
0
    def caffe2_yellowfin(self, zero_debias, grad_coef, n_dim, n_iter, gpu):
        caffe2_res = {}

        alpha = 1.0
        mu = 0.0
        beta = 0.999
        curv_win_width = 20
        epsilon = 1e-6

        net = core.Net("net")
        param_init_net = core.Net("param_init_net")
        workspace.ResetWorkspace()

        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
            iteration = param_init_net.ConstantFill([],
                                                    "iteration",
                                                    shape=[1],
                                                    value=0,
                                                    dtype=core.DataType.INT64)
            iter_mutex = param_init_net.CreateMutex([], ["iteration_mutex"])
            net.AtomicIter([iter_mutex, iteration], [iteration])
        pre_grad = param_init_net.ConstantFill([],
                                               "pre_grad",
                                               shape=[n_dim],
                                               value=grad_coef)
        if gpu:
            iteration = net.CopyCPUToGPU([iteration], "iteration_cpu")
        iteration_float = net.Cast([iteration], "iteration_float")
        grad = net.Mul([pre_grad, iteration_float], "grad", broadcast=True)
        w = param_init_net.ConstantFill([], "w", shape=[n_dim], value=0.0)

        # a hack to create an object with __dict__
        param_info = lambda: None
        param_info.blob = w
        param_info.grad = grad

        optimizer.YellowFinOptimizer(alpha=alpha,
                                     mu=mu,
                                     beta=beta,
                                     curv_win_width=curv_win_width,
                                     epsilon=epsilon,
                                     zero_debias=zero_debias)._run(
                                         net, param_init_net, param_info)

        workspace.RunNetOnce(param_init_net)
        workspace.CreateNet(net, overwrite=True)
        for i in range(n_iter):
            workspace.RunNet(net)
            scalars_memory_blob = workspace.FetchBlob("w_scalars_memory")
            g_norm2_avg = scalars_memory_blob[1]
            g_norm2_min_avg = scalars_memory_blob[2]
            g_norm2_max_avg = scalars_memory_blob[3]
            distance_avg = scalars_memory_blob[4]
            g_avg_blob = workspace.FetchBlob("w_g_avg")
            res_lr = workspace.FetchBlob("w_lr_avg")[0]
            res_mu = workspace.FetchBlob("w_mu_avg")[0]
            g_deb = self.deb(g_avg_blob, beta, i + 1, zero_debias)
            variance = max(
                self.deb(g_norm2_avg, beta, i + 1, zero_debias) -
                g_deb.dot(g_deb), epsilon)
            if i > 0:
                caffe2_res[i] = {
                    'h_max':
                    np.exp(self.deb(g_norm2_max_avg, beta, i + 1,
                                    zero_debias)),
                    'h_min':
                    np.exp(self.deb(g_norm2_min_avg, beta, i + 1,
                                    zero_debias)),
                    'var':
                    variance,
                    'dist':
                    self.deb(distance_avg, beta, i + 1, zero_debias),
                    'lr':
                    res_lr,
                    'mu':
                    res_mu
                }
        return caffe2_res
def run_test(size_tuple,
             means,
             stds,
             label_type,
             num_labels,
             is_test,
             scale_jitter_type,
             color_jitter,
             color_lighting,
             dc,
             validator,
             output1=None,
             output2_size=None):
    # TODO: Does not test on GPU and does not test use_gpu_transform
    # WARNING: Using ModelHelper automatically does NHWC to NCHW
    # transformation if needed.
    width, height, minsize, crop = size_tuple
    means = [float(m) for m in means]
    stds = [float(s) for s in stds]
    out_dir = tempfile.mkdtemp()
    count_images = 2  # One with bounding box and one without
    expected_images = create_test(out_dir,
                                  width=width,
                                  height=height,
                                  default_bound=(3, 5, height - 3, width - 5),
                                  minsize=minsize,
                                  crop=crop,
                                  means=means,
                                  stds=stds,
                                  count=count_images,
                                  label_type=label_type,
                                  num_labels=num_labels,
                                  output1=output1,
                                  output2_size=output2_size)
    for device_option in dc:
        with hu.temp_workspace():
            reader_net = core.Net('reader')
            reader_net.CreateDB([], 'DB', db=out_dir, db_type="lmdb")
            workspace.RunNetOnce(reader_net)
            outputs = ['data', 'label']
            output_sizes = []
            if output1:
                outputs.append('output1')
                output_sizes.append(1)
            if output2_size:
                outputs.append('output2')
                output_sizes.append(output2_size)
            imageop = core.CreateOperator(
                'ImageInput', ['DB'],
                outputs,
                batch_size=count_images,
                color=3,
                minsize=minsize,
                crop=crop,
                is_test=is_test,
                bounding_ymin=3,
                bounding_xmin=5,
                bounding_height=height - 3,
                bounding_width=width - 5,
                mean_per_channel=means,
                std_per_channel=stds,
                use_gpu_transform=(device_option.device_type == 1),
                label_type=label_type,
                num_labels=num_labels,
                output_sizes=output_sizes,
                scale_jitter_type=scale_jitter_type,
                color_jitter=color_jitter,
                color_lighting=color_lighting)

            imageop.device_option.CopyFrom(device_option)
            main_net = core.Net('main')
            main_net.Proto().op.extend([imageop])
            workspace.RunNetOnce(main_net)
            validator(expected_images, device_option, count_images)
            # End for
        # End with
    # End for
    shutil.rmtree(out_dir)
Beispiel #12
0
def run_conv_or_fc(
    test_case,
    init_net,
    net,
    X,
    W,
    b,
    op_type,
    engine,
    order,
    gc,
    outputs,
    scale=None,
    zero_point=None,
    x_scale=None,
    x_zero_point=None,
):
    if order:
        # Conv
        Output = collections.namedtuple("Output", ["Y", "op_type", "engine", "order"])
    else:
        # FC
        Output = collections.namedtuple("Output", ["Y", "op_type", "engine"])

    # We run DNNLOWP ops multiple times to test their first runs that
    # do caching so exercises different code paths from the subsequent
    # runs

    # self.ws.run re-creates operator every time so this test covers
    # cases when we have multiple nets sharing the same workspace
    test_case.ws.create_blob("X").feed(X, device_option=gc)
    test_case.ws.create_blob("W").feed(W, device_option=gc)
    test_case.ws.create_blob("b").feed(b, device_option=gc)
    if scale is not None and zero_point is not None:
        with workspace.WorkspaceGuard(test_case.ws):
            dnnlowp_pybind11.CreateInt8QuantParamsBlob(
                "quant_param", float(scale), int(zero_point)
            )
    if x_scale is not None and x_zero_point is not None:
        with workspace.WorkspaceGuard(test_case.ws):
            dnnlowp_pybind11.CreateInt8QuantParamsBlob(
                "X_quant_param", float(x_scale), int(x_zero_point)
            )

    if init_net:
        test_case.ws.run(init_net)
    for i in range(1 if engine == "" else 2):
        test_case.ws.run(net)
        Y = test_case.ws.blobs["Y"].fetch()
        if order:
            outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))
        else:
            outputs.append(Output(Y=Y, op_type=op_type, engine=engine))

    # workspace.CreateNet + workspace.RunNet reuses the same operator
    if engine != "":
        workspace.FeedBlob("X", X)
        workspace.FeedBlob("W", W)
        workspace.FeedBlob("b", b)
        if scale is not None and zero_point is not None:
            dnnlowp_pybind11.CreateInt8QuantParamsBlob(
                "quant_param", float(scale), int(zero_point)
            )
        if x_scale is not None and x_zero_point is not None:
            dnnlowp_pybind11.CreateInt8QuantParamsBlob(
                "X_quant_param", float(x_scale), int(x_zero_point)
            )

        if init_net:
            workspace.RunNetOnce(init_net)
        workspace.CreateNet(net)
        for i in range(2):
            workspace.RunNet(net)
            Y = workspace.FetchBlob("Y")
            if order:
                outputs.append(Output(Y=Y, op_type=op_type, engine=engine, order=order))
            else:
                outputs.append(Output(Y=Y, op_type=op_type, engine=engine))
Beispiel #13
0
    def test_convolution_grouped_sum_relu_fusion(self, stride, pad, kernel,
                                                 size, input_channels,
                                                 output_channels, batch_size,
                                                 use_bias, group, gc, dc):
        conv_S0 = core.CreateOperator(
            "Conv", ["SX0", "Sw0", "Sb0"] if use_bias else ["SX0", "Sw0"],
            ["S0"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            group=group,
            device_option=dc[0])
        conv = core.CreateOperator(
            "Conv", ["X0", "w0", "b0"] if use_bias else ["X0", "w0"], ["Y0"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            group=group,
            device_option=dc[0])
        sum = core.CreateOperator("Sum", ["S0", "Y0"], ["S0"],
                                  device_option=dc[0])
        relu = core.CreateOperator("Relu", ["S0"], ["S0"], device_option=dc[0])

        SX = np.random.rand(batch_size, input_channels * group, size,
                            size).astype(np.float32) - 0.5
        Sw = np.random.rand(
                output_channels * group, input_channels, kernel, kernel) \
            .astype(np.float32) - 0.5
        Sb = np.random.rand(output_channels * group).astype(np.float32) - 0.5
        X = np.random.rand(batch_size, input_channels * group, size,
                           size).astype(np.float32) - 0.5
        w = np.random.rand(
                output_channels * group, input_channels, kernel, kernel) \
            .astype(np.float32) - 0.5
        b = np.random.rand(output_channels * group).astype(np.float32) - 0.5

        old_ws_name = workspace.CurrentWorkspace()
        workspace.SwitchWorkspace("_device_check_", True)
        workspace.FeedBlob('SX0', SX, dc[0])
        workspace.FeedBlob('Sw0', Sw, dc[0])
        workspace.FeedBlob('Sb0', Sb, dc[0])
        workspace.FeedBlob('X0', X, dc[0])
        workspace.FeedBlob('w0', w, dc[0])
        workspace.FeedBlob('b0', b, dc[0])
        workspace.RunOperatorOnce(conv_S0)
        workspace.RunOperatorOnce(conv)
        workspace.RunOperatorOnce(sum)
        workspace.RunOperatorOnce(relu)
        S0 = workspace.FetchBlob('S0')

        workspace.ResetWorkspace()
        old_net = caffe2_pb2.NetDef()
        conv_S0_old = caffe2_pb2.OperatorDef()
        conv_S0_old.CopyFrom(conv_S0)
        conv_S0_old.device_option.CopyFrom(dc[1])
        conv_old = caffe2_pb2.OperatorDef()
        conv_old.CopyFrom(conv)
        conv_old.device_option.CopyFrom(dc[1])
        sum_old = caffe2_pb2.OperatorDef()
        sum_old.CopyFrom(sum)
        sum_old.device_option.CopyFrom(dc[1])
        relu_old = caffe2_pb2.OperatorDef()
        relu_old.CopyFrom(relu)
        relu_old.device_option.CopyFrom(dc[1])
        old_net.op.extend([conv_S0_old, conv_old, sum_old, relu_old])
        workspace.FeedBlob('SX0', SX, dc[1])
        workspace.FeedBlob('Sw0', Sw, dc[1])
        workspace.FeedBlob('Sb0', Sb, dc[1])
        workspace.FeedBlob('X0', X, dc[1])
        workspace.FeedBlob('w0', w, dc[1])
        workspace.FeedBlob('b0', b, dc[1])
        net = core.Net("net")
        net.Proto().CopyFrom(old_net)
        optimizeForMKLDNN(net)
        workspace.RunNetOnce(net.Proto())
        # The output tensor name will be changed by optimization
        # sometimes when applying conv sum fusion
        S2 = workspace.FetchBlob(net.Proto().op[-1].output[0])
        if not np.allclose(S0, S2, atol=0.01, rtol=0.01):
            print(S2.flatten())
            print(S0.flatten())
            print(np.max(np.abs(S2 - S0)))
            self.assertTrue(False)

        workspace.SwitchWorkspace(old_ws_name)
Beispiel #14
0
    def test_convolution_sum_fusion(self, stride, pad, kernel, size,
                                    input_channels, output_channels,
                                    batch_size, use_bias, group, sum_add, gc,
                                    dc):
        pool_S0 = core.CreateOperator("MaxPool", ["SX0"], ["S0"],
                                      stride=2,
                                      pad=0,
                                      kernel=2,
                                      device_option=dc[0])
        conv = core.CreateOperator(
            "Conv", ["X0", "w0", "b0"] if use_bias else ["X0", "w0"], ["Y0"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            group=group,
            device_option=dc[0])
        sum = core.CreateOperator(sum_add, ["S0", "Y0"], ["S0"],
                                  device_option=dc[0])

        # Manual fusion for Conv + Sum
        pool_S1 = core.CreateOperator("MaxPool", ["SX1"], ["S1"],
                                      stride=2,
                                      pad=0,
                                      kernel=2,
                                      group=group,
                                      device_option=dc[1])
        conv_fusion = core.CreateOperator(
            "ConvFusion",
            ["X1", "w1", "b1", "S1"] if use_bias else ["X1", "w1", "S1"],
            ["S1"],
            stride=stride,
            pad=pad,
            kernel=kernel,
            group=group,
            fusion_type=2,
            device_option=dc[1])
        pool_input_size = int(
            math.ceil(float(size + 2 * pad - kernel + 1) / stride)) * 2
        SX = np.random.rand(batch_size, output_channels * group,
                            pool_input_size, pool_input_size).astype(
                                np.float32) - 0.5
        X = np.random.rand(batch_size, input_channels * group, size,
                           size).astype(np.float32) - 0.5
        w = np.random.rand(
                output_channels * group, input_channels, kernel, kernel) \
            .astype(np.float32) - 0.5
        b = np.random.rand(output_channels * group).astype(np.float32) - 0.5

        old_ws_name = workspace.CurrentWorkspace()
        workspace.SwitchWorkspace("_device_check_", True)
        workspace.FeedBlob('SX0', SX, dc[0])
        workspace.FeedBlob('X0', X, dc[0])
        workspace.FeedBlob('w0', w, dc[0])
        workspace.FeedBlob('b0', b, dc[0])
        workspace.RunOperatorOnce(pool_S0)
        workspace.RunOperatorOnce(conv)
        workspace.RunOperatorOnce(sum)
        S0 = workspace.FetchBlob('S0')

        workspace.ResetWorkspace()
        workspace.FeedBlob('SX1', SX, dc[1])
        workspace.FeedBlob('X1', X, dc[1])
        workspace.FeedBlob('w1', w, dc[1])
        workspace.FeedBlob('b1', b, dc[1])
        workspace.RunOperatorOnce(pool_S1)
        workspace.RunOperatorOnce(conv_fusion)
        S1 = workspace.FetchBlob('S1')

        if not np.allclose(S0, S1, atol=0.01, rtol=0.01):
            print(S1.flatten())
            print(S0.flatten())
            print(np.max(np.abs(S1 - S0)))
            self.assertTrue(False)

        # Auto fusion for Conv + Sum
        workspace.ResetWorkspace()
        old_net = caffe2_pb2.NetDef()
        pool_S0_old = caffe2_pb2.OperatorDef()
        pool_S0_old.CopyFrom(pool_S0)
        pool_S0_old.device_option.CopyFrom(dc[1])
        conv_old = caffe2_pb2.OperatorDef()
        conv_old.CopyFrom(conv)
        conv_old.device_option.CopyFrom(dc[1])
        sum_old = caffe2_pb2.OperatorDef()
        sum_old.CopyFrom(sum)
        sum_old.device_option.CopyFrom(dc[1])
        old_net.op.extend([pool_S0_old, conv_old, sum_old])

        # Conv + Sum should be fused case: [PreNode, Conv, Sum]
        workspace.FeedBlob('SX0', SX, dc[1])
        workspace.FeedBlob('X0', X, dc[1])
        workspace.FeedBlob('w0', w, dc[1])
        workspace.FeedBlob('b0', b, dc[1])
        net = core.Net("net")
        net.Proto().CopyFrom(old_net)
        optimizeForMKLDNN(net)
        self.assertTrue(len(net.Proto().op) == 2)
        self.assertTrue(net.Proto().op[1].type == "ConvFusion")
        workspace.RunNetOnce(net.Proto())
        # The output tensor name will be changed by optimization
        # sometimes when applying conv sum fusion
        S2 = workspace.FetchBlob(net.Proto().op[-1].output[0])
        if not np.allclose(S0, S2, atol=0.01, rtol=0.01):
            print(S2.flatten())
            print(S0.flatten())
            print(np.max(np.abs(S2 - S0)))
            self.assertTrue(False)

        # Conv + Sum should be fused case: [Conv, PreNode, Sum]
        workspace.ResetWorkspace()
        old_net = caffe2_pb2.NetDef()
        workspace.FeedBlob('SX0', SX, dc[1])
        workspace.FeedBlob('X0', X, dc[1])
        workspace.FeedBlob('w0', w, dc[1])
        workspace.FeedBlob('b0', b, dc[1])
        old_net.op.extend([conv_old, pool_S0_old, sum_old])
        net = core.Net("net")
        net.Proto().CopyFrom(old_net)
        optimizeForMKLDNN(net)
        self.assertTrue(len(net.Proto().op) == 2)
        self.assertTrue(net.Proto().op[1].type == "ConvFusion")
        workspace.RunNetOnce(net.Proto())
        # The output tensor name will be changed by optimization
        # sometimes when applying conv sum fusion
        S2 = workspace.FetchBlob(net.Proto().op[-1].output[0])
        if not np.allclose(S0, S2, atol=0.01, rtol=0.01):
            print(S2.flatten())
            print(S0.flatten())
            print(np.max(np.abs(S2 - S0)))
            self.assertTrue(False)

        # Conv + Sum should not be fused case: [Conv, midOp, preNode, Sum] Conv output is used by midOp
        dropout = core.CreateOperator("Dropout", ["Y0"], ["Y_dropout"],
                                      ratio=0.5,
                                      is_test=True,
                                      device_option=dc[1])

        workspace.ResetWorkspace()
        workspace.FeedBlob('SX0', SX, dc[1])
        workspace.FeedBlob('X0', X, dc[1])
        workspace.FeedBlob('w0', w, dc[1])
        workspace.FeedBlob('b0', b, dc[1])
        old_net = caffe2_pb2.NetDef()
        old_net.op.extend([conv_old, dropout, pool_S0_old, sum_old])
        net = core.Net("net")
        net.Proto().CopyFrom(old_net)
        optimizeForMKLDNN(net)
        self.assertTrue(len(net.Proto().op) == 4)
        workspace.RunNetOnce(net.Proto())
        S2 = workspace.FetchBlob(net.Proto().op[-1].output[0])
        if not np.allclose(S0, S2, atol=0.01, rtol=0.01):
            print(S2.flatten())
            print(S0.flatten())
            print(np.max(np.abs(S2 - S0)))
            self.assertTrue(False)

        # Conv + Sum should not be fused case: [Conv, preNode, Sum, midOp] preNode output is used by midOp
        sum1 = core.CreateOperator(sum_add, ["S0", "Y0"], ["S3"],
                                   device_option=dc[1])
        dropout = core.CreateOperator("Dropout", ["S0"], ["Y_dropout"],
                                      ratio=0.5,
                                      is_test=True,
                                      device_option=dc[1])

        workspace.ResetWorkspace()
        workspace.FeedBlob('SX0', SX, dc[1])
        workspace.FeedBlob('X0', X, dc[1])
        workspace.FeedBlob('w0', w, dc[1])
        workspace.FeedBlob('b0', b, dc[1])
        old_net = caffe2_pb2.NetDef()
        old_net.op.extend([conv_old, pool_S0_old, sum1, dropout])
        net = core.Net("net")
        net.Proto().CopyFrom(old_net)
        optimizeForMKLDNN(net)
        print("net={}\n".format(net.Proto()))
        self.assertTrue(len(net.Proto().op) == 4)
        workspace.RunNetOnce(net.Proto())
        S2 = workspace.FetchBlob(net.Proto().op[-2].output[0])
        if not np.allclose(S0, S2, atol=0.01, rtol=0.01):
            print(S2.flatten())
            print(S0.flatten())
            print(np.max(np.abs(S2 - S0)))
            self.assertTrue(False)

        # Conv + Sum should not be fused case: [Conv, midOp, preNode, Sum]
        # midOp output has the same name with that of the Conv input
        relu_0 = core.CreateOperator("Relu", ["X0"], ["X1"],
                                     device_option=dc[0])
        conv = core.CreateOperator(
            "Conv", ["X1", "w0", "b0"] if use_bias else ["X1", "w0"], ["Y0"],
            stride=1,
            pad=0,
            kernel=1,
            device_option=dc[0])
        relu_1 = core.CreateOperator("Relu", ["X1"], ["X1"],
                                     device_option=dc[0])
        pool = core.CreateOperator("MaxPool", ["X1"], ["S0"],
                                   stride=1,
                                   pad=0,
                                   kernel=1,
                                   device_option=dc[0])
        sum = core.CreateOperator("Sum", ["S0", "Y0"], ["S0"],
                                  device_option=dc[0])

        X = np.random.rand(batch_size, input_channels, size, size).astype(
            np.float32) - 0.5
        w = np.random.rand(input_channels, input_channels, 1, 1).astype(
            np.float32) - 0.5
        b = np.random.rand(input_channels).astype(np.float32) - 0.5

        workspace.SwitchWorkspace(old_ws_name)
        workspace.ResetWorkspace()
        workspace.FeedBlob('X0', X, dc[0])
        workspace.FeedBlob('w0', w, dc[0])
        workspace.FeedBlob('b0', b, dc[0])
        workspace.RunOperatorOnce(relu_0)
        workspace.RunOperatorOnce(conv)
        workspace.RunOperatorOnce(relu_1)
        workspace.RunOperatorOnce(pool)
        workspace.RunOperatorOnce(sum)
        S0 = workspace.FetchBlob('S0')

        workspace.ResetWorkspace()
        workspace.FeedBlob('X0', X, dc[1])
        workspace.FeedBlob('w0', w, dc[1])
        workspace.FeedBlob('b0', b, dc[1])
        relu_0_old = caffe2_pb2.OperatorDef()
        relu_0_old.CopyFrom(relu_0)
        relu_0_old.device_option.CopyFrom(dc[1])
        conv_old = caffe2_pb2.OperatorDef()
        conv_old.CopyFrom(conv)
        conv_old.device_option.CopyFrom(dc[1])
        relu_1_old = caffe2_pb2.OperatorDef()
        relu_1_old.CopyFrom(relu_1)
        relu_1_old.device_option.CopyFrom(dc[1])
        pool_old = caffe2_pb2.OperatorDef()
        pool_old.CopyFrom(pool)
        pool_old.device_option.CopyFrom(dc[1])
        sum_old = caffe2_pb2.OperatorDef()
        sum_old.CopyFrom(sum)
        sum_old.device_option.CopyFrom(dc[1])

        old_net = caffe2_pb2.NetDef()
        old_net.op.extend(
            [relu_0_old, conv_old, relu_1_old, pool_old, sum_old])
        net = core.Net("net")
        net.Proto().CopyFrom(old_net)
        optimizeForMKLDNN(net)
        self.assertTrue(len(net.Proto().op) == 5)
        workspace.RunNetOnce(net.Proto())
        S2 = workspace.FetchBlob(net.Proto().op[-1].output[0])
        if not np.allclose(S0, S2, atol=0.01, rtol=0.01):
            print(S2.flatten())
            print(S0.flatten())
            print(np.max(np.abs(S2 - S0)))
            self.assertTrue(False)
    def test_sparse_lengths_sum(self, num_rows, blocksize, weighted, seed,
                                empty_indices, fp16):
        net = core.Net("bench")

        np.random.seed(seed)

        if fp16:
            input_data = np.random.rand(num_rows, blocksize).astype(np.float16)
        else:
            input_data = np.random.rand(num_rows, blocksize).astype(np.float32)
        if empty_indices:
            lengths = np.zeros(num_rows, dtype=np.int32)
            num_indices = 0
        else:
            num_indices = np.random.randint(len(input_data))
            # the number of indices per sample
            lengths_split = np.clip(num_indices // 2, 1, 10)
            lengths = (
                np.ones([num_indices // lengths_split], dtype=np.int32) *
                lengths_split)
            # readjust num_indices when lengths_split doesn't divide num_indices
            num_indices = num_indices // lengths_split * lengths_split
        indices = np.random.randint(low=0,
                                    high=len(input_data),
                                    size=[num_indices],
                                    dtype=np.int32)
        weights = np.random.uniform(size=[len(indices)]).astype(np.float32)

        if fp16:
            quantized_data = net.HalfFloatToFused8BitRowwiseQuantized(
                "input_data", "quantized_data")
            dequantized_data = net.Fused8BitRowwiseQuantizedToHalfFloat(
                quantized_data, "dequantized_data")
        else:
            quantized_data = net.FloatToFused8BitRowwiseQuantized(
                "input_data", "quantized_data")
            dequantized_data = net.Fused8BitRowwiseQuantizedToFloat(
                quantized_data, "dequantized_data")

        if weighted:
            net.SparseLengthsWeightedSum(
                [dequantized_data, "weights", "indices", "lengths"],
                "sum_reference")
            net.SparseLengthsWeightedSumFused8BitRowwise(
                [quantized_data, "weights", "indices", "lengths"],
                "sum_quantized")
        else:
            net.SparseLengthsSum([dequantized_data, "indices", "lengths"],
                                 "sum_reference")
            net.SparseLengthsSumFused8BitRowwise(
                [quantized_data, "indices", "lengths"], "sum_quantized")

        workspace.FeedBlob("input_data", input_data)
        workspace.FeedBlob("weights", weights)
        workspace.FeedBlob("indices", indices)
        workspace.FeedBlob("lengths", lengths)

        workspace.GlobalInit(["caffe2", "--caffe2_log_level=0"])
        workspace.CreateNet(net)
        workspace.RunNetOnce(net)

        dequantized_data = workspace.FetchBlob("dequantized_data")
        np.testing.assert_array_almost_equal(input_data,
                                             workspace.FetchBlob("input_data"))
        compare_rowwise(input_data, dequantized_data, fp16)

        sum_reference = workspace.FetchBlob("sum_reference")
        sum_quantized = workspace.FetchBlob("sum_quantized")
        if fp16:
            np.testing.assert_array_almost_equal(sum_reference,
                                                 sum_quantized,
                                                 decimal=3)
        else:
            np.testing.assert_array_almost_equal(sum_reference, sum_quantized)
    def Skip_test_SLS_NonQuantized_fp16(self):
        N = 20000
        DIM = 64
        D = (4 * np.random.random_sample((N, DIM)) + 1).astype(np.float32)
        I = (np.random.randint(0, N, size=12)).astype(np.int64)
        L = np.asarray([4, 4, 4]).astype(np.int32)
        workspace.FeedBlob("D", D)

        ref_c2_net = core.Net("test_ref_c2")
        ref_c2_net.SparseLengthsSum(["D", "I", "L"], "ref_out")
        ref_c2_net.Proto().external_input.extend(["D", "I", "L"])
        ref_c2_net.Proto().external_output.extend(["ref_out"])

        fp16_c2_net = core.Net("test_fp16_c2")
        fp16_c2_net.SparseLengthsSumFakeFP16AccFP16(["D", "I", "L"],
                                                    "fp16_out")

        input_dict = {}

        pred_net = caffe2_pb2.NetDef()
        pred_net.name = "pred"
        pred_net.external_input.extend(["D", "I", "L"])
        pred_net.external_output.append("glow_out")
        pred_net.op.add().CopyFrom(
            core.CreateOperator("SparseLengthsSum", ["D", "I", "L"],
                                ["glow_out"]))

        onnxified_net = onnxifi_caffe2_net(
            pred_net,
            input_dict,
            max_batch_size=3,
            max_seq_size=16,
            debug=True,
            adjust_batch=False,
            use_onnx=False,
        )

        num_onnxified_ops = sum(1 if op.type == "Onnxifi" else 0
                                for op in onnxified_net.op)
        print(onnxified_net)
        np.testing.assert_equal(num_onnxified_ops, 1)

        workspace.FeedBlob("I", I)
        workspace.FeedBlob("L", L)

        workspace.RunNetOnce(ref_c2_net)
        ref_c2_out = workspace.FetchBlob("ref_out")

        workspace.RunNetOnce(fp16_c2_net)
        fp16_c2_out = workspace.FetchBlob("fp16_out")

        np.testing.assert_allclose(fp16_c2_out,
                                   ref_c2_out,
                                   atol=1e-3,
                                   rtol=1e-3)

        workspace.RunNetOnce(onnxified_net)
        fp16_glow_out = workspace.FetchBlob("glow_out")

        if not np.allclose(fp16_glow_out, fp16_c2_out):
            diff = np.abs(fp16_glow_out - fp16_c2_out)
            print_test_debug_info(
                "sls",
                {
                    "indices": I,
                    "data": D,
                    "lengths": L,
                    "Y_c2": fp16_c2_out,
                    "Y_glow": fp16_glow_out,
                    "diff": diff,
                    "rowwise_diff": diff[:, 0],
                },
            )
            assert 0
    def test_net_conversion_and_append_net(self):
        other = model_helper.ModelHelper()
        fc1 = brew.fc(other,
                      "data",
                      "other_fc1",
                      dim_in=3 * 227 * 227,
                      dim_out=10)
        fc2 = brew.fc(other, fc1, "other_fc2", dim_in=10, dim_out=10)
        brew.fc(other, fc2, "other_fc3", dim_in=10, dim_out=10)

        def add_input_ops(model):
            model.net.UniformFill([], ["data"], shape=[4, 227, 227, 3])
            model.net.UniformFill([], ["label"], shape=[4])

        def add_model_ops(model, loss_scale):
            model.NHWC2NCHW("data", "data_nchw")
            model.Conv("data_nchw",
                       'conv1',
                       3,
                       64,
                       weight_init=("MSRAFill", {}),
                       kernel=7,
                       stride=2,
                       pad=3,
                       no_bias=0)
            model.SpatialBN('conv1',
                            'conv1_spatbn_relu',
                            64,
                            epsilon=1e-3,
                            is_test=False)
            model.Relu('conv1_spatbn_relu', 'conv1_spatbn_relu')
            model.MaxPool('conv1_spatbn_relu', 'pool1', kernel=3, stride=2)
            model.FC('pool1', 'fc', dim_in=(64 * 56 * 56), dim_out=10)

            # Append the net and param_init_net of the other model
            appendnet = data_parallel_model.ConvertNetForDevice(other.net)
            model.net.AppendNet(appendnet)

            model.param_init_net.AppendNet(
                data_parallel_model.ConvertNetForDevice(other.param_init_net))

            model.Sigmoid('fc', 'fc_sigm')
            model.Softmax('fc_sigm', 'softmax')
            loss = model.AveragedLoss('softmax', 'loss')
            return [loss]

        def add_optimizer(model):
            optimizer.build_sgd(model, 0.1, policy="fixed", momentum=0.9)

        model = cnn.CNNModelHelper(
            order="NCHW",
            name="test",
        )
        data_parallel_model.Parallelize_CPU(
            model,
            input_builder_fun=add_input_ops,
            forward_pass_builder_fun=add_model_ops,
            optimizer_builder_fun=add_optimizer,
            devices=range(4))

        # Just create and run net and confirm no exception is thrown
        workspace.RunNetOnce(model.param_init_net)
        workspace.CreateNet(model.net)
        workspace.RunNet(model.net)
Beispiel #18
0
    def assertReferenceChecks(
        self,
        device_option,
        op,
        inputs,
        reference,
        input_device_options=None,
        threshold=1e-4,
        output_to_grad=None,
        grad_reference=None,
        atol=None,
        outputs_to_check=None,
    ):
        """
        This runs the reference Python function implementation
        (effectively calling `reference(*inputs)`, and compares that
        to the output of output, with an absolute/relative tolerance
        given by the `threshold` parameter.

        Useful for checking the implementation matches the Python
        (typically NumPy) implementation of the same functionality.

        Usage example:

            @given(X=hu.tensor(), inplace=st.booleans(), **hu.gcs)
            def test_softsign(self, X, inplace, gc, dc):
                op = core.CreateOperator(
                    "Softsign", ["X"], ["X" if inplace else "Y"])

                def softsign(X):
                    return (X / (1 + np.abs(X)),)

                self.assertReferenceChecks(gc, op, [X], softsign)
        """
        if input_device_options is None:
            input_device_options = {}

        op = copy.deepcopy(op)
        op.device_option.CopyFrom(device_option)

        with temp_workspace():
            for (n, b) in zip(op.input, inputs):
                workspace.FeedBlob(
                    n,
                    b,
                    device_option=input_device_options.get(n, device_option)
                )
                print("Input", n, input_device_options.get(n, device_option))
            net = core.Net("opnet")
            net.Proto().op.extend([op])
            test_shape_inference = False
            try:
                (shapes, types) = workspace.InferShapesAndTypes([net])
                test_shape_inference = True
            except RuntimeError as e:
                # Temporarily catch runtime errors when inferring shape
                # and type info
                logging.warning(str(e))
                if os.getenv('CAFFE2_ASSERT_SHAPEINFERENCE') == '1':
                    raise e
            workspace.RunNetOnce(net)
            reference_outputs = reference(*inputs)
            if not (isinstance(reference_outputs, tuple) or
                    isinstance(reference_outputs, list)):
                raise RuntimeError(
                    "You are providing a wrong reference implementation. A "
                    "proper one should return a tuple/list of numpy arrays.")
            if not outputs_to_check:
                self.assertEqual(len(reference_outputs), len(op.output))
                outputs_to_check = list(range(len(op.output)))
            outs = []
            for (output_index, ref) in zip(outputs_to_check, reference_outputs):
                output_blob_name = op.output[output_index]
                output = workspace.FetchBlob(output_blob_name)
                if output.dtype.kind in ('S', 'O'):
                    np.testing.assert_array_equal(output, ref)
                else:
                    if atol is None:
                        atol = threshold
                    np.testing.assert_allclose(
                        output, ref, atol=atol, rtol=threshold,
                        err_msg=(
                            'Output {0} is not matching the reference'.format(
                                output_blob_name,
                            )),
                    )
                if test_shape_inference:
                    self._assertInferTensorChecks(
                        output_blob_name, shapes, types, output)
                outs.append(output)
            if grad_reference and output_to_grad:
                with core.DeviceScope(device_option):
                    self._assertGradReferenceChecks(
                        op, inputs, reference_outputs,
                        output_to_grad, grad_reference)
            return outs
    def run_model(self, devices, gpu):
        '''
        Helper function for test_equiv
        '''
        def input_builder_fun(model):
            return None

        def model_build_fun(model, loss_scale):
            fc = model.FC("data", "fc", 16, 1, ("ConstantFill", {}),
                          ("ConstantFill", {}))
            fc_fl = model.FlattenToVec(fc, "fc_fl")
            sigm = model.Sigmoid(fc_fl, "sigm")
            sq = model.SquaredL2Distance([sigm, "label"], "sq")
            loss = model.AveragedLoss(sq, "loss")
            loss = model.Scale(loss, scale=loss_scale)

            # For testing explicit sync
            model.param_init_net.UniformFill([], ["sync_num"], shape=[1])
            return [loss]

        def add_optimizer(model):
            return optimizer.build_sgd(
                model,
                0.1,
                policy="fixed",
                max_gradient_norm=5.0,
                allow_lr_injection=True,
            )

        workspace.ResetWorkspace()
        model = cnn.CNNModelHelper(
            order="NHWC",
            name="test{}".format(devices),
        )
        data_parallel_model.Parallelize(
            model,
            input_builder_fun=input_builder_fun,
            forward_pass_builder_fun=model_build_fun,
            optimizer_builder_fun=add_optimizer,
            devices=devices,
            cpu_device=not gpu,
            shared_model=not gpu,
        )
        data_parallel_model.AddBlobSync(model, ["sync_num"])

        np.random.seed(2603)

        # Each run has same input, independent of number of gpus
        batch_size = 64
        for i in range(0, 10):
            full_data = np.random.rand(batch_size, 16)
            full_labels = np.round(full_data[:, 0])
            batch_per_device = batch_size // len(devices)

            for (j, g) in enumerate(devices):
                st = j * batch_per_device
                en = st + batch_per_device
                data = full_data[st:en, :].astype(np.float32)
                labels = full_labels[st:en].astype(np.float32)
                with core.DeviceScope(core.DeviceOption(model._device_type,
                                                        g)):
                    workspace.FeedBlob(
                        "{}_{}/data".format(model._device_prefix, g), data)
                    workspace.FeedBlob(
                        "{}_{}/label".format(model._device_prefix, g), labels)

            if i == 0:
                workspace.RunNetOnce(model.param_init_net)
                workspace.CreateNet(model.net)

            workspace.FeedBlob(model._device_prefix + "_0/sync_num",
                               np.array([i * 2]).astype(np.float32),
                               device_option=core.DeviceOption(
                                   model._device_type, 0))
            workspace.RunNet(model.net.Proto().name)

            # Test AddBlobSync
            for j in model._devices:
                sync = workspace.FetchBlob(model._device_prefix +
                                           "_{}/sync_num".format(j))[0]
                self.assertTrue(abs(sync - i * 2) < 0.01)

        return workspace.FetchBlob("{}_0/fc_w".format(model._device_prefix))
Beispiel #20
0
    def export_actor(
        cls,
        trainer,
        state_normalization_parameters,
        action_feature_ids,
        min_action_range_tensor_serving,
        max_action_range_tensor_serving,
        model_on_gpu=False,
    ):
        """Export caffe2 preprocessor net and pytorch actor forward pass as one
        caffe2 net.

        :param trainer DDPGTrainer
        :param state_normalization_parameters state NormalizationParameters
        :param min_action_range_tensor_serving pytorch tensor that specifies
            min action value for each dimension
        :param max_action_range_tensor_serving pytorch tensor that specifies
            min action value for each dimension
        :param state_normalization_parameters state NormalizationParameters
        :param model_on_gpu boolean indicating if the model is a GPU model or CPU model
        """
        model = model_helper.ModelHelper(name="predictor")
        net = model.net
        C2.set_model(model)
        parameters: List[str] = []

        workspace.FeedBlob("input/float_features.lengths",
                           np.zeros(1, dtype=np.int32))
        workspace.FeedBlob("input/float_features.keys",
                           np.zeros(1, dtype=np.int64))
        workspace.FeedBlob("input/float_features.values",
                           np.zeros(1, dtype=np.float32))

        input_feature_lengths = "input_feature_lengths"
        input_feature_keys = "input_feature_keys"
        input_feature_values = "input_feature_values"

        C2.net().Copy(["input/float_features.lengths"],
                      [input_feature_lengths])
        C2.net().Copy(["input/float_features.keys"], [input_feature_keys])
        C2.net().Copy(["input/float_features.values"], [input_feature_values])

        preprocessor = PreprocessorNet()
        sparse_to_dense_processor = Caffe2SparseToDenseProcessor()
        sorted_features, _ = sort_features_by_normalization(
            state_normalization_parameters)
        state_dense_matrix, new_parameters = sparse_to_dense_processor(
            sorted_features,
            StackedAssociativeArray(input_feature_lengths, input_feature_keys,
                                    input_feature_values),
        )
        parameters.extend(new_parameters)
        state_normalized_dense_matrix, new_parameters = preprocessor.normalize_dense_matrix(
            state_dense_matrix,
            sorted_features,
            state_normalization_parameters,
            "state_norm",
            False,
        )
        parameters.extend(new_parameters)

        torch_init_net, torch_predict_net, new_parameters, actor_input_blob, actor_output_blob, min_action_training_blob, max_action_training_blob, min_action_serving_blob, max_action_serving_blob = DDPGPredictor.generate_train_net(
            trainer,
            model,
            min_action_range_tensor_serving,
            max_action_range_tensor_serving,
            model_on_gpu,
        )
        parameters.extend(new_parameters)
        net.Copy([state_normalized_dense_matrix], [actor_input_blob])

        workspace.RunNetOnce(model.param_init_net)
        workspace.RunNetOnce(torch_init_net)

        net.AppendNet(torch_predict_net)

        # Scale actors actions from [-1, 1] to serving range
        prev_range = C2.Sub(max_action_training_blob, min_action_training_blob)
        new_range = C2.Sub(max_action_serving_blob, min_action_serving_blob)
        subtract_prev_min = C2.Sub(actor_output_blob, min_action_training_blob)
        div_by_prev_range = C2.Div(subtract_prev_min, prev_range)
        scaled_for_serving_actions = C2.Add(
            C2.Mul(div_by_prev_range, new_range), min_action_serving_blob)

        output_lengths = "output/float_features.lengths"
        workspace.FeedBlob(output_lengths, np.zeros(1, dtype=np.int32))
        C2.net().ConstantFill(
            [C2.FlattenToVec(C2.ArgMax(actor_output_blob))],
            [output_lengths],
            value=trainer.actor.layers[-1].out_features,
            dtype=caffe2_pb2.TensorProto.INT32,
        )

        action_feature_ids_blob = C2.NextBlob("action_feature_ids")
        workspace.FeedBlob(action_feature_ids_blob,
                           np.array(action_feature_ids, dtype=np.int64))
        parameters.append(action_feature_ids_blob)

        output_keys = "output/float_features.keys"
        workspace.FeedBlob(output_keys, np.zeros(1, dtype=np.int64))
        num_examples, _ = C2.Reshape(C2.Size("input/float_features.lengths"),
                                     shape=[1])
        C2.net().Tile([action_feature_ids_blob, num_examples], [output_keys],
                      axis=1)

        output_values = "output/float_features.values"
        workspace.FeedBlob(output_values, np.zeros(1, dtype=np.float32))
        C2.net().FlattenToVec([scaled_for_serving_actions], [output_values])

        workspace.CreateNet(net)
        return DDPGPredictor(net, torch_init_net, parameters)
    def test_parallelize_gpu_bmuf(self):
        model = cnn.CNNModelHelper(order="NHWC", name="test")
        gpu_ids = [0, 1]

        def input_builder_fun(model):
            return None

        self._generate_data(gpu_ids)

        data_parallel_model.Parallelize_GPU_BMUF(
            model,
            input_builder_fun,
            self._model_build_fun,
            self._param_update_fun,
            devices=gpu_ids,
        )

        data_parallel_model.RunInitNet(model)

        # Check initial momentum params are zeros
        self.assertEqual(list(viewkeys(model._device_grouped_blobs)),
                         ['fc_w', 'fc_b'])
        self.assertEqual(workspace.FetchBlob('gpu_0/fc_b_v'), 0)
        np.testing.assert_equal(workspace.FetchBlob('gpu_0/fc_w_v'),
                                np.zeros(16).astype(np.float32).reshape(1, 16))

        # Run the algorithm for one iteration to have non-zero params.
        data_parallel_model.RunNet(model, 1)

        # Save iteration momentum and post local update params
        v_b_ = workspace.FetchBlob('gpu_0/fc_b_v')
        v_w_ = workspace.FetchBlob('gpu_0/fc_w_v')

        workspace.RunNetOnce(model.net)

        b_0_ = workspace.FetchBlob('gpu_0/fc_b')
        w_0_ = workspace.FetchBlob('gpu_0/fc_w')
        b_1_ = workspace.FetchBlob('gpu_1/fc_b')
        w_1_ = workspace.FetchBlob('gpu_1/fc_w')

        # Compute block gradients.
        b_g_ = workspace.FetchBlob('gpu_0/fc_b_g')
        w_g_ = workspace.FetchBlob('gpu_0/fc_w_g')
        workspace.RunNetOnce(model._global_model_param_updates_net)

        g_b = (b_0_ + b_1_) / 2 - b_g_
        g_w = (w_0_ + w_1_) / 2 - w_g_
        v_b = workspace.FetchBlob('gpu_0/fc_b_v')
        v_w = workspace.FetchBlob('gpu_0/fc_w_v')

        w_g = workspace.FetchBlob('gpu_0/fc_w_g')
        b_g = workspace.FetchBlob('gpu_0/fc_b_g')
        w_0 = workspace.FetchBlob('gpu_0/fc_w')
        b_0 = workspace.FetchBlob('gpu_0/fc_b')
        w_1 = workspace.FetchBlob('gpu_1/fc_w')
        b_1 = workspace.FetchBlob('gpu_1/fc_b')

        # Check momentum update step
        np.testing.assert_equal(v_b, 0.5 * v_b_ + g_b)
        np.testing.assert_equal(v_w, 0.5 * v_w_ + g_w)

        np.testing.assert_equal(w_g, w_0)
        np.testing.assert_equal(w_g, w_1)
        np.testing.assert_equal(b_g, b_0)
        np.testing.assert_equal(b_g, b_1)

        # Check params update step
        np.testing.assert_equal(w_0, w_g_ + v_w)
        np.testing.assert_equal(b_0, b_g_ + v_b)
    def export(
        cls,
        trainer,
        state_normalization_parameters,
        action_normalization_parameters,
        int_features=False,
        model_on_gpu=False,
    ):
        """Export caffe2 preprocessor net and pytorch DQN forward pass as one
        caffe2 net.

        :param trainer ParametricDQNTrainer
        :param state_normalization_parameters state NormalizationParameters
        :param action_normalization_parameters action NormalizationParameters
        :param int_features boolean indicating if int features blob will be present
        :param model_on_gpu boolean indicating if the model is a GPU model or CPU model
        """

        input_dim = trainer.num_features
        buffer = PytorchCaffe2Converter.pytorch_net_to_buffer(
            trainer.q_network, input_dim, model_on_gpu
        )
        qnet_input_blob, qnet_output_blob, caffe2_netdef = PytorchCaffe2Converter.buffer_to_caffe2_netdef(
            buffer
        )
        torch_workspace = caffe2_netdef.workspace

        parameters = torch_workspace.Blobs()
        for blob_str in parameters:
            workspace.FeedBlob(blob_str, torch_workspace.FetchBlob(blob_str))

        torch_init_net = core.Net(caffe2_netdef.init_net)
        torch_predict_net = core.Net(caffe2_netdef.predict_net)

        # ensure state and action IDs have no intersection
        assert (
            len(
                set(state_normalization_parameters.keys())
                & set(action_normalization_parameters.keys())
            )
            == 0
        )

        model = model_helper.ModelHelper(name="predictor")
        net = model.net
        C2.set_model(model)

        workspace.FeedBlob("input/float_features.lengths", np.zeros(1, dtype=np.int32))
        workspace.FeedBlob("input/float_features.keys", np.zeros(1, dtype=np.int64))
        workspace.FeedBlob("input/float_features.values", np.zeros(1, dtype=np.float32))

        input_feature_lengths = "input_feature_lengths"
        input_feature_keys = "input_feature_keys"
        input_feature_values = "input_feature_values"

        if int_features:
            workspace.FeedBlob(
                "input/int_features.lengths", np.zeros(1, dtype=np.int32)
            )
            workspace.FeedBlob("input/int_features.keys", np.zeros(1, dtype=np.int64))
            workspace.FeedBlob("input/int_features.values", np.zeros(1, dtype=np.int32))
            C2.net().Cast(
                ["input/int_features.values"],
                ["input/int_features.values_float"],
                dtype=caffe2_pb2.TensorProto.FLOAT,
            )
            C2.net().MergeMultiScalarFeatureTensors(
                [
                    "input/float_features.lengths",
                    "input/float_features.keys",
                    "input/float_features.values",
                    "input/int_features.lengths",
                    "input/int_features.keys",
                    "input/int_features.values_float",
                ],
                [input_feature_lengths, input_feature_keys, input_feature_values],
            )
        else:
            C2.net().Copy(["input/float_features.lengths"], [input_feature_lengths])
            C2.net().Copy(["input/float_features.keys"], [input_feature_keys])
            C2.net().Copy(["input/float_features.values"], [input_feature_values])

        preprocessor = PreprocessorNet(clip_anomalies=True)

        state_normalized_dense_matrix, new_parameters = preprocessor.normalize_sparse_matrix(
            input_feature_lengths,
            input_feature_keys,
            input_feature_values,
            state_normalization_parameters,
            "state_norm",
            False,
            False,
        )
        parameters.extend(new_parameters)
        action_normalized_dense_matrix, new_parameters = preprocessor.normalize_sparse_matrix(
            input_feature_lengths,
            input_feature_keys,
            input_feature_values,
            action_normalization_parameters,
            "action_norm",
            False,
            False,
        )
        parameters.extend(new_parameters)
        state_action_normalized = "state_action_normalized"
        state_action_normalized_dim = "state_action_normalized_dim"
        net.Concat(
            [state_normalized_dense_matrix, action_normalized_dense_matrix],
            [state_action_normalized, state_action_normalized_dim],
            axis=1,
        )

        net.Copy([state_action_normalized], [qnet_input_blob])

        workspace.RunNetOnce(model.param_init_net)
        workspace.RunNetOnce(torch_init_net)

        net.AppendNet(torch_predict_net)

        new_parameters, q_values = RLPredictor._forward_pass(
            model, trainer, state_action_normalized, ["Q"], qnet_output_blob
        )
        parameters.extend(new_parameters)

        flat_q_values_key = (
            "output/string_weighted_multi_categorical_features.values.values"
        )
        num_examples, _ = C2.Reshape(C2.Size(flat_q_values_key), shape=[1])
        q_value_blob, _ = C2.Reshape(flat_q_values_key, shape=[1, -1])

        # Get 1 x n (number of examples) action index tensor under the max_q policy
        max_q_act_idxs = "max_q_policy_actions"
        C2.net().FlattenToVec([C2.ArgMax(q_value_blob)], [max_q_act_idxs])
        max_q_act_blob = C2.Tile(max_q_act_idxs, num_examples, axis=0)

        # Get 1 x n (number of examples) action index tensor under the softmax policy
        temperature = C2.NextBlob("temperature")
        parameters.append(temperature)
        workspace.FeedBlob(
            temperature, np.array([trainer.rl_temperature], dtype=np.float32)
        )
        tempered_q_values = C2.Div(q_value_blob, temperature, broadcast=1)
        softmax_values = C2.Softmax(tempered_q_values)
        softmax_act_idxs_nested = "softmax_act_idxs_nested"
        C2.net().WeightedSample([softmax_values], [softmax_act_idxs_nested])
        softmax_act_blob = C2.Tile(
            C2.FlattenToVec(softmax_act_idxs_nested), num_examples, axis=0
        )

        # Concat action idx vecs to get 2 x n tensor [[a_maxq, ..], [a_softmax, ..]]
        # transpose & flatten to get [a_maxq, a_softmax, a_maxq, a_softmax, ...]
        max_q_act_blob = C2.Cast(max_q_act_blob, to=caffe2_pb2.TensorProto.INT64)
        softmax_act_blob = C2.Cast(softmax_act_blob, to=caffe2_pb2.TensorProto.INT64)
        max_q_act_blob_nested, _ = C2.Reshape(max_q_act_blob, shape=[1, -1])
        softmax_act_blob_nested, _ = C2.Reshape(softmax_act_blob, shape=[1, -1])
        C2.net().Append(
            [max_q_act_blob_nested, softmax_act_blob_nested], [max_q_act_blob_nested]
        )
        transposed_action_idxs = C2.Transpose(max_q_act_blob_nested)
        flat_transposed_action_idxs = C2.FlattenToVec(transposed_action_idxs)
        output_values = "output/int_single_categorical_features.values"
        workspace.FeedBlob(output_values, np.zeros(1, dtype=np.int64))
        C2.net().Copy([flat_transposed_action_idxs], [output_values])

        output_lengths = "output/int_single_categorical_features.lengths"
        workspace.FeedBlob(output_lengths, np.zeros(1, dtype=np.int32))
        C2.net().ConstantFill(
            [flat_q_values_key],
            [output_lengths],
            value=2,
            dtype=caffe2_pb2.TensorProto.INT32,
        )

        output_keys = "output/int_single_categorical_features.keys"
        workspace.FeedBlob(output_keys, np.zeros(1, dtype=np.int64))
        output_keys_tensor, _ = C2.Concat(
            C2.ConstantFill(shape=[1, 1], value=0, dtype=caffe2_pb2.TensorProto.INT64),
            C2.ConstantFill(shape=[1, 1], value=1, dtype=caffe2_pb2.TensorProto.INT64),
            axis=0,
        )
        output_key_tile = C2.Tile(output_keys_tensor, num_examples, axis=0)
        C2.net().FlattenToVec([output_key_tile], [output_keys])

        workspace.CreateNet(net)
        return ParametricDQNPredictor(net, parameters, int_features)
Beispiel #23
0
 def GetLoss(new_value):
     workspace.blobs[input_to_check] = new_value
     workspace.RunNetOnce(net_copy)
     return sum(
         [workspace.blobs[output] for output in outputs_with_grad])
Beispiel #24
0
    def test_dataset_ops(self):
        """
        1. Defining the schema of our dataset.

        This example schema could represent, for example, a search query log.
        """
        schema = Struct(
            # fixed size vector, which will be stored as a matrix when batched
            ('dense', Scalar((np.float32, 3))),
            # could represent a feature map from feature ID to float value
            ('floats', Map(Scalar(np.int32), Scalar(np.float32))),
            # could represent a multi-valued categorical feature map
            ('int_lists', Map(
                Scalar(np.int32),
                List(Scalar(np.int64)),
            )),
            # could represent a multi-valued, weighted categorical feature map
            ('id_score_pairs',
             Map(
                 Scalar(np.int32),
                 Map(Scalar(np.int64),
                     Scalar(np.float32),
                     keys_name='ids',
                     values_name='scores'),
             )),
            # additional scalar information
            ('metadata',
             Struct(
                 ('user_id', Scalar(np.int64)),
                 ('user_embed', Scalar((np.float32, 2))),
                 ('query', Scalar(str)),
             )),
        )
        """
        This is what the flattened fields for this schema look like, along
        with its type. Each one of these fields will be stored, read and
        writen as a tensor.
        """
        expected_fields = [
            ('dense', (np.float32, 3)),
            ('floats:lengths', np.int32),
            ('floats:values:keys', np.int32),
            ('floats:values:values', np.float32),
            ('int_lists:lengths', np.int32),
            ('int_lists:values:keys', np.int32),
            ('int_lists:values:values:lengths', np.int32),
            ('int_lists:values:values:values', np.int64),
            ('id_score_pairs:lengths', np.int32),
            ('id_score_pairs:values:keys', np.int32),
            ('id_score_pairs:values:values:lengths', np.int32),
            ('id_score_pairs:values:values:values:ids', np.int64),
            ('id_score_pairs:values:values:values:scores', np.float32),
            ('metadata:user_id', np.int64),
            ('metadata:user_embed', (np.float32, 2)),
            ('metadata:query', str),
        ]
        zipped = zip(expected_fields, schema.field_names(),
                     schema.field_types())
        for (ref_name, ref_type), name, dtype in zipped:
            self.assertEquals(ref_name, name)
            self.assertEquals(np.dtype(ref_type), dtype)
        """
        2. The contents of our dataset.

        Contents as defined below could represent, for example, a log of
        search queries along with dense, sparse features and metadata.
        The datset below has 3 top-level entries.
        """
        contents_raw = [
            # dense
            [[1.1, 1.2, 1.3], [2.1, 2.2, 2.3], [3.1, 3.2, 3.3]],
            # floats
            [1, 2, 3],  # len
            [11, 21, 22, 31, 32, 33],  # key
            [1.1, 2.1, 2.2, 3.1, 3.2, 3.3],  # value
            # int lists
            [2, 0, 1],  # len
            [11, 12, 31],  # key
            [2, 4, 3],  # value:len
            [111, 112, 121, 122, 123, 124, 311, 312, 313],  # value:value
            # id score pairs
            [1, 2, 2],  # len
            [11, 21, 22, 31, 32],  # key
            [1, 1, 2, 2, 3],  # value:len
            [111, 211, 221, 222, 311, 312, 321, 322, 323],  # value:ids
            [11.1, 21.1, 22.1, 22.2, 31.1, 31.2, 32.1, 32.2,
             32.3],  # val:score
            # metadata
            [123, 234, 456],  # user_id
            [[0.2, 0.8], [0.5, 0.5], [0.7, 0.3]],  # user_embed
            ['dog posts', 'friends who like to', 'posts about ca'],  # query
        ]
        # convert the above content to ndarrays, checking against the schema
        contents = from_blob_list(schema, contents_raw)
        """
        3. Creating and appending to the dataset.
        We first create an empty dataset with the given schema.
        Then, a Writer is used to append these entries to the dataset.
        """
        ds = dataset.Dataset(schema)
        net = core.Net('init')
        ds.init_empty(net)

        content_blobs = NewRecord(net, contents)
        FeedRecord(content_blobs, contents)
        writer = ds.writer(init_net=net)
        writer.write_record(net, content_blobs)
        workspace.RunNetOnce(net)
        """
        4. Iterating through the dataset contents.

        If we were to iterate through the top level entries of our dataset,
        this is what we should expect to see:
        """
        entries_raw = [
            (
                [[1.1, 1.2, 1.3]],  # dense
                [1],
                [11],
                [1.1],  # floats
                [2],
                [11, 12],
                [2, 4],
                [111, 112, 121, 122, 123, 124],  # intlst
                [1],
                [11],
                [1],
                [111],
                [11.1],  # id score pairs
                [123],
                [[0.2, 0.8]],
                ['dog posts'],  # metadata
            ),
            (
                [[2.1, 2.2, 2.3]],  # dense
                [2],
                [21, 22],
                [2.1, 2.2],  # floats
                [0],
                [],
                [],
                [],  # int list
                [2],
                [21, 22],
                [1, 2],
                [211, 221, 222],
                [21.1, 22.1, 22.2],
                [234],
                [[0.5, 0.5]],
                ['friends who like to'],  # metadata
            ),
            (
                [[3.1, 3.2, 3.3]],  # dense
                [3],
                [31, 32, 33],
                [3.1, 3.2, 3.3],  # floats
                [1],
                [31],
                [3],
                [311, 312, 313],  # int lst
                [2],
                [31, 32],
                [2, 3],
                [311, 312, 321, 322, 323],
                [31.1, 31.2, 32.1, 32.2, 32.3],  # id score list
                [456],
                [[0.7, 0.3]],
                ['posts about ca'],  # metadata
            ),
            # after the end of the dataset, we will keep getting empty vectors
            (
                [], ) * 16,
            ([], ) * 16,
        ]
        entries = [from_blob_list(schema, e) for e in entries_raw]
        """
        Let's go ahead and create the reading nets.
        We will run `read` net multiple times and assert that we are reading the
        entries the way we stated above.
        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')
        reader = ds.reader(read_init_net)
        should_continue, batch = reader.read_record(read_next_net)

        workspace.RunNetOnce(read_init_net)
        workspace.CreateNet(read_next_net)

        for i, entry in enumerate(entries):
            workspace.RunNet(str(read_next_net))
            actual = FetchRecord(batch)
            _assert_records_equal(actual, entry)
        """
        5. Reading/writing in a single plan

        If all of operations on the data are expressible as Caffe2 operators,
        we don't need to load the data to python, iterating through the dataset
        in a single Plan.

        Where we will process the dataset a little and store it in a second
        dataset. We can reuse the same Reader since it supports reset.
        """
        reset_net = core.Net('reset_net')
        reader.reset(reset_net)
        read_step, batch = reader.execution_step()
        """ We will add the line number * 1000 to the feature ids. """
        process_net = core.Net('process')
        line_no = Const(process_net, 0, dtype=np.int32)
        const_one = Const(process_net, 1000, dtype=np.int32)
        process_net.Add([line_no, const_one], [line_no])
        field = batch.floats.keys.get()
        process_net.Print(field, [])
        process_net.Add([field, line_no], field, broadcast=1, axis=0)
        """ Lets create a second dataset and append to it. """
        ds2 = dataset.Dataset(schema, name='dataset2')
        ds2.init_empty(reset_net)
        writer = ds2.writer(reset_net)
        writer.write_record(process_net, batch)
        # commit is not necessary for DatasetWriter but will add it for
        # generality of the example
        commit_net = core.Net('commit')
        writer.commit(commit_net)
        """ Time to create and run a plan which will do the processing """
        plan = core.Plan('process')
        plan.AddStep(core.execution_step('reset', reset_net))
        plan.AddStep(read_step.AddNet(process_net))
        plan.AddStep(core.execution_step('commit', commit_net))
        workspace.RunPlan(plan)
        """
        Now we should have dataset2 populated.
        """
        ds2_data = FetchRecord(ds2.content())
        field = ds2_data.floats.keys
        field.set(blob=field.get() - [1000, 2000, 2000, 3000, 3000, 3000])
        _assert_records_equal(contents, ds2_data)
        """
        6. Slicing a dataset

        You can create a new schema from pieces of another schema and reuse
        the same data.
        """
        subschema = Struct(('top_level', schema.int_lists.values))
        int_list_contents = contents.int_lists.values.field_names()
        self.assertEquals(len(subschema.field_names()), len(int_list_contents))
        """
        7. Random Access a dataset

        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')

        idx = np.array([2, 1, 0])
        indices_blob = Const(read_init_net, idx, name='indices')
        reader = ds.random_reader(read_init_net, indices_blob)
        reader.computeoffset(read_init_net)

        should_continue, batch = reader.read_record(read_next_net)

        workspace.CreateNet(read_init_net)
        workspace.RunNetOnce(read_init_net)

        workspace.CreateNet(read_next_net)
        read_next_net_name = str(read_next_net)

        for i in range(len(entries)):
            k = idx[i] if i in idx else i
            entry = entries[k]
            workspace.RunNet(str(read_next_net))
            actual = FetchRecord(batch)
            _assert_records_equal(actual, entry)
        """
        8. Sort and shuffle a dataset

        This sort the dataset using the score of a certain column,
        and then shuffle within each chunk of size batch_size * shuffle_size
        before shuffling the chunks.

        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')

        reader = ds.random_reader(read_init_net)
        reader.sort_and_shuffle(read_init_net, 'int_lists:lengths', 1, 2)
        reader.computeoffset(read_init_net)

        should_continue, batch = reader.read_record(read_next_net)

        workspace.CreateNet(read_init_net)
        workspace.RunNetOnce(read_init_net)

        workspace.CreateNet(read_next_net)

        expected_idx = np.array([2, 1, 0])
        for i in range(len(entries)):
            k = expected_idx[i] if i in expected_idx else i
            entry = entries[k]
            workspace.RunNet(str(read_next_net))
            actual = FetchRecord(batch)
            _assert_records_equal(actual, entry)
Beispiel #25
0
    def testPartialClone(self):
        params = core.Net('params')
        p1 = params.ConstantFill([], ['p1'])
        workspace.CreateNet(params)
        workspace.RunNetOnce(params)

        n = core.Net('original')
        a1 = n.AddExternalInput('a1')
        a2 = n.AddExternalInput('a2')
        b1, b2 = n.Concat([a1, a2], ['b1', 'b2'], axis=0)
        c1 = n.Sum([b1, p1], ['c1'])
        c2 = n.Sum([b2], ['c2'])
        d = n.Sum([c1, c2], ['d'])

        # test that gradient ops are ignored when partial-cloning
        n.AddGradientOperators([d])

        # test some in-place ops
        k = n.Sum([p1], ['k'])
        e = n.Sum([d], ['e'])
        e = n.Sum([e, k], [e])
        e = n.Sum([e], [e])
        f = n.Sum(e, ['f'])

        def net_assert(net, num_ops, inputs, outputs, internals):
            self.assertEqual(len(net.Proto().op), num_ops)
            self.assertEqual(set(net.Proto().external_input), inputs)
            self.assertEqual(set(net.Proto().external_output), outputs)
            all_blobs = set(net.Proto().external_input)
            all_blobs |= set(net.Proto().external_output)
            for op in net.Proto().op:
                all_blobs |= set(op.input) | set(op.output)
            self.assertEqual(all_blobs, inputs | outputs | internals)
            # create net to make sure its valid
            for input in inputs:
                workspace.FeedBlob(input, np.array([]))
            workspace.CreateNet(net)

        n2, (d22, ) = n.ClonePartial('f1', {a1: 'a11', a2: 'a22'}, [d])
        net_assert(n2, 4, {'p1', 'a11', 'a22'}, {'f1/d'},
                   {'f1/b1', 'f1/b2', 'f1/c1', 'f1/c2', 'p1'})
        self.assertTrue(isinstance(d22, core.BlobReference))
        self.assertEqual(d22.Net(), n2)
        self.assertEqual(str(d22), 'f1/d')

        n3, (d22, ) = n.ClonePartial('f2', [b1, b2], [d])
        net_assert(n3, 3, {'p1', 'b1', 'b2'}, {'f2/d'},
                   {'f2/c1', 'f2/c2', 'p1'})
        self.assertEqual(str(d22), 'f2/d')

        n4, (c22, ) = n.ClonePartial('f3', [b1], [c1])
        net_assert(n4, 1, {'p1', 'b1'}, {'f3/c1'}, {'p1'})
        self.assertEqual(str(c22), 'f3/c1')

        n5, (c11, c22) = n.ClonePartial('f4', [b1, b2], [c1, c2])
        net_assert(n5, 2, {'p1', 'b1', 'b2'}, {'f4/c1', 'f4/c2'}, {'p1'})
        self.assertEqual(str(c11), 'f4/c1')
        self.assertEqual(str(c22), 'f4/c2')

        with self.assertRaises(AssertionError):
            n.ClonePartial('f4', [a1, a2, c2], [d])

        n6, (e22, ) = n.ClonePartial('f5', [d], [e])
        net_assert(n6, 4, {'p1', 'd'}, {'f5/e'}, {'f5/k', 'p1'})
        self.assertEqual(str(e22), 'f5/e')

        n8, (e22, f22) = n.ClonePartial('f7', [d], [e, f])
        net_assert(n8, 5, {'p1', 'd'}, {'f7/e', 'f7/f'}, {'p1', 'f7/k'})
        self.assertEqual(str(e22), 'f7/e')
        self.assertEqual(str(f22), 'f7/f')

        params._CheckLookupTables()
        n._CheckLookupTables()
Beispiel #26
0
# Params that will be optimized during training
print "\n***************** OPT PARAM INFO *******************"
print my_model.GetOptimizationParamInfo()

#exit()

##################################################################################
# Run the training

# Initialization.
train_dataset = UCF11_Dataset(TRAIN_DICTIONARY)
for image, label in train_dataset.read(batch_size=1):
    workspace.FeedBlob("data", image)
    workspace.FeedBlob("label", label)
    break
workspace.RunNetOnce(my_model.param_init_net)
workspace.CreateNet(my_model.net, overwrite=True)

# Main loop.
batch_size = 20
print_freq = 1
losses = []
for epoch in range(5):
    for index, (image, label) in enumerate(train_dataset.read(batch_size)):
        workspace.FeedBlob("data", image)
        workspace.FeedBlob("label", label)
        workspace.RunNet(my_model.net)
        accuracy = float(workspace.FetchBlob("accuracy"))
        loss = workspace.FetchBlob("loss").mean()
        losses.append(loss)
        if index % print_freq == 0:
    def lstm(self,
             create_lstm,
             t,
             n,
             d,
             ref,
             outputs_with_grads,
             memory_optim,
             forget_bias=0.0):
        model = CNNModelHelper(name='external')
        input_blob, seq_lengths, hidden_init, cell_init = (
            model.net.AddExternalInputs('input_blob', 'seq_lengths',
                                        'hidden_init', 'cell_init'))

        create_lstm(
            model,
            input_blob,
            seq_lengths,
            (hidden_init, cell_init),
            d,
            d,
            scope="external/recurrent",
            outputs_with_grads=outputs_with_grads,
            memory_optimization=memory_optim,
            forget_bias=forget_bias,
        )

        op = model.net._net.op[-1]

        workspace.RunNetOnce(model.param_init_net)
        input_blob = op.input[0]

        def generate_random_state(n, d):
            ndim = int(np.random.choice(3, 1)) + 1
            if ndim == 1:
                return np.random.randn(1, n, d).astype(np.float32)
            random_state = np.random.randn(n, d).astype(np.float32)
            if ndim == 3:
                random_state = random_state.reshape([1, n, d])
            return random_state

        workspace.FeedBlob(str(input_blob),
                           np.random.randn(t, n, d * 4).astype(np.float32))
        workspace.FeedBlob("hidden_init", generate_random_state(n, d))
        workspace.FeedBlob("cell_init", generate_random_state(n, d))
        workspace.FeedBlob(
            "seq_lengths",
            np.random.randint(1, t + 1, size=(n, )).astype(np.int32))
        inputs = [workspace.FetchBlob(name) for name in op.input]

        self.assertReferenceChecks(
            hu.cpu_do,
            op,
            inputs,
            ref,
            outputs_to_check=range(4),
        )

        # Checking for input, gates_t_w and gates_t_b gradients
        for param in range(5):
            self.assertGradientChecks(
                device_option=hu.cpu_do,
                op=op,
                inputs=inputs,
                outputs_to_check=param,
                outputs_with_grads=outputs_with_grads,
                threshold=0.01,
                stepsize=0.005,
            )
def train_resnet50(args):
    # Model building functions
    def create_resnet50_model_ops(model, loss_scale=1.0):
        # Creates a residual network
        [softmax, loss] = resnet.create_resnet50(
            model,
            "data",
            num_input_channels=3,
            num_labels=1000,
            label="label",
        )
        prefix = model.net.Proto().name
        loss = model.net.Scale(loss, prefix + "_loss", scale=loss_scale)
        brew.accuracy(model, [softmax, "label"], prefix + "_accuracy")
        return [loss]

    # Create ModelHelper object
    train_arg_scope = {
        'order': 'NCHW',
        'use_gpu_engine': True,
    }
    train_model = model_helper.ModelHelper(name="resnet50",
                                           arg_scope=train_arg_scope)
    reader = train_model.CreateDB("train_reader",
                                  db=args.train_data,
                                  db_type="lmdb")

    def add_image_input_ops(model):
        # utilize the ImageInput operator to prep the images
        data, label = brew.image_input(
            model,
            reader,
            ["data", "label"],
            batch_size=args.batch_size,
            # mean: to remove color values that are common
            mean=128.,
            # std is going to be modified randomly to influence the mean subtraction
            std=128.,
            # scale to rescale each image to a common size
            scale=256,
            # crop to the square each image to exact dimensions
            crop=224,
            # not running in test mode
            is_test=False,
            # mirroring of the images will occur randomly
            mirror=1,
            use_caffe_datum=False,
        )
        # prevent back-propagation: optional performance improvement; may not be observable at small scale
        data = model.net.StopGradient(data, data)

    def add_parameter_update_ops(model):
        brew.add_weight_decay(model, weight_decay)
        iter = brew.iter(model, "iter")
        lr = model.net.LearningRate(
            [iter],
            "lr",
            base_lr=base_learning_rate,
            policy="step",
            stepsize=int(10 * args.epochs_size / args.batch_size),
            gamma=0.1,
        )
        for param in model.GetParams():
            param_grad = model.param_to_grad[param]
            param_momentum = model.param_init_net.ConstantFill([param],
                                                               param +
                                                               '_momentum',
                                                               value=0.0)

            # Update param_grad and param_momentum in place
            model.net.MomentumSGDUpdate(
                [param_grad, param_momentum, lr, param],
                [param_grad, param_momentum, param],
                # almost 100% but with room to grow
                momentum=0.9,
                # netsterov is a defenseman for the Montreal Canadiens, but
                # Nesterov Momentum works slightly better than standard momentum
                nesterov=1,
            )

    dpm.Parallelize_GPU(
        train_model,
        input_builder_fun=add_image_input_ops,
        forward_pass_builder_fun=create_resnet50_model_ops,
        param_update_builder_fun=add_parameter_update_ops,
        devices=gpus,
        optimize_gradient_memory=True,
        cpu_device=True,
    )
    workspace.RunNetOnce(train_model.param_init_net)
    workspace.CreateNet(train_model.net)
    #print(train_model.net.Proto())

    for epoch in range(args.num_epochs):
        num_iters = int(args.epochs_size / args.batch_size)
        for iter in range(num_iters):
            t1 = time.time()
            workspace.RunNet(train_model.net.Proto().name)
            t2 = time.time()
            dt = t2 - t1

            print(
                ("Finished iteration {:>" + str(len(str(num_iters))) + "}/{}" +
                 " (epoch {:>" + str(len(str(args.num_epochs))) + "}/{})" +
                 " ({:.2f} images/sec)").format(iter + 1, num_iters, epoch + 1,
                                                args.num_epochs,
                                                args.batch_size / dt))

            # Get the average accuracy for the training model
            train_accuracy = accuracy(train_model)
            print("Train accuracy: {:.3f}".format(train_accuracy))
    def test_stateful_convolution_forward_only(
        self,
        sequence_length,
        conv_window,
        batch_size,
        state_size,
    ):
        '''
        This unit test demonstrates another ways of using RecurrentNetwork.

        Imagine, that you want to compute convolution over a sequence,
        but sequence elements are not given to you from the beginning,
        so you have to loop over the sequence and compute convolution
        for each element separately. This situation can occur,
        during inference/generation step of the neural networks.

        First of all, you have to provide actual input via recurrent states,
        since the input of RecurrentNetwork should be known in advance.
        Here, we use `fake_inputs` as the input,
        and it's used by the op to extract batch size and sequence length.
        The actual input sequence is stored in the recurrent state
        `input_state`. At every step we generate a new element via input_state_t
        (in this example, input_state_t is generated at random, but
        in a real situation it can be created using convolution output
        from the previous step).

        A few important differences from regular RecurrentNetwork usecase:

        1. input_state_t_prev is not only a single previous element of
        input_state sequence. It is last conv_window elements including (!)
        the current one - input_state_t. We specify that using `link_window`
        argument of RecurrentNetwork. We need that many elements to
        compute a single convolution step. Also, note that `link_window`
        specifies how many element to link starting at
        `timestep` + `link_offset` position.

        2. First few steps might require additional zero padding from the left,
        since there is no enough element of input_state sequence are available.
        So the initial_state for input_state contains several elements
        (exactly how many pads we need for the first step). Also, because of
        that all offseting over input_state sequnece is being shifted
        by length of initial_input_state: see `link_offset` and `alias_offset`
        arguments of RecurrentNetwork.

        In this test, we assert that we get the same result
        if we apply convolution over all elements simultaneously,
        since the whole input_state sequence was generated at the end.
    '''
        model = CNNModelHelper(name='model')
        fake_inputs = model.param_init_net.UniformFill(
            [],
            'fake_inputs',
            min=-1.0,
            max=1.0,
            shape=[sequence_length, batch_size, state_size],
        )
        initial_input_state = model.param_init_net.ConstantFill(
            [],
            'initial_input_state',
            value=0.0,
            shape=[conv_window - 1, batch_size, state_size],
        )
        initial_output_state = model.param_init_net.ConstantFill(
            [],
            'initial_output_state',
            value=0.0,
            shape=[1, batch_size, state_size],
        )
        step_model = CNNModelHelper(name='step_model', param_model=model)
        (
            fake_input_t,
            timestep,
            input_state_t_prev,
        ) = step_model.net.AddExternalInputs(
            'fake_input_t',
            'timestep',
            'input_state_t_prev',
        )
        conv_filter = step_model.param_init_net.XavierFill(
            [],
            'conv_filter',
            shape=[state_size, 1, conv_window, state_size],
        )
        conv_bias = step_model.param_init_net.ConstantFill(
            [],
            'conv_bias',
            shape=[state_size],
            value=0.0,
        )
        step_model.params.extend([conv_filter, conv_bias])
        input_state_t = step_model.net.UniformFill(
            [],
            'input_state_t',
            min=-1.0,
            max=1.0,
            shape=[1, batch_size, state_size],
        )
        output_state_t = self._convolution_1d(
            model=step_model,
            inputs=input_state_t_prev,
            conv_window=conv_window,
            conv_filter=conv_filter,
            conv_bias=conv_bias,
            output_name='output_state_t',
            left_pad=False,
        )
        initial_recurrent_states = [initial_input_state, initial_output_state]
        all_inputs = ([fake_inputs] + step_model.params +
                      initial_recurrent_states)
        all_outputs = ['input_state_all', 'output_state_all']
        recurrent_states = ['input_state', 'output_state']
        input_state_all, output_state_all, _ = model.net.RecurrentNetwork(
            all_inputs,
            all_outputs + ['step_workspaces'],
            param=map(all_inputs.index, step_model.params),
            alias_src=recurrent_states,
            alias_dst=all_outputs,
            alias_offset=[conv_window - 1, 1],
            recurrent_states=recurrent_states,
            initial_recurrent_state_ids=map(
                all_inputs.index,
                initial_recurrent_states,
            ),
            link_internal=map(
                str,
                [input_state_t_prev, input_state_t, output_state_t],
            ),
            link_external=['input_state', 'input_state', 'output_state'],
            link_offset=[0, conv_window - 1, 1],
            link_window=[conv_window, 1, 1],
            backward_link_internal=[],
            backward_link_external=[],
            backward_link_offset=[],
            step_net=str(step_model.net.Proto()),
            backward_step_net='',
            timestep='timestep' if timestep is None else str(timestep),
            outputs_with_grads=[],
        )

        output_states_2 = self._convolution_1d(
            model=model,
            inputs=input_state_all,
            conv_window=conv_window,
            conv_filter=conv_filter,
            conv_bias=conv_bias,
            output_name='output_states_2',
            left_pad=True,
        )

        workspace.RunNetOnce(model.param_init_net)
        workspace.RunNetOnce(model.net)

        np.testing.assert_almost_equal(
            workspace.FetchBlob(output_state_all),
            workspace.FetchBlob(output_states_2),
        )
Beispiel #30
0
# - The instantiated net's Run() function is called
#
# Before we do anything, we should clear any earlier workspace variables with `ResetWorkspace()`.
#
# Then there are two ways to run a net from Python. We will do the first option in the example below.
#
# 1. Call `workspace.RunNetOnce()`, which instantiates, runs and immediately destructs the network
# 2. Call `workspace.CreateNet()` to create the C++ net object owned by the workspace, then call `workspace.RunNet()`, passing the name of the network to it
#
#

# In[23]:

workspace.ResetWorkspace()
print("Current blobs in the workspace: {}".format(workspace.Blobs()))
workspace.RunNetOnce(net)
print("Blobs in the workspace after execution: {}".format(workspace.Blobs()))
# Let's dump the contents of the blobs
for name in workspace.Blobs():
    print("{}:\n{}".format(name, workspace.FetchBlob(name)))

# Now let's try the second way to create the net, and run it. First, clear the variables with `ResetWorkspace()`. Then create the net with the workspace's `net` object that we created earlier using `CreateNet(net_object)`. Finally, run the net with `RunNet(net_name)`.

# In[24]:

workspace.ResetWorkspace()
print("Current blobs in the workspace: {}".format(workspace.Blobs()))
workspace.CreateNet(net)
workspace.RunNet(net.Proto().name)
print("Blobs in the workspace after execution: {}".format(workspace.Blobs()))
for name in workspace.Blobs():