Example 1
    def backward_value_helper(self, cond_func, use_cuda, use_parallel_exe):
        """
        Helper function that checks that the computed backward gradient is close to the numerical dy/dx
        """
        main_program = Program()
        main_program.random_seed = 123
        startup_program = Program()
        startup_program.random_seed = 123
        with program_guard(main_program, startup_program):
            img = fluid.data(name='image', shape=[-1, 9], dtype='float32')
            img.stop_gradient = False
            label = fluid.data(name='label', shape=[-1, 1], dtype='int64')
            i = fluid.data(name="i", shape=[1], dtype='int32')
            loss = cond_func(i, img, label)
            append_backward(loss)
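            # append_backward adds the gradient ops to main_program, so the
            # analytic gradient of `img` can be fetched via img.grad_name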
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup_program)

        num_devices = 1
        if use_parallel_exe:
            os.environ['CPU_NUM'] = str(2)
            exe = fluid.ParallelExecutor(use_cuda=use_cuda,
                                         main_program=main_program,
                                         loss_name=loss.name)
            num_devices = exe.device_count

        delta = 0.005
        for feed_i in range(0, 10):
            feed_img = np.random.random(size=[1, 9]).astype(np.float32)
            feed_label = np.random.randint(low=0,
                                           high=10,
                                           size=[1, 1],
                                           dtype=np.int64)
            if use_parallel_exe:
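                # ParallelExecutor consumes one sample per device, so every
                # feed is replicated num_devices times along axis 0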
                img_grad, loss_value = exe.run(
                    feed={
                        'i': np.full((num_devices), feed_i, np.int32),
                        'image': np.repeat(feed_img, num_devices, axis=0),
                        'label': np.repeat(feed_label, num_devices, axis=0)
                    },
                    fetch_list=[img.grad_name, loss.name])
            else:
                img_grad, loss_value = exe.run(
                    main_program,
                    feed={
                        'i': np.full((1), feed_i, np.int32),
                        'image': feed_img,
                        'label': feed_label
                    },
                    fetch_list=[img.grad_name, loss.name])

            numerical_grad = np.zeros(shape=[num_devices, 9], dtype=np.float32)
            feed_img_delta = np.copy(feed_img)
            for j in range(9):
                feed_img_delta[0][j] = feed_img[0][j] + delta
                if use_parallel_exe:
                    loss_delta = exe.run(feed={
                        'i':
                        np.full((num_devices), feed_i, np.int32),
                        'image':
                        np.repeat(feed_img_delta, num_devices, axis=0),
                        'label':
                        np.repeat(feed_label, num_devices, axis=0)
                    },
                                         fetch_list=[loss.name])
                    multi_device_grad = (loss_delta[0] -
                                         loss_value[0]) / delta / num_devices
                    for d in range(num_devices):
                        numerical_grad[d][j] = multi_device_grad[d]
                else:
                    loss_delta = exe.run(main_program,
                                         feed={
                                             'i': np.full((1), feed_i,
                                                          np.int32),
                                             'image': feed_img_delta,
                                             'label': feed_label
                                         },
                                         fetch_list=[loss.name])
                    numerical_grad[0][j] = (loss_delta[0] -
                                            loss_value[0]) / delta
                feed_img_delta[0][j] = feed_img[0][j]
            self.assertTrue(
                np.isclose(img_grad, numerical_grad, atol=0.05,
                           rtol=0.05).all())
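
    # The check above uses a one-sided finite difference,
    #   df/dx_j ~= (f(x + delta * e_j) - f(x)) / delta,
    # applied to one input element at a time. A central difference,
    # (f(x + d) - f(x - d)) / (2 * d), would halve the truncation error at
    # the cost of a second perturbed forward pass per element.
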
    def get_pserver_program(self, endpoint):
        """
        Get parameter server side program.

        Args:
            endpoint (str): current parameter server endpoint.

        Returns:
            Program: the program for current parameter server to run.
        """
        # TODO(panyx0718): Revisit this assumption. what if #blocks > #pservers.
        # NOTE: assume blocks of the same variable are not distributed
        # on the same pserver; only change param/grad varnames for
        # trainers to fetch.
        sys.stderr.write(
            "get_pserver_program() is deprecated, call get_pserver_programs() to get pserver main and startup in a single call.\n"
        )
        # step 1
        pserver_program = Program()
        pserver_program.random_seed = self.origin_program.random_seed
        pserver_program._copy_dist_param_info_from(self.origin_program)

        # step 2: create vars at the parameter server to receive vars from trainers.
        recv_inputs = []
        for v in self.param_grad_ep_mapping[endpoint]["params"]:
            self._clone_var(pserver_program.global_block(), v)
        for v in self.param_grad_ep_mapping[endpoint]["opti"]:
            # create vars for each trainer in the global scope, so
            # we don't need to create them when the grad arrives.
            # change the client-side var name back to the origin name by
            # removing the ".opti.trainer_%d" suffix
            suff_idx = v.name.find(".opti.trainer_")
            if suff_idx >= 0:
                orig_var_name = v.name[:suff_idx]
            else:
                orig_var_name = v.name
            # NOTE: single_trainer_var must be created for multi-trainer
            # case to merge grads from multiple trainers
            single_trainer_var = pserver_program.global_block().var(
                orig_var_name)

            if self.sync_mode and self.trainer_num > 1:
                for trainer_id in range(self.trainer_num):
                    var = pserver_program.global_block().create_var(
                        name="%s.opti.trainer_%d" %
                        (orig_var_name, trainer_id),
                        persistable=False,
                        type=v.type,
                        dtype=v.dtype,
                        shape=v.shape)
                    recv_inputs.append(var)

        # step 3
        # Create a union-find data structure from the optimize ops:
        # if two ops are connected, they are added to the same set.
        ufind = self._create_ufind(self.optimize_ops)
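        # Sketch of the disjoint-set behavior this step relies on
        # (illustrative only, not Paddle's actual UnionFind class):
        #
        #   class UnionFind(object):
        #       def __init__(self, n):
        #           self._parent = list(range(n))
        #       def find(self, i):
        #           while self._parent[i] != i:
        #               i = self._parent[i]
        #           return i
        #       def union(self, a, b):
        #           self._parent[self.find(a)] = self.find(b)
        #       def is_connected(self, a, b):
        #           return self.find(a) == self.find(b)
        #
        # Ops that share a variable are union()ed, so each connected
        # component becomes one candidate optimize set.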
        # step 3.2
        # Iterate through the ops and collect the optimize ops
        # located on the current pserver
        opt_op_on_pserver = []
        for _, op in enumerate(self.optimize_ops):
            if self._is_optimizer_op(op) and self._is_opt_op_on_pserver(
                    endpoint, op):
                opt_op_on_pserver.append(op)

        # step 3.4
        # Iterate through the ops; if an op is in the same set as an
        # optimize op located on the current pserver, append it to the
        # sub-program.

        global_ops = []

        # sparse grad name to param name
        sparse_grad_to_param = []

        # append lr decay ops to the child block, if any exist
        lr_ops = self._get_lr_ops()
        # record the optimize blocks so the pserver can run them in parallel
        opti_blocks = []

        # append the ops for each optimize var to its own sub-block
        grad_to_block_id = []
        pre_block_idx = pserver_program.num_blocks - 1
        for idx, opt_op in enumerate(self._opti_var_list):
            per_opt_block = pserver_program._create_block(pre_block_idx)
            opti_blocks.append(per_opt_block)
            optimize_target_param_name = self._opti_to_param[opt_op]
            pserver_block = per_opt_block.program.global_block()
            # append grad merging ops before clip and weight decay
            # e.g. merge grad -> L2Decay op -> clip op -> optimize
            merged_var = pserver_block.vars[optimize_target_param_name]
            if self.sync_mode and self.trainer_num > 1:
                vars2merge = []
                for i in range(self.trainer_num):
                    per_trainer_name = "%s.opti.trainer_%d" % \
                                       (optimize_target_param_name, i)
                    vars2merge.append(pserver_block.vars[per_trainer_name])
                per_opt_block.append_op(type="sum",
                                        inputs={"X": vars2merge},
                                        outputs={"Out": merged_var},
                                        attrs={"use_mkldnn": False})
                per_opt_block.append_op(
                    type="scale",
                    inputs={"X": merged_var},
                    outputs={"Out": merged_var},
                    attrs={"scale": 1.0 / float(self.trainer_num)})

        # In some cases a parameter server has no parameter to optimize,
        # so we give it an empty optimize block.
        attrs = {
            "optimize_blocks": opti_blocks,
            "endpoint": endpoint,
            "Fanin": self.trainer_num,
            "sync_mode": self.sync_mode,
        }

        # step 5: append the listen_and_serv op
        pserver_program.global_block().append_op(type="fl_listen_and_serv",
                                                 inputs={'X': recv_inputs},
                                                 outputs={},
                                                 attrs=attrs)

        pserver_program._sync_with_cpp()
        # save the pserver program; the pserver-side startup program
        # is generated from it later.
        self.pserver_program = pserver_program
        return pserver_program
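
    # Typical pserver-side usage, as a sketch (the transpiler class name and
    # the transpile() arguments are placeholders, not the actual API of this
    # snippet's class):
    #
    #   t = FLTranspiler()                # hypothetical name
    #   t.transpile(...)
    #   server_prog = t.get_pserver_program(current_endpoint)
    #   startup_prog = t.get_startup_program(current_endpoint, server_prog)
    #   exe.run(startup_prog)
    #   exe.run(server_prog)              # blocks in fl_listen_and_serv
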
    def get_startup_program(self,
                            endpoint,
                            pserver_program=None,
                            startup_program=None):
        """
        **Deprecated**

        Get startup program for current parameter server.
        Modify operator input variables if there are variables that
        were split to several blocks.

        Args:
            endpoint (str): current pserver endpoint.
            pserver_program (Program): deprecated, call get_pserver_program first.
            startup_program (Program): deprecated, should pass startup_program
                when initializing.

        Returns:
            Program: parameter server side startup program.
        """
        s_prog = Program()
        orig_s_prog = self.startup_program
        s_prog.random_seed = orig_s_prog.random_seed
        params = self.param_grad_ep_mapping[endpoint]["params"]

        def _get_splited_name_and_shape(varname):
            for idx, splited_param in enumerate(params):
                pname = splited_param.name
                if same_or_split_var(pname, varname) and varname != pname:
                    return pname, splited_param.shape
            return "", []
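
        # same_or_split_var(pname, varname) matches varname itself or one of
        # its split pieces; split pieces follow the "<name>.block<idx>"
        # naming convention (e.g. "fc_0.w_0.block1").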

        # 1. clone vars from the pserver program into the startup program
        pserver_vars = pserver_program.global_block().vars
        created_var_map = collections.OrderedDict()
        for _, var in six.iteritems(pserver_vars):
            tmpvar = s_prog.global_block()._clone_variable(var)
            created_var_map[var.name] = tmpvar

        # 2. rename op outputs
        for op in orig_s_prog.global_block().ops:
            new_outputs = collections.OrderedDict()
            # do not append startup op if var is not on this pserver
            op_on_pserver = False
            # TODO(gongwb): remove this line.
            if op.type not in ["recv", "fetch_barrier", "concat"]:
                for key in op.output_names:
                    newname, _ = _get_splited_name_and_shape(op.output(key)[0])
                    if newname:
                        op_on_pserver = True
                        new_outputs[key] = created_var_map[newname]
                    elif op.output(key)[0] in pserver_vars:
                        op_on_pserver = True
                        new_outputs[key] = pserver_vars[op.output(key)[0]]

            if op_on_pserver:
                # most startup program ops have no inputs
                new_inputs = self._get_input_map_from_op(pserver_vars, op)

                if op.type in [
                        "gaussian_random", "fill_constant", "uniform_random",
                        "truncated_gaussian_random"
                ]:
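                    # a split piece may have a different shape than the
                    # original var, so reset the initializer's shape attr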
                    op._set_attr("shape", list(new_outputs["Out"].shape))
                s_prog.global_block().append_op(type=op.type,
                                                inputs=new_inputs,
                                                outputs=new_outputs,
                                                attrs=op.all_attrs())

        return s_prog
Example 4
def static(train_data,
           loss_in_switch=True,
           use_cuda=False,
           use_parallel_exe=False):
    startup_program = Program()
    main_program = Program()
    startup_program.random_seed = SEED
    main_program.random_seed = SEED

    with program_guard(main_program, startup_program):

        def double_fc_net(image):
            hidden = layers.fc(
                image,
                size=FC_SIZE,
                act='relu',
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Constant(value=0.99)),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Constant(value=0.5)),
                name="hidden")

            prediction = layers.fc(
                hidden,
                size=CLASS_NUM,
                act='softmax',
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Constant(value=1.2)),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Constant(value=0.8)),
                name="prediction")
            return hidden, prediction

        def fn_1(opt, avg_loss=None, pred=None, label=None):
            if avg_loss is None:
                loss = layers.cross_entropy(input=pred, label=label)
                avg_loss = layers.mean(loss, name='mean_cross_entropy_loss')
            opt.minimize(avg_loss)
            return avg_loss

        def fn_2(opt, avg_loss=None, pred=None, label=None):
            if avg_loss is None:
                loss = layers.softmax_with_cross_entropy(logits=pred,
                                                         label=label)
                avg_loss = layers.mean(loss, name='mean_softmax_loss')
            opt.minimize(avg_loss)
            return avg_loss

        image = fluid.data('image', [BATCH_SIZE, INPUT_SIZE], 'float32')
        label = fluid.data('label', [BATCH_SIZE, 1], 'int64')
        hidden, prediction = double_fc_net(image)

        adam = optimizer.Adam(learning_rate=LR)
        sgd = optimizer.SGD(learning_rate=LR)

        id = fluid.data('id', [1], 'int32')
        two = layers.fill_constant([1], 'int32', 2)
        mod_two = layers.elementwise_mod(id, two) == 0
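        # `==` on a Variable is overloaded to append an `equal` op, so
        # mod_two is a bool Variable usable as a layers.case predicate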

        if loss_in_switch:
            avg_loss = layers.case(
                [(mod_two, lambda: fn_1(adam, None, prediction, label))],
                lambda: fn_2(sgd, None, prediction, label))
        else:
            loss_1 = layers.cross_entropy(input=prediction, label=label)
            avg_loss_1 = layers.mean(loss_1)
            loss_2 = layers.softmax_with_cross_entropy(logits=prediction,
                                                       label=label)
            avg_loss_2 = layers.mean(loss_2)
            avg_loss = layers.case([(mod_two, lambda: fn_1(adam, avg_loss_1))],
                                   lambda: fn_2(sgd, avg_loss_2))
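        # layers.case lowers each branch into its own sub-block; at runtime
        # only the branch whose predicate holds executes, so Adam updates run
        # on even ids and SGD updates on odd ids.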

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_program)

    for epoch in range(EPOCH_NUM):
        feed_image, feed_label = train_data[epoch]
        fetch_list = [hidden, prediction, avg_loss]
        feed = {
            'image': feed_image,
            'label': feed_label,
            'id': np.array([epoch]).astype('int32')
        }
        out = exe.run(main_program, feed=feed, fetch_list=fetch_list)
        out_hidden, out_pred, loss = out

    return out_hidden, out_pred, loss
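
# Minimal driver sketch. It assumes the module-level constants referenced
# above (SEED, BATCH_SIZE, INPUT_SIZE, CLASS_NUM, FC_SIZE, LR, EPOCH_NUM)
# are defined elsewhere in the file; the random data is purely illustrative:
#
#   train_data = [(np.random.random((BATCH_SIZE, INPUT_SIZE)).astype('float32'),
#                  np.random.randint(0, CLASS_NUM,
#                                    (BATCH_SIZE, 1)).astype('int64'))
#                 for _ in range(EPOCH_NUM)]
#   hidden, pred, loss = static(train_data, loss_in_switch=True)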