Example #1
def delete_optimizer_pass(program, config):
    def _delete_optimizer_op_and_vars(_program, optimize_ops):
        optimize_vars = []
        optimize_op_role_vars = []
        optimize_need_delete_vars = []

        for op in optimize_ops:
            optimize_vars.extend(op.input_arg_names)
            optimize_op_role_vars.extend(op.attr("op_role_var"))

        optimize_vars = list(set(optimize_vars))
        optimize_op_role_vars = list(set(optimize_op_role_vars))

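        # op_role_var on an optimize op typically holds the (param, grad) pair it
        # updates; every other optimizer input (learning rate, moments, beta pow
        # accumulators, ...) is only needed for the update itself and is deleted below.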
        for var in optimize_vars:
            if var not in optimize_op_role_vars:
                optimize_need_delete_vars.append(var)
        need_delete_optimize_vars = list(set(optimize_need_delete_vars))

        delete_ops(_program.global_block(), optimize_ops)
        for var in need_delete_optimize_vars:
            if _program.global_block().has_var(var):
                _program.global_block()._remove_var(var)

    optimizer_ops = _get_optimize_ops(program)
    lr_ops = _get_lr_ops(program)
    optimizer_ops.extend(lr_ops)
    _delete_optimizer_op_and_vars(program, optimizer_ops)

    return program
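
A minimal usage sketch for the pass above, assuming origin_program is the trainer-side fluid.Program to be trimmed and compiled_config stands in for the fleet compile-time configuration object passed as config (both names here are placeholders, not taken from the source):

def build_trainer_program(origin_program, compiled_config):
    # work on a clone so the original program keeps its optimizer ops
    program = origin_program.clone()
    # strip optimizer and LR ops plus the variables only those ops used
    program = delete_optimizer_pass(program, compiled_config)
    return program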
Example #2
def delete_optimizer_pass(program, config):
    def _delete_optimizer_op_and_vars(_program, optimize_ops):
        optimize_vars = []
        optimize_op_role_vars = []
        optimize_need_delete_vars = []

        for op in optimize_ops:
            optimize_vars.extend(op.input_arg_names)
            optimize_op_role_vars.extend(op.attr("op_role_var"))

        optimize_vars = list(set(optimize_vars))
        optimize_op_role_vars = list(set(optimize_op_role_vars))

        for var in optimize_vars:
            if var not in optimize_op_role_vars:
                optimize_need_delete_vars.append(var)
        need_delete_optimize_vars = list(set(optimize_need_delete_vars))

        delete_ops(_program.global_block(), optimize_ops)
        for var in need_delete_optimize_vars:
            if _program.global_block().has_var(var):
                _program.global_block()._remove_var(var)

    def _add_lr_var(main_program, compiled_config):
        # TODO: the learning-rate variable name is hard-coded here for pe
        lr_var = compiled_config.origin_main_program.global_block(
        ).vars["learning_rate_0"]
        main_program.global_block().create_var(name=lr_var.name,
                                               shape=lr_var.shape,
                                               dtype=lr_var.dtype,
                                               type=lr_var.type,
                                               lod_level=lr_var.lod_level,
                                               persistable=True)

    optimizer_ops = _get_optimize_ops(program)
    lr_ops = _get_lr_ops(program)
    optimizer_ops.extend(lr_ops)
    _delete_optimizer_op_and_vars(program, optimizer_ops)

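    # the pass above also removed the LR ops and their variables; if the origin
    # program carries an LR scheduler, the learning-rate variable is re-created
    # explicitly (with the hard-coded name above) so the trimmed program still has it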
    if hasattr(config.origin_main_program, 'lr_sheduler'):
        _add_lr_var(program, config)

    return program
Example #3
    def _init_worker(self):
        def sync_strategy_envs():
            kwargs = {}
            kwargs["pserver_endpoints"] = self.role_maker._get_pserver_endpoints()
            kwargs["trainer_id"] = self.role_maker._worker_index()
            return kwargs

        def geo_strategy_envs():
            from paddle.fluid.incubate.fleet.parameter_server.ir.public import get_sparse_tablenames

            def get_sparse_attrs():
                opt_init_map = {}
                opt_init_map["gaussian_random"] = ["seed", "mean", "std"]
                opt_init_map["fill_constant"] = ["value"]
                opt_init_map["uniform_random"] = ["seed", "min", "max"]
                opt_init_map["truncated_gaussian_random"] = [
                    "seed", "mean", "std"
                ]

                dist_varnames = get_sparse_tablenames(self.origin_main_program,
                                                      True)
                sparse_varnames = get_sparse_tablenames(
                    self.origin_main_program, False)

                if len(dist_varnames) != 0:
                    raise ValueError(
                        "GeoStrategy can not support large scale embeding now, please use fluid.layers.embedding"
                    )

                init_attrs = []
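                # each entry is serialized as
                #   "<var_name>:<dim0,dim1,...>:<init_op_type>&<attr_val>&..."
                # and all entries are joined with "#" before being passed to the
                # communicator as the "sparse_attrs" kwarg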
                for value_name in sparse_varnames:
                    value_var = self.origin_main_program.global_block(
                    ).vars[value_name]
                    value_attr = [
                        value_name,
                        ",".join([str(dim) for dim in value_var.shape])
                    ]
                    for op in self.origin_startup_program.global_block().ops:
                        if (op.type in opt_init_map
                                and value_name == op.output("Out")[0]):
                            init_attr = [op.type]
                            for attr in opt_init_map[op.type]:
                                init_attr.append(str(op.attr(attr)))
                            value_attr.append("&".join(init_attr))
                            init_attrs.append(":".join(value_attr))
                            break
                return "#".join(init_attrs)

            kwargs = {}
            kwargs["trainers"] = self.role_maker._worker_num()
            kwargs["sparse_attrs"] = get_sparse_attrs()
            return kwargs

        from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_lr_ops, _has_global_step

        from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import \
            SyncStrategy, GeoStrategy

        trainer_config = self.async_strategy.get_trainer_runtime_config()
        print(trainer_config)

        dist_strategy = self.context["valid_strategy"]
        launch_barrier = dist_strategy.a_sync_configs["launch_barrier"]
        if launch_barrier:
            # trainers wait until all parameter servers are ready
            wait_server_ready(self.role_maker._get_pserver_endpoints())

            # in ps-heter mode, also wait until the heter workers are ready
            if (self.role_maker._is_heter_parameter_server_mode
                    and self.role_maker._is_worker()):
                wait_server_ready(
                    self.role_maker._get_heter_worker_endpoints())

        lrs = _has_global_step(_get_lr_ops(self.origin_main_program))

        if lrs:
            kwargs = {"need_global_step": "1"}
        else:
            kwargs = {"need_global_step": "0"}

        if isinstance(self.async_strategy, GeoStrategy):
            geo_kwargs = geo_strategy_envs()
            kwargs.update(geo_kwargs)
        if isinstance(self.async_strategy, SyncStrategy):
            sync_kwargs = sync_strategy_envs()
            kwargs.update(sync_kwargs)

        kwargs = kwargs if kwargs else None

        send_ctx = self.compiled_strategy.get_communicator_send_context()

        if self.compiled_strategy.is_geo_mode():
            recv_ctx = self.compiled_strategy.get_communicator_recv_context(
                recv_type=4)
        else:
            recv_ctx = self.compiled_strategy.get_communicator_recv_context(
                recv_type=1)

        from paddle.fluid.communicator import Communicator
        self._communicator = Communicator(
            trainer_config.mode, kwargs,
            trainer_config.get_communicator_flags())
        self._communicator.init_with_ctx(send_ctx, recv_ctx)

        if not self._communicator.is_running():
            self._communicator.start()
        else:
            warnings.warn("communicator has been initialized, skip")
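
For orientation, a sketch of what kwargs can look like just before the Communicator is constructed, one case per strategy branch above; the endpoints, trainer id, trainer count, and sparse-attr string are made-up placeholders:

# AsyncStrategy (neither branch taken):
#     {"need_global_step": "0"}
# GeoStrategy (geo_strategy_envs merged in):
#     {"need_global_step": "0", "trainers": 2,
#      "sparse_attrs": "emb:10,8:uniform_random&0&-1.0&1.0"}
# SyncStrategy (sync_strategy_envs merged in):
#     {"need_global_step": "0",
#      "pserver_endpoints": ["127.0.0.1:6170", "127.0.0.1:6171"],
#      "trainer_id": 0}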
Example #4
def add_optimizer_pass(program, config):
    def _append_pserver_grad_merge_ops(optimize_block, grad_varname_for_block,
                                       endpoint, grad_to_block_id):
        trainers = config.get_trainers()

        program = optimize_block.program
        pserver_block = program.global_block()
        grad_block = None

        for g in config.param_grad_ep_mapping[endpoint]["grads"]:
            if _orig_varname(g.name) == \
                    _orig_varname(grad_varname_for_block):
                grad_block = g
                break

        if not grad_block:
            # do not append this op if the current endpoint
            # is not in charge of this grad block
            return None

        orig_varname, block_name, trainer_name = _get_varname_parts(
            grad_block.name)

        if block_name:
            merged_var_name = '.'.join([orig_varname, block_name])
        else:
            merged_var_name = orig_varname

        merged_var = pserver_block.create_var(
            name=grad_block.name,
            persistable=True,
            type=grad_block.type,
            dtype=grad_block.dtype,
            shape=grad_block.shape)

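        # recorded as "<merged grad var name>:<optimize block index>"; the list is
        # later attached to the listen_and_serv op so the pserver can route each
        # incoming gradient to its optimize block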
        grad_to_block_id.append(merged_var.name + ":" + str(optimize_block.idx))
        if config.is_sync_mode() and trainers > 1:
            vars2merge = []
            for i in range(trainers):
                per_trainer_name = "%s.trainer_%d" % \
                                   (merged_var_name, i)
                per_trainer_var = pserver_block.create_var(
                    name=per_trainer_name,
                    persistable=False,
                    type=grad_block.type,
                    dtype=grad_block.dtype,
                    shape=grad_block.shape)
                vars2merge.append(per_trainer_var)

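            # in sync mode each trainer sends its own copy of the gradient; the sum
            # and scale ops below average those copies into merged_var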
            optimize_block.append_op(
                type="sum",
                inputs={"X": vars2merge},
                outputs={"Out": merged_var},
                attrs={"use_mkldnn": False})
            optimize_block.append_op(
                type="scale",
                inputs={"X": merged_var},
                outputs={"Out": merged_var},
                attrs={"scale": 1.0 / float(trainers)})
        return merged_var

    origin_program = config.get_origin_main_program()
    origin_program = origin_program.clone()
    ps_endpoint = config.get_ps_endpoint()

    opt_op_on_pserver = []
    # Iterate over the ops; if an op belongs to the set of optimize ops
    # located on the current pserver, append it to the sub-program.
    global_ops = []
    # sparse grad name to param name
    sparse_grad_to_param = []

    def _is_opt_op_on_pserver(endpoint, op):
        param_names = [
            p.name for p in config.param_grad_ep_mapping[endpoint]["params"]
        ]

        unmerged_varnames = []
        merged_varnames = []
        merged_ordernames = []

        for name in param_names:
            orig_varname = _orig_varname(name)

            for pairs in config.merged_variables_pairs:
                merged_p = pairs[0]
                if merged_p.merged_var.name == orig_varname:
                    if merged_p.merged_var.name == merged_p.ordered_vars[
                            0].name:
                        unmerged_varnames.append(merged_p.ordered_vars[0].name)
                    else:
                        merged_varnames.append(merged_p.merged_var.name)
                        merged_ordernames.append(merged_p.ordered_vars[0].name)
                    break

        param = op.input("Param")[0]

        if param in unmerged_varnames:
            return True

        for i in range(len(merged_ordernames)):
            if param == merged_ordernames[i]:
                merged_p = merged_varnames[i]
                merged_g = "{}@GRAD".format(merged_varnames[i])
                op._set_attr(OP_ROLE_VAR_ATTR_NAME, [merged_p, merged_g])
                return True
        return False

    def __append_optimize_op__(op, block, grad_to_block_id, merged_var, lr_ops):
        if _is_optimizer_op(op):
            _append_pserver_ops(block, op, ps_endpoint, grad_to_block_id,
                                origin_program, merged_var,
                                sparse_grad_to_param, config)
        elif op not in lr_ops:
            _append_pserver_non_opt_ops(block, op, origin_program, config)

    optimize_ops = _get_optimize_ops(origin_program)
    for _, op in enumerate(optimize_ops):
        if _is_optimizer_op(op) and _is_opt_op_on_pserver(ps_endpoint, op):
            opt_op_on_pserver.append(op)

    # append lr decay ops to a child block if any exist
    lr_ops = _get_lr_ops(origin_program)
    has_lr_decay = len(lr_ops) > 0
    lr_decay_block_id = -1
    optimize_blocks = []

    if has_lr_decay:
        counter_increment_idx = -1
        for idx, op in enumerate(lr_ops):
            if op.type != 'increment':
                continue
            counter = op.input("X")[0]
            if counter == LEARNING_RATE_DECAY_COUNTER:
                counter_increment_idx = idx
                break

        if counter_increment_idx != -1:
            lr_ops.pop(counter_increment_idx)

        lr_decay_block = program._create_block(program.num_blocks - 1)
        optimize_blocks.append(lr_decay_block)
        for op in lr_ops:
            cloned_op = _append_pserver_non_opt_ops(lr_decay_block, op,
                                                    origin_program, config)
            # append sub blocks to pserver_program in lr_decay_op
            # todo(tangwei12): __clone_lr_op_sub_block__
        lr_decay_block_id = lr_decay_block.idx

    # append op to the current block
    grad_to_block_id = []
    pre_block_idx = program.num_blocks - 1

    for idx, opt_op in enumerate(opt_op_on_pserver):
        per_opt_block = program._create_block(pre_block_idx)
        optimize_blocks.append(per_opt_block)
        optimize_target_param_name = opt_op.attr(OP_ROLE_VAR_ATTR_NAME)[0]
        # append grad merging ops before clip and weight decay,
        # e.g. merge grad -> L2Decay op -> clip op -> optimize
        merged_var = None
        for _, op in enumerate(optimize_ops):
            # find the original grad var before clipping / L2Decay;
            # merged_var should be the input var name of L2Decay
            grad_varname_for_block = op.attr(OP_ROLE_VAR_ATTR_NAME)[1]
            if op.attr(OP_ROLE_VAR_ATTR_NAME)[0] == optimize_target_param_name:
                merged_var = _append_pserver_grad_merge_ops(
                    per_opt_block, grad_varname_for_block, ps_endpoint,
                    grad_to_block_id)
                if merged_var:
                    break  # append optimize op once then append other ops.

        if merged_var:
            for _, op in enumerate(optimize_ops):
                # optimizer is connected to itself
                if op.attr(OP_ROLE_VAR_ATTR_NAME)[0] == optimize_target_param_name and \
                        op not in global_ops:
                    __append_optimize_op__(op, per_opt_block, grad_to_block_id,
                                           merged_var, lr_ops)

    # dedup grad to ids list
    grad_to_block_id = list(set(grad_to_block_id))
    # append global ops
    if global_ops:
        opt_state_block = program._create_block(program.num_blocks - 1)
        optimize_blocks.append(opt_state_block)
        for glb_op in global_ops:
            __append_optimize_op__(glb_op, opt_state_block, grad_to_block_id,
                                   None, lr_ops)

    if len(optimize_blocks) == 0:
        pre_block_idx = program.num_blocks - 1
        empty_block = program._create_block(pre_block_idx)
        optimize_blocks.append(empty_block)

    op = get_op_by_type(program.global_block(), "listen_and_serv")
    op._set_attr("optimize_blocks", optimize_blocks)
    op._set_attr("grad_to_block_id", grad_to_block_id)
    op._set_attr("sparse_grad_to_param", sparse_grad_to_param)
    op._set_attr("lr_decay_block_id", lr_decay_block_id)
    return program
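
For reference, a sketch of the attributes this pass attaches to the listen_and_serv op, with made-up variable names and block indices:

# optimize_blocks      -> [optional lr decay block, one block per parameter's optimize ops, optional global-ops block]
# grad_to_block_id     -> ["fc_0.w_0@GRAD:2", "fc_0.b_0@GRAD:3"]   ("<merged grad var name>:<block idx>")
# sparse_grad_to_param -> entries appended by _append_pserver_ops for sparse gradient updates
# lr_decay_block_id    -> index of the lr decay block, or -1 when there is none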
Example #5
    def _init_transpiler_worker(self):
        """
        `init_worker` has many many functions to do before training,
        first, wait for all parameter servers launch completely.
        second, run executor to initialize startup program
        third, wait for all worker initialize completely.

        Returns:
            None
        """
        def sync_strategy_envs():
            kwargs = {}
            kwargs["pserver_endpoints"] = self._role_maker.get_pserver_endpoints()
            kwargs["trainer_id"] = self._role_maker.worker_index()
            return kwargs

        def geo_strategy_envs():
            def get_sparse_attrs():
                opt_init_map = {}
                opt_init_map["gaussian_random"] = ["seed", "mean", "std"]
                opt_init_map["fill_constant"] = ["value"]
                opt_init_map["uniform_random"] = ["seed", "min", "max"]
                opt_init_map["truncated_gaussian_random"] = [
                    "seed", "mean", "std"
                ]

                dist_varnames = get_sparse_tablenames(
                    self._origin_main_program, True)
                sparse_varnames = get_sparse_tablenames(
                    self._origin_main_program, False)

                if len(dist_varnames) != 0:
                    raise ValueError(
                        "GeoStrategy can not support large scale embeding now, please use fluid.layers.embedding"
                    )

                init_attrs = []
                for value_name in sparse_varnames:
                    value_var = self._origin_main_program.global_block(
                    ).vars[value_name]
                    value_attr = [
                        value_name,
                        ",".join([str(dim) for dim in value_var.shape])
                    ]
                    for op in self._origin_startup_program.global_block().ops:
                        if (op.type in opt_init_map
                                and value_name == op.output("Out")[0]):
                            init_attr = [op.type]
                            for attr in opt_init_map[op.type]:
                                init_attr.append(str(op.attr(attr)))
                            value_attr.append("&".join(init_attr))
                            init_attrs.append(":".join(value_attr))
                            break
                return "#".join(init_attrs)

            kwargs = {}
            kwargs["trainers"] = self.worker_num()
            kwargs["sparse_attrs"] = get_sparse_attrs()
            return kwargs

        # if MPISymetricRoleMaker is used, we assume the user
        # wants to submit the job on an MPI cluster

        if isinstance(self._role_maker, MPISymetricRoleMaker):
            # check whether server has been initialized
            wait_server_ready(self.server_endpoints(to_string=False))

        trainer_config = self._strategy.get_trainer_runtime_config()

        print(trainer_config)

        lrs = _has_global_step(_get_lr_ops(self._origin_main_program))

        if lrs > 0:
            kwargs = {"need_global_step": "1"}
        else:
            kwargs = {"need_global_step": "0"}

        if isinstance(self._strategy, GeoStrategy):
            geo_kwargs = geo_strategy_envs()
            kwargs.update(geo_kwargs)
        if isinstance(self._strategy, SyncStrategy):
            sync_kwargs = sync_strategy_envs()
            kwargs.update(sync_kwargs)

        kwargs = kwargs if kwargs else None

        send_ctx = fleet.compiled_config.get_communicator_send_context()

        if fleet.compiled_config.is_geo_mode():
            recv_ctx = fleet.compiled_config.get_communicator_recv_context(
                recv_type=4)
        else:
            recv_ctx = fleet.compiled_config.get_communicator_recv_context(
                recv_type=1)

        from paddle.fluid.communicator import Communicator
        self._communicator = Communicator(
            trainer_config.mode, kwargs,
            trainer_config.get_communicator_flags())

        self._communicator.init_with_ctx(send_ctx, recv_ctx)

        if not self._communicator.is_running():
            self._communicator.start()
        else:
            raise ValueError(
                "Communicator can only be initialized once, please check")