Example #1
def delete_optimizer_pass(program, config):
    def _delete_optimizer_op_and_vars(_program, optimize_ops):
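        # delete the given optimizer ops, plus every input variable of theirs
        # that does not also appear in op_role_var (the param/grad pairs)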
        optimize_vars = []
        optimize_op_role_vars = []
        optimize_need_delete_vars = []

        for op in optimize_ops:
            optimize_vars.extend(op.input_arg_names)
            optimize_op_role_vars.extend(op.attr("op_role_var"))

        optimize_vars = list(set(optimize_vars))
        optimize_op_role_vars = list(set(optimize_op_role_vars))

        for var in optimize_vars:
            if var not in optimize_op_role_vars:
                optimize_need_delete_vars.append(var)
        need_delete_optimize_vars = list(set(optimize_need_delete_vars))

        delete_ops(_program.global_block(), optimize_ops)
        for var in need_delete_optimize_vars:
            if _program.global_block().has_var(var):
                _program.global_block()._remove_var(var)

    optimizer_ops = _get_optimize_ops(program)
    lr_ops = _get_lr_ops(program)
    optimizer_ops.extend(lr_ops)
    _delete_optimizer_op_and_vars(program, optimizer_ops)

    return program
Example #2
def delet_extra_optimizes_pass(program, config):
    optimize_vars = []
    optimize_op_role_vars = []
    optimize_need_delete_vars = []

    origin_program = config.get_origin_main_program()
    for op in _get_optimize_ops(origin_program):
        optimize_vars.extend(op.input_arg_names)
        optimize_op_role_vars.extend(op.attr("op_role_var"))

    optimize_vars = list(set(optimize_vars))
    optimize_op_role_vars = list(set(optimize_op_role_vars))

    for var in optimize_vars:
        if var not in optimize_op_role_vars:
            optimize_need_delete_vars.append(var)
    need_delete_optimize_vars = list(set(optimize_need_delete_vars))

    init_ops = []
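    # collect the ops that initialize each variable slated for deletion,
    # so they can be removed from the block as well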
    for var in need_delete_optimize_vars:
        param_init_op = []
        for op in program.global_block().ops:
            if var in op.output_arg_names:
                param_init_op.append(op)
        init_ops.extend(param_init_op)
    delete_ops(program.global_block(), init_ops)

    for var in need_delete_optimize_vars:
        if program.global_block().has_var(var):
            program.global_block()._remove_var(var)

    return program
Example #3
    def _get_optimizer_op(self, param_name):
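        # return the optimizer op (one with both Param and LearningRate
        # inputs) that updates the given parameter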
        from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_optimize_ops

        opts = _get_optimize_ops(self.origin_main_program)
        for op in opts:
            if "Param" in op.input_names and \
                    "LearningRate" in op.input_names and op.input("Param")[0] == param_name:
                return op
Example #4
    def _build_pserver_programs(self, compiled_config):
        _main = fluid.Program()
        _startup = fluid.Program()

        from paddle.fluid.incubate.fleet.parameter_server.ir import pserver_pass as server

        if not compiled_config.is_geo_mode():

            from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_optimize_ops
            is_sgd_adam = False

            main_program = compiled_config.get_origin_main_program()
            ops = _get_optimize_ops(main_program)

            if len(ops) == 0:
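                # no optimizer ops in the origin program: nothing to build here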
                return _main, _startup

            from paddle.fluid.incubate.fleet.parameter_server.ir.public import _add_lr_decay_table_pass
            lr_decay_steps = self.user_defined_strategy.a_sync_configs[
                "lr_decay_steps"]
            _add_lr_decay_table_pass(main_program, compiled_config,
                                     lr_decay_steps)

            for op in ops:
                if op.type in ["sgd", "adam"]:
                    is_sgd_adam = True
                    break

            if is_sgd_adam:
                return _main, _startup

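            # assemble the pserver main program: listen_and_serv, the RPC
            # global flags, the optimizer blocks, then the large scale sparse
            # kernels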
            _main = server.add_listen_and_serv_pass(_main, compiled_config)
            _main = server.add_rpc_global_flags_pass(_main, compiled_config)
            _main = server.add_optimizer_pass(_main, compiled_config)
            _main = server.large_scale_sparse_pass(_main, _main,
                                                   compiled_config, False)
            _startup = server.build_pserver_startup_program_pass(
                _startup, _main, compiled_config)
            _startup = server.large_scale_sparse_pass(_startup, _main,
                                                      compiled_config, True)

            if not compiled_config.is_sync_mode():
                _main = server.delete_unused_in_main_pass(
                    _main, compiled_config)

            _startup = server.delete_unused_in_startup_pass(
                _startup, _main, compiled_config)
        else:
            _main = server.add_listen_and_serv_pass(_main, compiled_config)
            _main = server.add_rpc_global_flags_pass(_main, compiled_config)
            _main = server.add_geo_optimizer_pass(_main, compiled_config)
            _startup = server.build_pserver_startup_program_pass(
                _startup, _main, compiled_config)
            _startup = server.delete_unused_in_startup_pass(
                _startup, _main, compiled_config)

        return _main, _startup
Example #5
def delete_optimizer_pass(program, config):
    def _delete_optimizer_op_and_vars(_program, optimize_ops):
        optimize_vars = []
        optimize_op_role_vars = []
        optimize_need_delete_vars = []

        for op in optimize_ops:
            optimize_vars.extend(op.input_arg_names)
            optimize_op_role_vars.extend(op.attr("op_role_var"))

        optimize_vars = list(set(optimize_vars))
        optimize_op_role_vars = list(set(optimize_op_role_vars))

        for var in optimize_vars:
            if var not in optimize_op_role_vars:
                optimize_need_delete_vars.append(var)
        need_delete_optimize_vars = list(set(optimize_need_delete_vars))

        delete_ops(_program.global_block(), optimize_ops)
        for var in need_delete_optimize_vars:
            if _program.global_block().has_var(var):
                _program.global_block()._remove_var(var)

    def _add_lr_var(main_program, compiled_config):
        # Todo: hard code for pe
        lr_var = compiled_config.origin_main_program.global_block(
        ).vars["learning_rate_0"]
        main_program.global_block().create_var(name=lr_var.name,
                                               shape=lr_var.shape,
                                               dtype=lr_var.dtype,
                                               type=lr_var.type,
                                               lod_level=lr_var.lod_level,
                                               persistable=True)

    optimizer_ops = _get_optimize_ops(program)
    lr_ops = _get_lr_ops(program)
    optimizer_ops.extend(lr_ops)
    _delete_optimizer_op_and_vars(program, optimizer_ops)

    if hasattr(config.origin_main_program, 'lr_sheduler'):
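        # re-create the learning rate variable deleted with the optimizer ops
        # (the attribute really is spelled "lr_sheduler" in this Paddle version)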
        _add_lr_var(program, config)

    return program
Example #6
    def parse_by_optimizer(self, grad_name, is_sparse, total_dims,
                           compiled_strategy):
        from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_optimize_ops
        param_name = compiled_strategy.grad_name_to_param_name[grad_name]
        main_program, startup_program = compiled_strategy.get_origin_programs()
        pserver_id = compiled_strategy.get_role_id()
        pserver_num = len(compiled_strategy.get_ps_endpoints())
        optimizer_ops = _get_optimize_ops(main_program)
        oop = None

        for op in optimizer_ops:
            if ("Param" in op.input_names) and (op.input("Param")[0]
                                                == param_name):
                oop = op
                break

        if oop is None:
            raise ValueError("can not find optimizer for {}".format(grad_name))

        params = []
        dims = []
        attrs = []
        initializers = []

        self.trainer_num = compiled_strategy.get_trainers()

        if compiled_strategy.is_geo_mode():
            param_varnames = self.opt_input_map["sum"]
            attr_varnames = self.opt_attr_map["sum"]
            self.accessor_class = "sum"
        else:
            param_varnames = self.opt_input_map[oop.type]
            attr_varnames = self.opt_attr_map[oop.type]
            self.accessor_class = oop.type

        for (formal_name, shape) in param_varnames:
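            # record the slot name, the shape it takes on this pserver
            # (sharded for dense variables), and its initializer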
            params.append(formal_name)
            param = main_program.global_block().vars[oop.input(formal_name)[0]]
            if formal_name == "LearningRate" and param.name != "learning_rate_0":
                warnings.warn("will support decay soon")
                param = main_program.global_block().vars["learning_rate_0"]

            if shape is None:
                if is_sparse:
                    shape = total_dims
                else:
                    shape = self.get_shard(total_dims, pserver_num, pserver_id)
            dims.append(shape)

            initializer = self.get_initializer_attr(param.name,
                                                    startup_program)
            initializers.append(initializer)

        for (attr_varname, type_) in attr_varnames:
            value = oop.attr(attr_varname)
            attrs.append("&".join([attr_varname, type_, str(value)]))

        self.params = params
        self.dims = dims
        self.initializers = initializers
        self.attrs = attrs
Example #7
def add_optimizer_pass(program, config):
    def _append_pserver_grad_merge_ops(optimize_block, grad_varname_for_block,
                                       endpoint, grad_to_block_id):
        trainers = config.get_trainers()

        program = optimize_block.program
        pserver_block = program.global_block()
        grad_block = None

        for g in config.param_grad_ep_mapping[endpoint]["grads"]:
            if _orig_varname(g.name) == \
                    _orig_varname(grad_varname_for_block):
                grad_block = g
                break

        if not grad_block:
            # do not append this op if current endpoint
            # is not dealing with this grad block
            return None

        orig_varname, block_name, trainer_name = _get_varname_parts(
            grad_block.name)

        if block_name:
            merged_var_name = '.'.join([orig_varname, block_name])
        else:
            merged_var_name = orig_varname

        merged_var = pserver_block.create_var(
            name=grad_block.name,
            persistable=True,
            type=grad_block.type,
            dtype=grad_block.dtype,
            shape=grad_block.shape)

        grad_to_block_id.append(merged_var.name + ":" + str(optimize_block.idx))
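        # in sync mode each trainer pushes its own gradient copy; sum the
        # per-trainer variables and scale by 1/trainers to average them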
        if config.is_sync_mode() and trainers > 1:
            vars2merge = []
            for i in range(trainers):
                per_trainer_name = "%s.trainer_%d" % \
                                   (merged_var_name, i)
                per_trainer_var = pserver_block.create_var(
                    name=per_trainer_name,
                    persistable=False,
                    type=grad_block.type,
                    dtype=grad_block.dtype,
                    shape=grad_block.shape)
                vars2merge.append(per_trainer_var)

            optimize_block.append_op(
                type="sum",
                inputs={"X": vars2merge},
                outputs={"Out": merged_var},
                attrs={"use_mkldnn": False})
            optimize_block.append_op(
                type="scale",
                inputs={"X": merged_var},
                outputs={"Out": merged_var},
                attrs={"scale": 1.0 / float(trainers)})
        return merged_var

    origin_program = config.get_origin_main_program()
    origin_program = origin_program.clone()
    ps_endpoint = config.get_ps_endpoint()

    opt_op_on_pserver = []
    # Iterate over the ops; if an op belongs to the set of optimize ops
    # placed on the current pserver, append it to the sub-program.
    global_ops = []
    # sparse grad name to param name
    sparse_grad_to_param = []

    def _is_opt_op_on_pserver(endpoint, op):
        param_names = [
            p.name for p in config.param_grad_ep_mapping[endpoint]["params"]
        ]

        unmerged_varnames = []
        merged_varnames = []
        merged_ordernames = []

        for name in param_names:
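            # decide whether this parameter was kept as a single variable or
            # folded into a merged variable group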
            orig_varname = _orig_varname(name)

            for pairs in config.merged_variables_pairs:
                merged_p = pairs[0]
                if merged_p.merged_var.name == orig_varname:
                    if merged_p.merged_var.name == merged_p.ordered_vars[
                            0].name:
                        unmerged_varnames.append(merged_p.ordered_vars[0].name)
                    else:
                        merged_varnames.append(merged_p.merged_var.name)
                        merged_ordernames.append(merged_p.ordered_vars[0].name)
                    break

        param = op.input("Param")[0]

        if param in unmerged_varnames:
            return True

        for i in range(len(merged_ordernames)):
            if param == merged_ordernames[i]:
                merged_p = merged_varnames[i]
                merged_g = "{}@GRAD".format(merged_varnames[i])
                op._set_attr(OP_ROLE_VAR_ATTR_NAME, [merged_p, merged_g])
                return True
        return False

    def __append_optimize_op__(op, block, grad_to_block_id, merged_var, lr_ops):
        if _is_optimizer_op(op):
            _append_pserver_ops(block, op, ps_endpoint, grad_to_block_id,
                                origin_program, merged_var,
                                sparse_grad_to_param, config)
        elif op not in lr_ops:
            _append_pserver_non_opt_ops(block, op, origin_program, config)

    optimize_ops = _get_optimize_ops(origin_program)
    for _, op in enumerate(optimize_ops):
        if _is_optimizer_op(op) and _is_opt_op_on_pserver(ps_endpoint, op):
            opt_op_on_pserver.append(op)

    # append lr decay ops to a child block if any exist
    lr_ops = _get_lr_ops(origin_program)
    has_lr_decay = True if len(lr_ops) > 0 else False
    lr_decay_block_id = -1
    optimize_blocks = []

    if has_lr_decay:
        counter_increment_idx = -1
        for idx, op in enumerate(lr_ops):
            if op.type != 'increment':
                continue
            counter = op.input("X")[0]
            if counter == LEARNING_RATE_DECAY_COUNTER:
                counter_increment_idx = idx
                break

        if counter_increment_idx != -1:
            lr_ops.pop(counter_increment_idx)

        lr_decay_block = program._create_block(program.num_blocks - 1)
        optimize_blocks.append(lr_decay_block)
        for op in lr_ops:
            cloned_op = _append_pserver_non_opt_ops(lr_decay_block, op,
                                                    origin_program, config)
            # append sub blocks to pserver_program in lr_decay_op
            # todo(tangwei12): __clone_lr_op_sub_block__
        lr_decay_block_id = lr_decay_block.idx

    # append op to the current block
    grad_to_block_id = []
    pre_block_idx = program.num_blocks - 1

    for idx, opt_op in enumerate(opt_op_on_pserver):
        per_opt_block = program._create_block(pre_block_idx)
        optimize_blocks.append(per_opt_block)
        optimize_target_param_name = opt_op.attr(OP_ROLE_VAR_ATTR_NAME)[0]
        # append grad merging ops before clip and weight decay
        # e.g. merge grad -> L2Decay op -> clip op -> optimize
        merged_var = None
        for _, op in enumerate(optimize_ops):
            # find the original grad var before clipping / L2Decay;
            # merged_var should be the input var name of L2Decay
            grad_varname_for_block = op.attr(OP_ROLE_VAR_ATTR_NAME)[1]
            if op.attr(OP_ROLE_VAR_ATTR_NAME)[0] == optimize_target_param_name:
                merged_var = _append_pserver_grad_merge_ops(
                    per_opt_block, grad_varname_for_block, ps_endpoint,
                    grad_to_block_id)
                if merged_var:
                    break  # append optimize op once then append other ops.

        if merged_var:
            for _, op in enumerate(optimize_ops):
                # optimizer is connected to itself
                if op.attr(OP_ROLE_VAR_ATTR_NAME)[0] == optimize_target_param_name and \
                        op not in global_ops:
                    __append_optimize_op__(op, per_opt_block, grad_to_block_id,
                                           merged_var, lr_ops)

    # dedup the grad_to_block_id list
    grad_to_block_id = list(set(grad_to_block_id))
    # append global ops
    if global_ops:
        opt_state_block = program._create_block(program.num_blocks - 1)
        optimize_blocks.append(opt_state_block)
        for glb_op in global_ops:
            __append_optimize_op__(glb_op, opt_state_block, grad_to_block_id,
                                   None, lr_ops)

    if len(optimize_blocks) == 0:
        pre_block_idx = program.num_blocks - 1
        empty_block = program._create_block(pre_block_idx)
        optimize_blocks.append(empty_block)

    op = get_op_by_type(program.global_block(), "listen_and_serv")
    op._set_attr("optimize_blocks", optimize_blocks)
    op._set_attr("grad_to_block_id", grad_to_block_id)
    op._set_attr("sparse_grad_to_param", sparse_grad_to_param)
    op._set_attr("lr_decay_block_id", lr_decay_block_id)
    return program
Example #8
    def _get_optimizer_op(self, param_name):
        opts = public._get_optimize_ops(self._origin_main_program)
        for op in opts:
            if "Param" in op.input_names and \
                    "LearningRate" in op.input_names and op.input("Param")[0] == param_name:
                return op