Example #1
    def _get_pserver_grad_param_var(var, var_dict):
        """
        Return pserver side grad/param variable, return None
        if the variable is not grad/param, e.g.

            a@GRAD -> a@GRAD.block_0
            a@GRAD -> a@GRAD (a is not split)
            fc_0.w_0 -> fc_0.w_0.block_0
            fc_0.w_0 -> fc_0.w_0 (weight is not split)
            _generated_var_123 -> None
        """

        grad_block = None
        for _, g in six.iteritems(var_dict):
            if _orig_varname(g.name) == _orig_varname(var.name):
                # skip per-trainer vars
                if g.name.find(".trainer_") == -1:
                    # only params or grads have split blocks
                    ovar_name = _orig_varname(g.name)
                    if ovar_name in config.param_grad_ep_mapping:
                        grad_block = g
                        break
                    elif ovar_name in config.grad_param_mapping:
                        grad_block = g
                        break

        return grad_block
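
The docstring above assumes a naming convention where split blocks carry a ".block_N" suffix and per-trainer copies a ".trainer_N" suffix. The helper below is a minimal sketch of an _orig_varname-style function under that assumed convention; it is illustrative only and not the actual Paddle implementation.

def _orig_varname_sketch(varname):
    # Illustrative sketch: strip ".block_N" / ".trainer_N" suffixes to
    # recover the original variable name (assumed convention, not the
    # actual Paddle helper).
    for marker in (".block_", ".trainer_"):
        idx = varname.find(marker)
        if idx >= 0:
            return varname[:idx]
    return varname

# _orig_varname_sketch("a@GRAD.block_0")      -> "a@GRAD"
# _orig_varname_sketch("fc_0.w_0.trainer_1")  -> "fc_0.w_0"
# _orig_varname_sketch("fc_0.w_0")            -> "fc_0.w_0"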
Example #2
    def _append_pserver_grad_merge_ops(optimize_block, grad_varname_for_block,
                                       endpoint, grad_to_block_id):
        trainers = config.get_trainers()

        program = optimize_block.program
        pserver_block = program.global_block()
        grad_block = None

        for g in config.param_grad_ep_mapping[endpoint]["grads"]:
            if _orig_varname(g.name) == \
                    _orig_varname(grad_varname_for_block):
                grad_block = g
                break

        if not grad_block:
            # do not append this op if current endpoint
            # is not dealing with this grad block
            return None

        orig_varname, block_name, trainer_name = _get_varname_parts(
            grad_block.name)

        if block_name:
            merged_var_name = '.'.join([orig_varname, block_name])
        else:
            merged_var_name = orig_varname

        merged_var = pserver_block.create_var(
            name=grad_block.name,
            persistable=True,
            type=grad_block.type,
            dtype=grad_block.dtype,
            shape=grad_block.shape)

        grad_to_block_id.append(merged_var.name + ":" + str(optimize_block.idx))
        if config.is_sync_mode() and trainers > 1:
            vars2merge = []
            for i in range(trainers):
                per_trainer_name = "%s.trainer_%d" % \
                                   (merged_var_name, i)
                per_trainer_var = pserver_block.create_var(
                    name=per_trainer_name,
                    persistable=False,
                    type=grad_block.type,
                    dtype=grad_block.dtype,
                    shape=grad_block.shape)
                vars2merge.append(per_trainer_var)

            optimize_block.append_op(
                type="sum",
                inputs={"X": vars2merge},
                outputs={"Out": merged_var},
                attrs={"use_mkldnn": False})
            optimize_block.append_op(
                type="scale",
                inputs={"X": merged_var},
                outputs={"Out": merged_var},
                attrs={"scale": 1.0 / float(trainers)})
        return merged_var
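
In sync mode, the two ops appended above average the per-trainer copies of a gradient block: a sum over vars2merge followed by a scale of 1.0 / trainers. A minimal NumPy sketch of that arithmetic, with hypothetical values for three trainers:

import numpy as np

# Hypothetical per-trainer copies of the same gradient block.
grads = [np.array([0.3, 0.6]), np.array([0.0, 0.3]), np.array([0.6, 0.0])]

merged = np.sum(grads, axis=0)        # the "sum" op over X = vars2merge
merged = merged * (1.0 / len(grads))  # the "scale" op with scale = 1/trainers

print(merged)  # [0.3 0.3]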
Example #3
    def _is_opt_op_on_pserver(endpoint, op):
        param_names = [
            p.name for p in config.param_grad_ep_mapping[endpoint]["params"]
        ]

        unmerged_varnames = []
        merged_varnames = []
        merged_ordernames = []

        for name in param_names:
            orig_varname = _orig_varname(name)

            for pairs in config.merged_variables_pairs:
                merged_p = pairs[0]
                if merged_p.merged_var.name == orig_varname:
                    if merged_p.merged_var.name == merged_p.ordered_vars[
                            0].name:
                        unmerged_varnames.append(merged_p.ordered_vars[0].name)
                    else:
                        merged_varnames.append(merged_p.merged_var.name)
                        merged_ordernames.append(merged_p.ordered_vars[0].name)
                    break

        param = op.input("Param")[0]

        if param in unmerged_varnames:
            return True

        for i in range(len(merged_ordernames)):
            if param == merged_ordernames[i]:
                merged_p = merged_varnames[i]
                merged_g = "{}@GRAD".format(merged_varnames[i])
                op._set_attr(OP_ROLE_VAR_ATTR_NAME, [merged_p, merged_g])
                return True
        return False
Example #4
    def get_entry_attr(param_name):
        origin_name = _orig_varname(param_name)
        o_main_program = config.get_origin_main_program()
        for op in o_main_program.global_block().ops:
            if is_distributed_sparse_op(op) and get_sparse_tablename(
                    op) == origin_name:
                entry = op.attr("entry")
                return entry
Example #5
def get_distributed_from_listen_and_serv(program, origin_program):
    op = get_op_by_type(program.global_block(), "listen_and_serv")
    sparse_varnames = get_sparse_tablenames(origin_program, True)
    sparse_params = []
    grad_to_params = op.attr('sparse_grad_to_param')
    for grad_to_param in grad_to_params:
        _, param = grad_to_param.split(":")
        if _orig_varname(param) in sparse_varnames:
            sparse_params.append(param)
    return sparse_params
Example #6
def add_geo_optimizer_pass(program, config):
    endpoint = config.get_ps_endpoint()
    params = [p for p in config.param_grad_ep_mapping[endpoint]["params"]]

    sparse_tablenames = get_sparse_tablenames(config.get_origin_main_program(),
                                              False)

    for param in params:
        _clone_var(program.global_block(), param)

    optimize_block = []
    sparse_grad_to_param = []
    param_to_block_id = []
    pre_block_idx = program.num_blocks - 1

    for param in params:
        per_opt_block = program._create_block(pre_block_idx)
        optimize_block.append(per_opt_block)
        var_name = param.name
        pserver_block = per_opt_block.program.global_block()
        param = pserver_block.vars[var_name]

        delta_var_name = "%s.delta" % (param.name)
        origin_varname = _orig_varname(param.name)

        if origin_varname in sparse_tablenames:
            sparse_grad_to_param.append(":".join([delta_var_name, param.name]))

        delta_var = pserver_block.create_var(
            name=delta_var_name,
            persistable=False,
            type=param.type,
            dtype=param.dtype,
            shape=param.shape)

        per_opt_block.append_op(
            type="sum",
            inputs={"X": [param, delta_var]},
            outputs={"Out": param})

        param_to_block_id.append(delta_var_name + ":" + str(per_opt_block.idx))

    op = get_op_by_type(program.global_block(), "listen_and_serv")
    op._set_attr("optimize_blocks", optimize_block)
    op._set_attr("grad_to_block_id", param_to_block_id)
    op._set_attr("sparse_grad_to_param", sparse_grad_to_param)

    return program
Example #7
    def get_initializer_attrs(acture_value_names):
        l_sep = ","
        l_in = "&"
        init_attrs = []
        o_startup_program = config.get_origin_startup_program()

        for value_name in acture_value_names:
            origin_var_name = _orig_varname(value_name)
            for op in o_startup_program.global_block().ops:
                if op.type in opt_init_map.keys(
                ) and origin_var_name == op.output("Out")[0]:
                    init_attr = [op.type]
                    for attr in opt_init_map[op.type]:
                        init_attr.append(str(op.attr(attr)))
                    init_attrs.append(l_in.join(init_attr))
                    break

        return l_sep.join(init_attrs)
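
For illustration, suppose the origin startup program initializes one value with uniform_random (seed=0, min=-1.0, max=1.0) and another with fill_constant (value=0.0). With the opt_init_map shown in Example #10, the helper above serializes each initializer as an '&'-joined entry and joins the entries with commas, producing a string like (hypothetical values):

    uniform_random&0&-1.0&1.0,fill_constant&0.0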
Example #8
    def _get_param_block(opt_op):
        # the param var has already been created in the global program
        unmerged_vars = []
        merged_vars = []
        merged_ordervars = []

        param_vars = [
            p for p in config.param_grad_ep_mapping[endpoint]["params"]
        ]

        for var in param_vars:
            name = var.name
            orig_varname = _orig_varname(name)

            for pairs in config.merged_variables_pairs:
                merged_p = pairs[0]
                if merged_p.merged_var.name == orig_varname:
                    if merged_p.merged_var.name == merged_p.ordered_vars[
                            0].name:
                        unmerged_vars.append(merged_p.ordered_vars[0])
                    else:
                        merged_vars.append(merged_p.merged_var)
                        merged_ordervars.append(merged_p.ordered_vars[0])
                    break

        param_name = opt_op.input("Param")[0]

        for i in range(len(unmerged_vars)):
            if _same_or_split_var(param_name, unmerged_vars[i].name):
                for var in param_vars:
                    if _same_or_split_var(var.name, unmerged_vars[i].name):
                        return var

        for i in range(len(merged_ordervars)):
            if _same_or_split_var(param_name, merged_ordervars[i].name):
                for var in param_vars:
                    if _same_or_split_var(var.name, merged_vars[i].name):
                        return var
        return None
Example #9
def build_pserver_startup_program_pass(program, p_main_program, config):
    ps_endpoint = config.get_ps_endpoint()
    o_startup_program = config.get_origin_startup_program()
    program.random_seed = o_startup_program.random_seed
    params = config.param_grad_ep_mapping[ps_endpoint]["params"]
    merged_ordervars = []

    for var in params:
        name = var.name
        orig_varname = _orig_varname(name)

        for pairs in config.merged_variables_pairs:
            merged_p = pairs[0]
            if merged_p.merged_var.name == orig_varname:
                if merged_p.merged_var.name != merged_p.ordered_vars[0].name:
                    merged_ordervars.append(merged_p.ordered_vars[0])
                break

    def _get_splited_name_and_shape(varname):
        for splited_param in params:
            pname = splited_param.name
            if _same_or_split_var(pname, varname) and varname != pname:
                return pname, splited_param.shape

            for idx, ordered in enumerate(merged_ordervars):
                if _same_or_split_var(varname, ordered.name):
                    return pname, splited_param.shape

        return "", []

    # 1. clone vars from the pserver main program into the startup program
    pserver_vars = p_main_program.global_block().vars

    created_var_map = collections.OrderedDict()
    for _, var in six.iteritems(pserver_vars):
        tmpvar = program.global_block()._clone_variable(var)
        created_var_map[var.name] = tmpvar

    # 2. rename op outputs
    for op in o_startup_program.global_block().ops:
        new_outputs = collections.OrderedDict()
        # do not append startup op if var is not on this pserver
        op_on_pserver = False
        # TODO(gongwb) : remove this line.
        if op.type not in ["recv", "fetch_barrier", "concat"]:
            for key in op.output_names:
                newname, _ = _get_splited_name_and_shape(op.output(key)[0])
                if newname:
                    op_on_pserver = True
                    new_outputs[key] = created_var_map[newname]
                elif op.output(key)[0] in pserver_vars:
                    op_on_pserver = True
                    new_outputs[key] = pserver_vars[op.output(key)[0]]

        if op_on_pserver:
            # most startup program ops have no inputs
            new_inputs = _get_input_map_from_op(pserver_vars, op)

            if op.type in [
                    "gaussian_random", "fill_constant", "uniform_random",
                    "truncated_gaussian_random"
            ]:
                op._set_attr("shape", list(new_outputs["Out"].shape))

            program.global_block().append_op(
                type=op.type,
                inputs=new_inputs,
                outputs=new_outputs,
                attrs=op.all_attrs())

    return program
Example #10
def large_scale_sparse_pass(program, main_program, config, is_startup=False):
    opt_value_map = {}
    opt_value_map["sgd"] = ["Param"]
    opt_value_map["adam"] = ["Param", "Moment1", "Moment2"]
    opt_value_map["adagrad"] = ["Param", "Moment"]
    opt_value_map["adamax"] = ["Param", "Moment", "InfNorm"]
    opt_value_map["momentum"] = ["Param", "Velocity"]
    opt_value_map["lars_momentum"] = ["Param", "Velocity"]
    opt_value_map["rmsprop"] = ["Param", "Moment", "MeanSquare"]
    opt_value_map["decayed_adagrad"] = ["Param", "Moment"]
    opt_value_map["ftrl"] = ["Param", "SquaredAccumulator", "LinearAccumulator"]

    geo_value_map = {}
    geo_value_map["sum"] = "Param"

    opt_init_map = {}
    opt_init_map["gaussian_random"] = ["seed", "mean", "std"]
    opt_init_map["fill_constant"] = ["value"]
    opt_init_map["uniform_random"] = ["seed", "min", "max"]
    opt_init_map["truncated_gaussian_random"] = ["seed", "mean", "std"]

    def get_entry_attr(param_name):
        origin_name = _orig_varname(param_name)
        o_main_program = config.get_origin_main_program()
        for op in o_main_program.global_block().ops:
            if is_distributed_sparse_op(op) and get_sparse_tablename(
                    op) == origin_name:
                entry = op.attr("entry")
                return entry

    def get_initializer_attrs(acture_value_names):
        l_sep = ","
        l_in = "&"
        init_attrs = []
        o_startup_program = config.get_origin_startup_program()

        for value_name in acture_value_names:
            origin_var_name = _orig_varname(value_name)
            for op in o_startup_program.global_block().ops:
                if op.type in opt_init_map.keys(
                ) and origin_var_name == op.output("Out")[0]:
                    init_attr = [op.type]
                    for attr in opt_init_map[op.type]:
                        init_attr.append(str(op.attr(attr)))
                    init_attrs.append(l_in.join(init_attr))
                    break

        return l_sep.join(init_attrs)

    def get_optimizer_values(block):
        value_names = []
        acture_names = []
        value_dims = []
        grad = None
        opt_idx = -1
        fuse = False

        for op in block.ops:
            opt_idx += 1

            if op.type not in opt_value_map.keys():
                continue

            if op.type in ["sgd", "adam"]:
                fuse = True

            grad = main_program.global_block().vars[op.input("Grad")[0]]

            for value in opt_value_map[op.type]:
                var = main_program.global_block().vars[op.input(value)[0]]
                if len(var.shape) != 2:
                    raise ValueError("sparse param's dimension must be 2")

                value_names.append(value)
                value_dims.append(var.shape[1])
                acture_names.append(var.name)

            if value_names:
                break
        return grad, opt_idx, value_names, value_dims, acture_names, fuse

    def add_fuse_large_scale_op(block, global_block, table_name, value_names,
                                acture_names, grad, is_entry, opt_idx):

        op = block.ops[opt_idx]

        if op.type == "sgd":
            grad = main_program.global_block().vars[op.input("Grad")[0]]
            lr = main_program.global_block().vars[op.input("LearningRate")[0]]

            block._insert_op(
                opt_idx,
                type="lookup_sparse_table_fuse_sgd",
                inputs={"Grad": grad,
                        "LearningRate": lr},
                attrs={
                    "is_entry": is_entry,
                    "tablename": table_name,
                    "value_names": value_names
                })

        elif op.type == "adam":
            grad = main_program.global_block().vars[op.input("Grad")[0]]
            lr = main_program.global_block().vars[op.input("LearningRate")[0]]
            beta1_pow = main_program.global_block().vars[op.input("Beta1Pow")[
                0]]
            beta2_pow = main_program.global_block().vars[op.input("Beta2Pow")[
                0]]
            beta1_pow_o = main_program.global_block().vars[op.output(
                "Beta1PowOut")[0]]
            beta2_pow_o = main_program.global_block().vars[op.output(
                "Beta2PowOut")[0]]

            beta1 = op.attr('beta1')
            beta2 = op.attr('beta2')
            epsilon = op.attr('epsilon')

            block._insert_op(
                opt_idx,
                type="lookup_sparse_table_fuse_adam",
                inputs={
                    "Grad": grad,
                    "LearningRate": lr,
                    "Beta1Pow": beta1_pow,
                    "Beta2Pow": beta2_pow
                },
                outputs={
                    "Beta1PowOut": beta1_pow_o,
                    "Beta2PowOut": beta2_pow_o
                },
                attrs={
                    "beta1": beta1,
                    "beta2": beta2,
                    "epsilon": epsilon,
                    "is_entry": is_entry,
                    "tablename": table_name,
                    "value_names": value_names
                })
        else:
            raise ValueError("only support sgd/adam optimizer now")

    def add_large_scale_op(block, global_block, table_name, value_names,
                           acture_names, grad, is_entry, opt_idx):
        ids = global_block.create_var(
            name="kSparseIDs@{}".format(table_name),
            persistable=False,
            dtype="int64",
            shape=[1, 1],
            lod_level=0)

        # insert an op that splits the grad into sparse ids and values
        block._insert_op(
            opt_idx,
            type="lookup_sparse_table_grad_split",
            inputs={"Grad": grad},
            outputs={"Row": ids,
                     "Value": grad},
            attrs={"tablename": table_name,
                   "is_entry": is_entry})

        # insert the table read op first
        vars = [global_block.vars[acture_name] for acture_name in acture_names]
        block._insert_op(
            opt_idx + 1,
            type="lookup_sparse_table_read",
            inputs={"Ids": ids},
            outputs={"Out": vars},
            attrs={"tablename": table_name,
                   "value_names": value_names})

        # append the table write op last
        inputs = {"Ids": ids, "In": vars}

        block.append_op(
            type="lookup_sparse_table_write",
            inputs=inputs,
            outputs={},
            attrs={"tablename": table_name,
                   "value_names": value_names})

    op = get_op_by_type(main_program.global_block(), "listen_and_serv")

    param_blockid_map = {}
    grad_blockid_map = {}
    grad_to_params = op.attr('sparse_grad_to_param')
    grad_to_block_ids = op.attr('grad_to_block_id')

    origin_program = config.get_origin_main_program()
    sparse_varnames = get_sparse_tablenames(origin_program, False)

    for grad_to_block_id in grad_to_block_ids:
        grad, blockid = grad_to_block_id.split(":")
        grad_blockid_map[grad] = int(blockid)

    for grad_to_param in grad_to_params:
        grad, param = grad_to_param.split(":")

        if _orig_varname(param) in sparse_varnames:
            continue

        param_blockid_map[param] = grad_blockid_map[grad]

    if not is_startup:
        for param, blockid in param_blockid_map.items():
            opt_block = program.block(blockid)

            grad, opt_idx, value_names, value_dims, acture_names, fuse = \
                get_optimizer_values(opt_block)

            entry_attr = get_entry_attr(param)
            is_entry = False if entry_attr == "none" else True

            if fuse:
                add_fuse_large_scale_op(opt_block,
                                        program.global_block(), param,
                                        value_names, acture_names, grad,
                                        is_entry, opt_idx)
            else:
                add_large_scale_op(opt_block,
                                   program.global_block(), param, value_names,
                                   acture_names, grad, is_entry, opt_idx)
    else:
        large_scale_kv_metas = []
        for param, blockid in param_blockid_map.items():
            opt_block = main_program.block(blockid)

            grad, opt_idx, value_names, value_dims, acture_names, fuse = \
                get_optimizer_values(opt_block)

            entry_attr = get_entry_attr(param)

            if fuse:
                # remove the original optimizer op
                opt_block._remove_op(opt_idx)

            # training/infer
            mode = "0"
            names_str = ",".join(value_names)
            dims_str = ",".join([str(dim) for dim in value_dims])
            ids_name = "kSparseIDs@{}".format(param)
            cached_str = ",".join(acture_names + [ids_name])
            init_attr_str = get_initializer_attrs(acture_names)

            meta_str = ":".join([
                param, names_str, dims_str, mode, grad.name, cached_str,
                init_attr_str, entry_attr
            ])
            print("large_scale_metas: {}".format(meta_str))
            large_scale_kv_metas.append(meta_str)

        program.global_block().append_op(
            type="lookup_sparse_table_init",
            inputs=None,
            outputs=None,
            attrs={"large_scale_metas": large_scale_kv_metas})

    # TODO: delete unused vars.
    return program
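
For reference, each entry appended to large_scale_kv_metas in the startup branch above is a ':'-joined record of param name, value names, value dims, mode, grad name, cached var names (including the kSparseIDs var), initializer attrs, and entry attr. A hypothetical Adam entry for an embedding table named emb with width 64 might look like:

    emb:Param,Moment1,Moment2:64,64,64:0:emb@GRAD:emb,emb_moment1_0,emb_moment2_0,kSparseIDs@emb:uniform_random&0&-1.0&1.0,fill_constant&0.0,fill_constant&0.0:none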