def get_distributed_from_listen_and_serv(program, origin_program):
    op = get_op_by_type(program.global_block(), "listen_and_serv")
    sparse_varnames = get_sparse_tablenames(origin_program, True)
    sparse_params = []
    grad_to_params = op.attr('sparse_grad_to_param')
    for grad_to_param in grad_to_params:
        _, param = grad_to_param.split(":")
        if _orig_varname(param) in sparse_varnames:
            sparse_params.append(param)
    return sparse_params
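# A minimal, framework-free sketch of the "grad:param" string format that
# get_distributed_from_listen_and_serv() parses above. The names below are
# hypothetical examples, not values produced by a real program.
def _demo_parse_sparse_grad_to_param():
    sparse_grad_to_param = ["emb@GRAD:emb", "fc_w@GRAD:fc_w"]
    sparse_varnames = {"emb"}  # pretend only "emb" is a distributed sparse table
    sparse_params = []
    for entry in sparse_grad_to_param:
        _, param = entry.split(":")
        if param in sparse_varnames:  # stands in for _orig_varname(param) lookup
            sparse_params.append(param)
    return sparse_params  # -> ["emb"]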
def _save_sparse_params(self, executor, dirname, context, main_program, mode):
    from paddle.fluid.incubate.fleet.parameter_server.ir.public import get_sparse_tablenames
    distributed_varnames = get_sparse_tablenames(
        self.compiled_strategy.origin_main_program, True)
    values = []
    for id, names in context.items():
        if names[0] not in distributed_varnames:
            # only save sparse param to local
            self._worker.recv_and_save_model(id, dirname)
        # save sparse & distributed param on server
        self._worker.save_one_model(id, dirname, mode)
        values.extend(names)
    return values
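# A hypothetical walk-through of the save decision above: `context` maps a
# table id to its variable names; tables whose names appear in
# `distributed_varnames` are saved only on the server, everything else is
# additionally pulled to local disk first. Illustrative values only.
def _demo_save_decision():
    context = {0: ["emb"], 1: ["fc_w"]}
    distributed_varnames = ["emb"]
    local_saves, server_saves = [], []
    for table_id, names in context.items():
        if names[0] not in distributed_varnames:
            local_saves.append(table_id)   # recv_and_save_model(...)
        server_saves.append(table_id)      # save_one_model(...)
    return local_saves, server_saves       # -> ([1], [0, 1])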
def get_sparse_attrs():
    opt_init_map = {}
    opt_init_map["gaussian_random"] = ["seed", "mean", "std"]
    opt_init_map["fill_constant"] = ["value"]
    opt_init_map["uniform_random"] = ["seed", "min", "max"]
    opt_init_map["truncated_gaussian_random"] = ["seed", "mean", "std"]

    dist_varnames = get_sparse_tablenames(self.origin_main_program, True)
    sparse_varnames = get_sparse_tablenames(self.origin_main_program, False)

    if len(dist_varnames) != 0:
        raise ValueError(
            "GeoStrategy cannot support large scale embedding now, please use fluid.layers.embedding"
        )

    init_attrs = []
    for value_name in sparse_varnames:
        value_var = self.origin_main_program.global_block().vars[value_name]
        value_attr = [
            value_name,
            ",".join([str(dim) for dim in value_var.shape])
        ]
        for op in self.origin_startup_program.global_block().ops:
            if op.type in opt_init_map and value_name == op.output("Out")[0]:
                init_attr = [op.type]
                for attr in opt_init_map[op.type]:
                    init_attr.append(str(op.attr(attr)))
                value_attr.append("&".join(init_attr))
                init_attrs.append(":".join(value_attr))
                break
    return "#".join(init_attrs)
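# The string get_sparse_attrs() builds is positional: one "#"-separated entry
# per sparse table, each entry being "name:dims:init_op&attr&...". A sketch
# with made-up values, to illustrate the layout only:
def _demo_sparse_attr_string():
    # table "emb" with shape [10000, 8], initialized by uniform_random
    # with seed=0, min=-1.0, max=1.0
    value_attr = [
        "emb",
        "10000,8",
        "&".join(["uniform_random", "0", "-1.0", "1.0"]),
    ]
    return ":".join(value_attr)  # -> "emb:10000,8:uniform_random&0&-1.0&1.0"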
def add_geo_optimizer_pass(program, config):
    endpoint = config.get_ps_endpoint()
    params = [p for p in config.param_grad_ep_mapping[endpoint]["params"]]
    sparse_tablenames = get_sparse_tablenames(config.get_origin_main_program(),
                                              False)

    for param in params:
        _clone_var(program.global_block(), param)

    optimize_block = []
    sparse_grad_to_param = []
    param_to_block_id = []
    pre_block_idx = program.num_blocks - 1

    for param in params:
        per_opt_block = program._create_block(pre_block_idx)
        optimize_block.append(per_opt_block)
        var_name = param.name
        pserver_block = per_opt_block.program.global_block()
        param = pserver_block.vars[var_name]

        delta_var_name = "%s.delta" % (param.name)
        origin_varname = _orig_varname(param.name)

        if origin_varname in sparse_tablenames:
            sparse_grad_to_param.append(":".join(
                [delta_var_name, param.name]))

        delta_var = pserver_block.create_var(
            name=delta_var_name,
            persistable=False,
            type=param.type,
            dtype=param.dtype,
            shape=param.shape)

        # apply the received delta: param = param + delta
        per_opt_block.append_op(
            type="sum",
            inputs={"X": [param, delta_var]},
            outputs={"Out": param})

        param_to_block_id.append(delta_var_name + ":" + str(per_opt_block.idx))

    op = get_op_by_type(program.global_block(), "listen_and_serv")
    op._set_attr("optimize_blocks", optimize_block)
    op._set_attr("grad_to_block_id", param_to_block_id)
    op._set_attr("sparse_grad_to_param", sparse_grad_to_param)
    return program
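# In the GEO pass above, each parameter gets a "<param>.delta" companion and a
# per-parameter optimize block; listen_and_serv then routes every received
# delta to its block via "<delta_name>:<block_id>" strings. A hypothetical
# instance of that mapping string (block index made up):
def _demo_geo_block_mapping():
    delta_var_name = "emb.delta"
    block_idx = 2
    return delta_var_name + ":" + str(block_idx)  # -> "emb.delta:2"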
def _init_server(self, dirname=None, var_names=None, **kwargs):
    if self.role_maker._is_heter_worker():
        self._init_heter_worker()
        return
    role_id = self.compiled_strategy.get_role_id()
    endpoints = self.compiled_strategy.get_ps_endpoints()
    is_sync = self.compiled_strategy.is_sync_mode()
    trainers = self.compiled_strategy.get_trainers()

    server = self._get_fleet_proto(is_server=True, is_sync=is_sync)
    proto_txt = str(server)

    debug = bool(int(os.getenv("PSERVER_DEBUG", "0")))
    if debug:
        print("server: \n{}".format(proto_txt))

    string_hosts = []
    for idx, ep in enumerate(endpoints):
        host, port = ep.split(":")
        pshost = fluid.core.PSHost(host, int(port), idx)
        string_hosts.append(pshost.serialize_to_string())

    self._server = fluid.core.DistFleetWrapper()
    self._server.init_server(proto_txt, string_hosts, role_id, trainers,
                             self._server_sub_program)

    from paddle.fluid.incubate.fleet.parameter_server.ir.public import get_sparse_tablenames

    dist_varnames = get_sparse_tablenames(self.origin_main_program, True)
    sparse_varnames = get_sparse_tablenames(self.origin_main_program, False)

    distributed_varnames = dist_varnames + sparse_varnames

    if var_names is None:
        load_varnames = distributed_varnames
    else:
        for var_name in var_names:
            if var_name not in distributed_varnames:
                raise ValueError(
                    "fleet.init server can only load sparse variables in {}".
                    format(distributed_varnames))
        load_varnames = var_names

    if dirname is None or not load_varnames:
        return

    sparse_table_maps = {}
    for table in server.servers[0].tables:
        if table.type == "PS_SPARSE_TABLE" and table.common is not None:
            sparse_table_maps[table.common.table_name] = table.id

    dirname = os.path.normpath(dirname)
    pserver_id = self.role_maker._role_id()

    import time
    begin = time.time()
    for var_name in load_varnames:
        table_id = sparse_table_maps[var_name]
        path = os.path.join(dirname, var_name + PSERVER_SAVE_SUFFIX,
                            "{}.block{}.txt".format(var_name, pserver_id))
        meta = os.path.join(dirname, var_name + PSERVER_SAVE_SUFFIX,
                            "{}.block{}.meta".format(var_name, pserver_id))
        self._server.load_sparse(path, meta, table_id)
    end = time.time()
    print("init sparse variables: {} cost time: {}".format(load_varnames,
                                                           end - begin))
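# _init_server() expects each sparse table to have been saved in a per-shard
# text layout. A sketch of the paths it reconstructs, assuming the suffix
# "_txt" stands in for PSERVER_SAVE_SUFFIX (hypothetical; use the real
# module constant):
import os

def _demo_sparse_load_paths(dirname="/models/ps", var_name="emb", pserver_id=0):
    suffix = "_txt"  # placeholder for PSERVER_SAVE_SUFFIX
    path = os.path.join(dirname, var_name + suffix,
                        "{}.block{}.txt".format(var_name, pserver_id))
    meta = os.path.join(dirname, var_name + suffix,
                        "{}.block{}.meta".format(var_name, pserver_id))
    # -> ("/models/ps/emb_txt/emb.block0.txt", "/models/ps/emb_txt/emb.block0.meta")
    return path, meta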
def _get_sparse_table_names():
    dist_varnames = get_sparse_tablenames(origin_program, True)
    sparse_varnames = get_sparse_tablenames(origin_program, False)
    return list(set(dist_varnames + sparse_varnames))
def large_scale_sparse_pass(program, main_program, config, is_startup=False):
    opt_value_map = {}
    opt_value_map["sgd"] = ["Param"]
    opt_value_map["adam"] = ["Param", "Moment1", "Moment2"]
    opt_value_map["adagrad"] = ["Param", "Moment"]
    opt_value_map["adamax"] = ["Param", "Moment", "InfNorm"]
    opt_value_map["momentum"] = ["Param", "Velocity"]
    opt_value_map["lars_momentum"] = ["Param", "Velocity"]
    opt_value_map["rmsprop"] = ["Param", "Moment", "MeanSquare"]
    opt_value_map["decayed_adagrad"] = ["Param", "Moment"]
    opt_value_map["ftrl"] = ["Param", "SquaredAccumulator", "LinearAccumulator"]

    geo_value_map = {}
    geo_value_map["sum"] = "Param"

    opt_init_map = {}
    opt_init_map["gaussian_random"] = ["seed", "mean", "std"]
    opt_init_map["fill_constant"] = ["value"]
    opt_init_map["uniform_random"] = ["seed", "min", "max"]
    opt_init_map["truncated_gaussian_random"] = ["seed", "mean", "std"]

    def get_entry_attr(param_name):
        origin_name = _orig_varname(param_name)
        o_main_program = config.get_origin_main_program()
        for op in o_main_program.global_block().ops:
            if is_distributed_sparse_op(op) and get_sparse_tablename(
                    op) == origin_name:
                entry = op.attr("entry")
                return entry

    def get_initializer_attrs(acture_value_names):
        l_sep = ","
        l_in = "&"
        init_attrs = []
        o_startup_program = config.get_origin_startup_program()

        for value_name in acture_value_names:
            origin_var_name = _orig_varname(value_name)
            for op in o_startup_program.global_block().ops:
                if op.type in opt_init_map and origin_var_name == op.output(
                        "Out")[0]:
                    init_attr = [op.type]
                    for attr in opt_init_map[op.type]:
                        init_attr.append(str(op.attr(attr)))
                    init_attrs.append(l_in.join(init_attr))
                    break
        return l_sep.join(init_attrs)

    def get_optimizer_values(block):
        value_names = []
        acture_names = []
        value_dims = []
        grad = None
        opt_idx = -1
        fuse = False

        for op in block.ops:
            opt_idx += 1
            if op.type not in opt_value_map.keys():
                continue
            if op.type in ["sgd", "adam"]:
                fuse = True

            grad = main_program.global_block().vars[op.input("Grad")[0]]

            for value in opt_value_map[op.type]:
                var = main_program.global_block().vars[op.input(value)[0]]
                if len(var.shape) != 2:
                    raise ValueError("sparse param's dimension must be 2")
                value_names.append(value)
                value_dims.append(var.shape[1])
                acture_names.append(var.name)

            if value_names:
                break
        return grad, opt_idx, value_names, value_dims, acture_names, fuse

    def add_fuse_large_scale_op(block, global_block, table_name, value_names,
                                acture_names, grad, is_entry, opt_idx):
        op = block.ops[opt_idx]
        if op.type == "sgd":
            grad = main_program.global_block().vars[op.input("Grad")[0]]
            lr = main_program.global_block().vars[op.input("LearningRate")[0]]
            block._insert_op(
                opt_idx,
                type="lookup_sparse_table_fuse_sgd",
                inputs={"Grad": grad,
                        "LearningRate": lr},
                attrs={
                    "is_entry": is_entry,
                    "tablename": table_name,
                    "value_names": value_names
                })
        elif op.type == "adam":
            grad = main_program.global_block().vars[op.input("Grad")[0]]
            lr = main_program.global_block().vars[op.input("LearningRate")[0]]
            beta1_pow = main_program.global_block().vars[op.input("Beta1Pow")[
                0]]
            beta2_pow = main_program.global_block().vars[op.input("Beta2Pow")[
                0]]
            beta1_pow_o = main_program.global_block().vars[op.output(
                "Beta1PowOut")[0]]
            beta2_pow_o = main_program.global_block().vars[op.output(
                "Beta2PowOut")[0]]
            beta1 = op.attr('beta1')
            beta2 = op.attr('beta2')
            epsilon = op.attr('epsilon')
            block._insert_op(
                opt_idx,
                type="lookup_sparse_table_fuse_adam",
                inputs={
                    "Grad": grad,
                    "LearningRate": lr,
                    "Beta1Pow": beta1_pow,
                    "Beta2Pow": beta2_pow
                },
                outputs={
                    "Beta1PowOut": beta1_pow_o,
                    "Beta2PowOut": beta2_pow_o
                },
                attrs={
                    "beta1": beta1,
                    "beta2": beta2,
                    "epsilon": epsilon,
                    "is_entry": is_entry,
                    "tablename": table_name,
                    "value_names": value_names
                })
        else:
            raise ValueError("only support sgd/adam optimizer now")

    def add_large_scale_op(block, global_block, table_name, value_names,
                           acture_names, grad, is_entry, opt_idx):
        ids = global_block.create_var(
            name="kSparseIDs@{}".format(table_name),
            persistable=False,
            dtype="int64",
            shape=[1, 1],
            lod_level=0)

        # insert grad split to ids and tensor op
        block._insert_op(
            opt_idx,
            type="lookup_sparse_table_grad_split",
            inputs={"Grad": grad},
            outputs={"Row": ids,
                     "Value": grad},
            attrs={"tablename": table_name,
                   "is_entry": is_entry})

        # insert read at first
        vars = [global_block.vars[acture_name] for acture_name in acture_names]
        block._insert_op(
            opt_idx + 1,
            type="lookup_sparse_table_read",
            inputs={"Ids": ids},
            outputs={"Out": vars},
            attrs={"tablename": table_name,
                   "value_names": value_names})

        # append write at last
        inputs = {"Ids": ids, "In": vars}
        block.append_op(
            type="lookup_sparse_table_write",
            inputs=inputs,
            outputs={},
            attrs={"tablename": table_name,
                   "value_names": value_names})

    op = get_op_by_type(main_program.global_block(), "listen_and_serv")

    param_blockid_map = {}
    grad_blockid_map = {}
    grad_to_params = op.attr('sparse_grad_to_param')
    grad_to_block_ids = op.attr('grad_to_block_id')

    origin_program = config.get_origin_main_program()
    sparse_varnames = get_sparse_tablenames(origin_program, False)

    for grad_to_block_id in grad_to_block_ids:
        grad, blockid = grad_to_block_id.split(":")
        grad_blockid_map[grad] = int(blockid)

    for grad_to_param in grad_to_params:
        grad, param = grad_to_param.split(":")
        if _orig_varname(param) in sparse_varnames:
            continue
        param_blockid_map[param] = grad_blockid_map[grad]

    if not is_startup:
        for param, blockid in param_blockid_map.items():
            opt_block = program.block(blockid)
            grad, opt_idx, value_names, value_dims, acture_names, fuse = \
                get_optimizer_values(opt_block)

            entry_attr = get_entry_attr(param)
            is_entry = False if entry_attr == "none" else True

            if fuse:
                add_fuse_large_scale_op(opt_block, program.global_block(),
                                        param, value_names, acture_names,
                                        grad, is_entry, opt_idx)
            else:
                add_large_scale_op(opt_block, program.global_block(), param,
                                   value_names, acture_names, grad, is_entry,
                                   opt_idx)
    else:
        large_scale_kv_metas = []
        for param, blockid in param_blockid_map.items():
            opt_block = main_program.block(blockid)
            grad, opt_idx, value_names, value_dims, acture_names, fuse = \
                get_optimizer_values(opt_block)

            entry_attr = get_entry_attr(param)

            if fuse:
                # remove origin optimizer op
                opt_block._remove_op(opt_idx)

            # training/infer
            mode = "0"
            names_str = ",".join(value_names)
            dims_str = ",".join([str(dim) for dim in value_dims])
            ids_name = "kSparseIDs@{}".format(param)
            cached_str = ",".join(acture_names + [ids_name])
            init_attr_str = get_initializer_attrs(acture_names)

            meta_str = ":".join([
                param, names_str, dims_str, mode, grad.name, cached_str,
                init_attr_str, entry_attr
            ])
            print("large_scale_metas: {}".format(meta_str))
            large_scale_kv_metas.append(meta_str)

        program.global_block().append_op(
            type="lookup_sparse_table_init",
            inputs=None,
            outputs=None,
            attrs={"large_scale_metas": large_scale_kv_metas})

    # TODO: delete unused vars.
    return program