def delete_optimizer_pass(program, config):
    def _delete_optimizer_op_and_vars(_program, optimize_ops):
        optimize_vars = []
        optimize_op_role_vars = []
        optimize_need_delete_vars = []

        for op in optimize_ops:
            optimize_vars.extend(op.input_arg_names)
            optimize_op_role_vars.extend(op.attr("op_role_var"))

        optimize_vars = list(set(optimize_vars))
        optimize_op_role_vars = list(set(optimize_op_role_vars))

        for var in optimize_vars:
            if var not in optimize_op_role_vars:
                optimize_need_delete_vars.append(var)
        need_delete_optimize_vars = list(set(optimize_need_delete_vars))

        delete_ops(_program.global_block(), optimize_ops)
        for var in need_delete_optimize_vars:
            if _program.global_block().has_var(var):
                _program.global_block()._remove_var(var)

    optimizer_ops = _get_optimize_ops(program)
    lr_ops = _get_lr_ops(program)
    optimizer_ops.extend(lr_ops)
    _delete_optimizer_op_and_vars(program, optimizer_ops)

    return program
def delet_extra_optimizes_pass(program, config):
    optimize_vars = []
    optimize_op_role_vars = []
    optimize_need_delete_vars = []

    origin_program = config.get_origin_main_program()
    for op in _get_optimize_ops(origin_program):
        optimize_vars.extend(op.input_arg_names)
        optimize_op_role_vars.extend(op.attr("op_role_var"))

    optimize_vars = list(set(optimize_vars))
    optimize_op_role_vars = list(set(optimize_op_role_vars))

    for var in optimize_vars:
        if var not in optimize_op_role_vars:
            optimize_need_delete_vars.append(var)
    need_delete_optimize_vars = list(set(optimize_need_delete_vars))

    init_ops = []
    for var in need_delete_optimize_vars:
        param_init_op = []
        for op in program.global_block().ops:
            if var in op.output_arg_names:
                param_init_op.append(op)
        init_ops.extend(param_init_op)
    delete_ops(program.global_block(), init_ops)

    for var in need_delete_optimize_vars:
        if program.global_block().has_var(var):
            program.global_block()._remove_var(var)

    return program
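# Illustrative sketch only, not part of the passes above: the two trainer-side
# passes are typically chained on cloned main/startup programs. The helper name,
# the clone step, and the `compiled_config` argument are assumptions made for
# illustration; only delete_optimizer_pass / delet_extra_optimizes_pass come
# from the code above.
def _strip_optimizer_for_trainer(main_program, startup_program, compiled_config):
    # remove optimizer/lr ops and their now-unused input vars from the main program
    main_program = delete_optimizer_pass(main_program.clone(), compiled_config)
    # remove the matching initialization ops and vars from the startup program
    startup_program = delet_extra_optimizes_pass(startup_program.clone(),
                                                 compiled_config)
    return main_program, startup_program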
def _get_optimizer_op(self, param_name):
    from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_optimize_ops

    opts = _get_optimize_ops(self.origin_main_program)
    for op in opts:
        if "Param" in op.input_names and \
                "LearningRate" in op.input_names and \
                op.input("Param")[0] == param_name:
            return op
def _build_pserver_programs(self, compiled_config):
    _main = fluid.Program()
    _startup = fluid.Program()

    from paddle.fluid.incubate.fleet.parameter_server.ir import pserver_pass as server

    if not compiled_config.is_geo_mode():
        from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_optimize_ops

        is_sgd_adam = False

        main_program = compiled_config.get_origin_main_program()
        ops = _get_optimize_ops(main_program)

        if len(ops) == 0:
            return _main, _startup

        from paddle.fluid.incubate.fleet.parameter_server.ir.public import _add_lr_decay_table_pass
        lr_decay_steps = self.user_defined_strategy.a_sync_configs[
            "lr_decay_steps"]
        _add_lr_decay_table_pass(main_program, compiled_config,
                                 lr_decay_steps)

        for op in ops:
            if op.type in ["sgd", "adam"]:
                is_sgd_adam = True
                break

        if is_sgd_adam:
            return _main, _startup

        _main = server.add_listen_and_serv_pass(_main, compiled_config)
        _main = server.add_rpc_global_flags_pass(_main, compiled_config)
        _main = server.add_optimizer_pass(_main, compiled_config)
        _main = server.large_scale_sparse_pass(_main, _main,
                                               compiled_config, False)
        _startup = server.build_pserver_startup_program_pass(
            _startup, _main, compiled_config)
        _startup = server.large_scale_sparse_pass(_startup, _main,
                                                  compiled_config, True)

        if not compiled_config.is_sync_mode():
            _main = server.delete_unused_in_main_pass(_main,
                                                      compiled_config)

        _startup = server.delete_unused_in_startup_pass(_startup, _main,
                                                        compiled_config)
    else:
        _main = server.add_listen_and_serv_pass(_main, compiled_config)
        _main = server.add_rpc_global_flags_pass(_main, compiled_config)
        _main = server.add_geo_optimizer_pass(_main, compiled_config)
        _startup = server.build_pserver_startup_program_pass(
            _startup, _main, compiled_config)
        _startup = server.delete_unused_in_startup_pass(_startup, _main,
                                                        compiled_config)

    return _main, _startup
def delete_optimizer_pass(program, config):
    def _delete_optimizer_op_and_vars(_program, optimize_ops):
        optimize_vars = []
        optimize_op_role_vars = []
        optimize_need_delete_vars = []

        for op in optimize_ops:
            optimize_vars.extend(op.input_arg_names)
            optimize_op_role_vars.extend(op.attr("op_role_var"))

        optimize_vars = list(set(optimize_vars))
        optimize_op_role_vars = list(set(optimize_op_role_vars))

        for var in optimize_vars:
            if var not in optimize_op_role_vars:
                optimize_need_delete_vars.append(var)
        need_delete_optimize_vars = list(set(optimize_need_delete_vars))

        delete_ops(_program.global_block(), optimize_ops)
        for var in need_delete_optimize_vars:
            if _program.global_block().has_var(var):
                _program.global_block()._remove_var(var)

    def _add_lr_var(main_program, compiled_config):
        # Todo: hard code for pe
        lr_var = compiled_config.origin_main_program.global_block().vars[
            "learning_rate_0"]
        main_program.global_block().create_var(
            name=lr_var.name,
            shape=lr_var.shape,
            dtype=lr_var.dtype,
            type=lr_var.type,
            lod_level=lr_var.lod_level,
            persistable=True)

    optimizer_ops = _get_optimize_ops(program)
    lr_ops = _get_lr_ops(program)
    optimizer_ops.extend(lr_ops)
    _delete_optimizer_op_and_vars(program, optimizer_ops)

    if hasattr(config.origin_main_program, 'lr_sheduler'):
        _add_lr_var(program, config)

    return program
def parse_by_optimizer(self, grad_name, is_sparse, total_dims,
                       compiled_strategy):
    from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_optimize_ops

    param_name = compiled_strategy.grad_name_to_param_name[grad_name]
    main_program, startup_program = compiled_strategy.get_origin_programs()
    pserver_id = compiled_strategy.get_role_id()
    pserver_num = len(compiled_strategy.get_ps_endpoints())
    optimizer_ops = _get_optimize_ops(main_program)

    oop = None
    for op in optimizer_ops:
        if ("Param" in op.input_names) and (
                op.input("Param")[0] == param_name):
            oop = op
            break

    if oop is None:
        raise ValueError("can not find optimizer for {}".format(grad_name))

    params = []
    dims = []
    attrs = []
    initializers = []

    self.trainer_num = compiled_strategy.get_trainers()

    if compiled_strategy.is_geo_mode():
        param_varnames = self.opt_input_map["sum"]
        attr_varnames = self.opt_attr_map["sum"]
        self.accessor_class = "sum"
    else:
        param_varnames = self.opt_input_map[oop.type]
        attr_varnames = self.opt_attr_map[oop.type]
        self.accessor_class = oop.type

    for (formal_name, shape) in param_varnames:
        params.append(formal_name)
        param = main_program.global_block().vars[oop.input(formal_name)[0]]
        if formal_name == "LearningRate" and param.name != "learning_rate_0":
            warnings.warn("will support decay soon")
            param = main_program.global_block().vars["learning_rate_0"]

        if shape is None:
            if is_sparse:
                shape = total_dims
            else:
                shape = self.get_shard(total_dims, pserver_num, pserver_id)
        dims.append(shape)

        initializer = self.get_initializer_attr(param.name, startup_program)
        initializers.append(initializer)

    for (attr_varname, type_) in attr_varnames:
        value = oop.attr(attr_varname)
        attrs.append("&".join([attr_varname, type_, str(value)]))

    self.params = params
    self.dims = dims
    self.initializers = initializers
    self.attrs = attrs
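# parse_by_optimizer reads self.opt_input_map and self.opt_attr_map, whose
# definitions are not shown above. The sketch below is only a plausible shape,
# consistent with how the loops in parse_by_optimizer use the entries; the
# concrete keys and tuples are assumptions, not the library's actual tables.
# Each input entry is (formal_name, shape): shape=None means "derive the shape
# from total_dims / sharding"; each attr entry is (attr_name, type_tag) that
# gets serialized as "name&type&value".
opt_input_map = {
    "sgd": [("Param", None), ("LearningRate", 1)],
    "sum": [("Param", None)],
}
opt_attr_map = {
    "sgd": [],
    "sum": [],
}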
def add_optimizer_pass(program, config):
    def _append_pserver_grad_merge_ops(optimize_block, grad_varname_for_block,
                                       endpoint, grad_to_block_id):
        trainers = config.get_trainers()

        program = optimize_block.program
        pserver_block = program.global_block()
        grad_block = None

        for g in config.param_grad_ep_mapping[endpoint]["grads"]:
            if _orig_varname(g.name) == \
                    _orig_varname(grad_varname_for_block):
                grad_block = g
                break

        if not grad_block:
            # do not append this op if current endpoint
            # is not dealing with this grad block
            return None

        orig_varname, block_name, trainer_name = _get_varname_parts(
            grad_block.name)

        if block_name:
            merged_var_name = '.'.join([orig_varname, block_name])
        else:
            merged_var_name = orig_varname

        merged_var = pserver_block.create_var(
            name=grad_block.name,
            persistable=True,
            type=grad_block.type,
            dtype=grad_block.dtype,
            shape=grad_block.shape)

        grad_to_block_id.append(merged_var.name + ":" +
                                str(optimize_block.idx))

        if config.is_sync_mode() and trainers > 1:
            vars2merge = []
            for i in range(trainers):
                per_trainer_name = "%s.trainer_%d" % \
                                   (merged_var_name, i)
                per_trainer_var = pserver_block.create_var(
                    name=per_trainer_name,
                    persistable=False,
                    type=grad_block.type,
                    dtype=grad_block.dtype,
                    shape=grad_block.shape)
                vars2merge.append(per_trainer_var)

            optimize_block.append_op(
                type="sum",
                inputs={"X": vars2merge},
                outputs={"Out": merged_var},
                attrs={"use_mkldnn": False})
            optimize_block.append_op(
                type="scale",
                inputs={"X": merged_var},
                outputs={"Out": merged_var},
                attrs={"scale": 1.0 / float(trainers)})
        return merged_var

    origin_program = config.get_origin_main_program()
    origin_program = origin_program.clone()
    ps_endpoint = config.get_ps_endpoint()

    opt_op_on_pserver = []
    # Iterate through the ops, and if an op and the optimize ops
    # which located on current pserver are in one set, then
    # append it into the sub program.
    global_ops = []
    # sparse grad name to param name
    sparse_grad_to_param = []

    def _is_opt_op_on_pserver(endpoint, op):
        param_names = [
            p.name for p in config.param_grad_ep_mapping[endpoint]["params"]
        ]

        unmerged_varnames = []
        merged_varnames = []
        merged_ordernames = []

        for name in param_names:
            orig_varname = _orig_varname(name)

            for pairs in config.merged_variables_pairs:
                merged_p = pairs[0]
                if merged_p.merged_var.name == orig_varname:
                    if merged_p.merged_var.name == merged_p.ordered_vars[
                            0].name:
                        unmerged_varnames.append(
                            merged_p.ordered_vars[0].name)
                    else:
                        merged_varnames.append(merged_p.merged_var.name)
                        merged_ordernames.append(
                            merged_p.ordered_vars[0].name)
                    break

        param = op.input("Param")[0]

        if param in unmerged_varnames:
            return True

        for i in range(len(merged_ordernames)):
            if param == merged_ordernames[i]:
                merged_p = merged_varnames[i]
                merged_g = "{}@GRAD".format(merged_varnames[i])
                op._set_attr(OP_ROLE_VAR_ATTR_NAME, [merged_p, merged_g])
                return True
        return False

    def __append_optimize_op__(op, block, grad_to_block_id, merged_var,
                               lr_ops):
        if _is_optimizer_op(op):
            _append_pserver_ops(block, op, ps_endpoint, grad_to_block_id,
                                origin_program, merged_var,
                                sparse_grad_to_param, config)
        elif op not in lr_ops:
            _append_pserver_non_opt_ops(block, op, origin_program, config)

    optimize_ops = _get_optimize_ops(origin_program)
    for _, op in enumerate(optimize_ops):
        if _is_optimizer_op(op) and _is_opt_op_on_pserver(ps_endpoint, op):
            opt_op_on_pserver.append(op)

    # append lr decay ops to the child block if exists
    lr_ops = _get_lr_ops(origin_program)
    has_lr_decay = True if len(lr_ops) > 0 else False
    lr_decay_block_id = -1
    optimize_blocks = []

    if has_lr_decay:
        counter_increment_idx = -1
        for idx, op in enumerate(lr_ops):
            if op.type != 'increment':
                continue
            counter = op.input("X")[0]
            if counter == LEARNING_RATE_DECAY_COUNTER:
                counter_increment_idx = idx
                break

        if counter_increment_idx != -1:
            lr_ops.pop(counter_increment_idx)

        lr_decay_block = program._create_block(program.num_blocks - 1)
        optimize_blocks.append(lr_decay_block)
        for op in lr_ops:
            cloned_op = _append_pserver_non_opt_ops(lr_decay_block, op,
                                                    origin_program, config)
            # append sub blocks to pserver_program in lr_decay_op
            # todo(tangwei12): __clone_lr_op_sub_block__
        lr_decay_block_id = lr_decay_block.idx

    # append op to the current block
    grad_to_block_id = []
    pre_block_idx = program.num_blocks - 1

    for idx, opt_op in enumerate(opt_op_on_pserver):
        per_opt_block = program._create_block(pre_block_idx)
        optimize_blocks.append(per_opt_block)
        optimize_target_param_name = opt_op.attr(OP_ROLE_VAR_ATTR_NAME)[0]
        # append grad merging ops before clip and weight decay,
        # e.g. merge grad -> L2Decay op -> clip op -> optimize
        merged_var = None

        for _, op in enumerate(optimize_ops):
            # find the origin grad var before clipping / L2Decay,
            # merged_var should be the input var name of L2Decay
            grad_varname_for_block = op.attr(OP_ROLE_VAR_ATTR_NAME)[1]
            if op.attr(OP_ROLE_VAR_ATTR_NAME)[0] == optimize_target_param_name:
                merged_var = _append_pserver_grad_merge_ops(
                    per_opt_block, grad_varname_for_block, ps_endpoint,
                    grad_to_block_id)
                if merged_var:
                    break  # append optimize op once then append other ops.

        if merged_var:
            for _, op in enumerate(optimize_ops):
                # optimizer is connected to itself
                if op.attr(OP_ROLE_VAR_ATTR_NAME)[0] == optimize_target_param_name and \
                        op not in global_ops:
                    __append_optimize_op__(op, per_opt_block,
                                           grad_to_block_id, merged_var,
                                           lr_ops)

    # dedup grad to ids list
    grad_to_block_id = list(set(grad_to_block_id))
    # append global ops
    if global_ops:
        opt_state_block = program._create_block(program.num_blocks - 1)
        optimize_blocks.append(opt_state_block)
        for glb_op in global_ops:
            __append_optimize_op__(glb_op, opt_state_block, grad_to_block_id,
                                   None, lr_ops)

    if len(optimize_blocks) == 0:
        pre_block_idx = program.num_blocks - 1
        empty_block = program._create_block(pre_block_idx)
        optimize_blocks.append(empty_block)

    op = get_op_by_type(program.global_block(), "listen_and_serv")

    op._set_attr("optimize_blocks", optimize_blocks)
    op._set_attr("grad_to_block_id", grad_to_block_id)
    op._set_attr("sparse_grad_to_param", sparse_grad_to_param)
    op._set_attr("lr_decay_block_id", lr_decay_block_id)

    return program
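# Minimal inspection sketch (assumes `pserver_program` is the program returned
# by add_optimizer_pass above; the helper name is hypothetical). The pass
# records its results as attributes on the listen_and_serv op, so they can be
# read back for debugging.
def _dump_listen_and_serv_attrs(pserver_program):
    serv_op = get_op_by_type(pserver_program.global_block(),
                             "listen_and_serv")
    # entries have the form "<merged grad var name>:<optimize block idx>"
    print(serv_op.attr("grad_to_block_id"))
    # "<sparse grad name>:<param name>" pairs collected by _append_pserver_ops
    print(serv_op.attr("sparse_grad_to_param"))
    # -1 when no learning-rate decay block was created
    print(serv_op.attr("lr_decay_block_id"))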
def _get_optimizer_op(self, param_name):
    opts = public._get_optimize_ops(self._origin_main_program)
    for op in opts:
        if "Param" in op.input_names and \
                "LearningRate" in op.input_names and \
                op.input("Param")[0] == param_name:
            return op
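# Usage sketch (hypothetical caller, names assumed): given an object exposing
# _get_optimizer_op as defined above, look up the optimizer op that updates a
# parameter and read its declared inputs.
def describe_optimizer_for(worker, param_name):
    op = worker._get_optimizer_op(param_name)
    if op is None:
        return "no optimizer op found for {}".format(param_name)
    # op.type is e.g. "sgd" or "adam"; LearningRate is one of its inputs
    return "{} updates {} with lr var {}".format(
        op.type, param_name, op.input("LearningRate")[0])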