def _create_accumulators(self, block, parameters):
    assert isinstance(block, framework.Block)
    main_block = block.program.global_block()
    # Create beta1 and beta2 power tensors
    beta_shape = [1]
    self._beta1_pow_acc = self.helper.create_global_variable(
        name=unique_name.generate('beta1_pow_acc'),
        dtype='float32',
        shape=beta_shape,
        lod_level=0,
        persistable=True)
    self.helper.set_variable_initializer(
        self._beta1_pow_acc, initializer=Constant(self._beta1))

    self._beta2_pow_acc = self.helper.create_global_variable(
        name=unique_name.generate('beta2_pow_acc'),
        dtype='float32',
        shape=beta_shape,
        lod_level=0,
        persistable=True)
    self.helper.set_variable_initializer(
        self._beta2_pow_acc, initializer=Constant(self._beta2))

    # Create accumulator tensors for first and second moments
    for p in parameters:
        self._add_accumulator(self._moment1_acc_str, p)
        self._add_accumulator(self._moment2_acc_str, p)
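# Context (a minimal plain-Python sketch, not Fluid API): the beta-power
# accumulators created above hold beta1^t and beta2^t after t steps; the Adam
# update divides its moment estimates by (1 - beta^t) for bias correction.
# All names below are hypothetical illustration only.
beta1, beta2, t = 0.9, 0.999, 10
beta1_pow, beta2_pow = beta1 ** t, beta2 ** t  # what the accumulators track
bias_correction1 = 1.0 - beta1_pow             # rescales the first moment
bias_correction2 = 1.0 - beta2_pow             # rescales the second moment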
def __norm_op(x,
              out=None,
              p=2,
              dim=None,
              keep_dim=False,
              block=self.startup_program.global_block()):
    # Computes the p-norm by chaining abs -> pow(p) -> reduce_sum -> pow(1/p).
    if out is None:
        out = block.create_var(
            name=unique_name.generate(".".join(
                [self.name, 'weight_norm_norm'])),
            dtype=dtype,
            persistable=False)
    abs_out = block.create_var(
        name=unique_name.generate(".".join(
            [self.name, 'weight_norm_abs'])),
        dtype=dtype,
        persistable=False)
    block.append_op(
        type='abs', inputs={'X': x}, outputs={'Out': abs_out})
    pow_out = block.create_var(
        name=unique_name.generate(".".join(
            [self.name, 'weight_norm_pow'])),
        dtype=dtype,
        persistable=False)
    block.append_op(
        type='pow',
        inputs={'X': abs_out},
        outputs={'Out': pow_out},
        attrs={'factor': float(p)})
    sum_out = block.create_var(
        name=unique_name.generate(".".join(
            [self.name, 'weight_norm_sum'])),
        dtype=dtype,
        persistable=False)
    block.append_op(
        type='reduce_sum',
        inputs={'X': pow_out},
        outputs={'Out': sum_out},
        attrs={
            'dim': dim,
            'keep_dim': keep_dim,
            'reduce_all': True if dim is None else False
        })
    block.append_op(
        type='pow',
        inputs={'X': sum_out},
        outputs={'Out': out},
        attrs={'factor': 1. / p})
    return out
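# For reference, a minimal NumPy sketch of the computation the op sequence
# above assembles (abs -> pow(p) -> reduce_sum -> pow(1/p)). `numpy_norm` is
# a hypothetical helper, not part of the Fluid API.
import numpy as np

def numpy_norm(x, p=2, dim=None, keep_dim=False):
    pow_out = np.abs(x) ** float(p)
    if dim is None:
        sum_out = pow_out.sum()  # mirrors reduce_all=True
    else:
        sum_out = pow_out.sum(axis=dim, keepdims=keep_dim)
    return sum_out ** (1.0 / p)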
def create_parameter(self,
                     attr,
                     shape,
                     dtype,
                     is_bias=False,
                     default_initializer=None):
    # Deepcopy the attr so that parameters can be shared in program
    attr = copy.deepcopy(attr)
    assert isinstance(attr, ParamAttr)
    suffix = 'b' if is_bias else 'w'
    if attr.name is None:
        attr.name = unique_name.generate(".".join([self.name, suffix]))

    if default_initializer is None and attr.initializer is None:
        if is_bias:
            attr.set_default_bias_initializer()
        else:
            attr.set_default_param_initializer()
    else:
        attr.set_default_initializer(default_initializer)

    # If weight normalization is set, insert extra parameters and ops.
    # Refer to https://arxiv.org/pdf/1602.07868.pdf
    if isinstance(attr, WeightNormParamAttr):
        param = self._create_weight_normalize(attr, shape, dtype)
        WeightNormParamAttr.params_with_weight_norm.append(param)
        return param

    self.startup_program.global_block().create_parameter(
        dtype=dtype, shape=shape, **attr.to_kwargs(with_initializer=True))
    return self.main_program.global_block().create_parameter(
        dtype=dtype, shape=shape, **attr.to_kwargs())
def __norm_except_dim(x,
                      out=None,
                      dim=None,
                      block=self.startup_program.global_block()):
    """Computes the norm over all dimensions except dim"""
    if out is None:
        out = block.create_var(
            name=unique_name.generate(".".join(
                [self.name, 'weight_norm_norm'])),
            dtype=dtype,
            persistable=False)
    if dim is None:
        __norm_op(x, out, dim=dim, block=block)
    elif dim == 0:
        out_shape = [x.shape[0]] + [1] * (len(x.shape) - 1)
        reshape = __reshape_op(x, shape=[x.shape[0], -1], block=block)
        norm = __norm_op(reshape, dim=1, block=block)
        __reshape_op(norm, out=out, shape=out_shape, block=block)
    elif dim == len(x.shape) - 1:
        out_shape = [1] * (len(x.shape) - 1) + [x.shape[-1]]
        reshape = __reshape_op(x, shape=[-1, x.shape[-1]], block=block)
        norm = __norm_op(reshape, dim=0, block=block)
        __reshape_op(norm, out=out, shape=out_shape, block=block)
    else:
        # list() so the permutation is mutable on Python 3 as well.
        perm = list(range(len(x.shape)))
        perm[0], perm[dim] = dim, 0
        transpose = __transpose_op(x, perm, block=block)
        norm = __norm_op(transpose, dim=0, block=block)
        __transpose_op(norm, perm, out=out, block=block)
    return out
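# A minimal NumPy sketch of the documented intent ("norm over all dimensions
# except dim"), covering the dim=None, dim=0, and dim=last cases above.
# `numpy_norm_except_dim` is a hypothetical helper, not part of the Fluid API.
import numpy as np

def numpy_norm_except_dim(x, dim=None, p=2):
    if dim is None:
        return (np.abs(x) ** p).sum() ** (1.0 / p)
    # Move `dim` to the front and flatten the rest, as the reshape branches do.
    moved = np.moveaxis(x, dim, 0)
    flat = moved.reshape(moved.shape[0], -1)
    out = (np.abs(flat) ** p).sum(axis=1) ** (1.0 / p)
    out_shape = [1] * x.ndim
    out_shape[dim] = x.shape[dim]
    return out.reshape(out_shape)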
def _add_accumulator(self,
                     name,
                     param,
                     dtype=None,
                     fill_value=0.0,
                     shape=None):
    """Utility function to add an accumulator for a parameter

    Args:
        name: name of the accumulator
        param: parameter variable for which accumulator is to be added
        dtype: data type of the accumulator variable
        fill_value: value to initialize the accumulator variable
        shape: shape of the accumulator variable; defaults to param.shape
    """
    if (name in self._accumulators and
            param.name in self._accumulators[name]):
        raise Exception(
            "Accumulator {} already exists for parameter {}".format(
                name, param.name))
    if shape is None:
        shape = param.shape
    assert isinstance(self.helper, LayerHelper)
    var = self.helper.create_global_variable(
        name=unique_name.generate(name),
        persistable=True,
        dtype=dtype or param.dtype,
        type=param.type,
        shape=shape)
    self.helper.set_variable_initializer(
        var, initializer=Constant(value=float(fill_value)))
    self._accumulators[name][param.name] = var
    return var
def __transpose_op(x,
                   axis,
                   out=None,
                   block=self.startup_program.global_block()):
    if out is None:
        out = block.create_var(
            name=unique_name.generate(".".join(
                [self.name, 'weight_norm_transpose'])),
            dtype=dtype,
            persistable=False)
    block.append_op(
        type='transpose',
        inputs={'X': x},
        outputs={'Out': out},
        attrs={'axis': axis})
    return out
def _create_accumulators(self, block, parameters):
    # Create beta1 power accumulator tensor
    beta_shape = [1]
    self._beta1_pow_acc = self.helper.create_global_variable(
        name=unique_name.generate('beta1_pow_acc'),
        dtype='float32',
        shape=beta_shape,
        lod_level=0,
        persistable=True)
    self.helper.set_variable_initializer(
        self._beta1_pow_acc, initializer=Constant(self._beta1))

    # Create accumulator tensors for first moment and infinity norm
    for p in parameters:
        self._add_accumulator(self._moment_acc_str, p)
        self._add_accumulator(self._inf_norm_acc_str, p)
def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map):
    var_map = copy.copy(target_grad_map)
    for op_idx in range(start_op_idx, block.desc.op_size()):
        op_desc = block.desc.op(op_idx)
        # Redirect inputs that refer to a variable renamed earlier.
        for name in op_desc.input_arg_names():
            if name in var_map:
                op_desc.rename_input(name, var_map[name])

        # Outputs that collide with an existing variable get a fresh name.
        for name in op_desc.output_arg_names():
            if block.desc.find_var(name.encode("ascii")):
                new_name = unique_name.generate(name)
                op_desc.rename_output(name, new_name)
                var_map[name] = new_name

    # items() rather than the Python 2-only iteritems().
    for g, ng in var_map.items():
        if g in grad_to_var:
            grad_to_var[ng] = grad_to_var[g]
            grad_to_var.pop(g)
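# A minimal plain-Python sketch (hypothetical names, no Paddle descriptors) of
# the renaming pass above: colliding outputs get fresh names, and every later
# input that referred to the old name is redirected through var_map.
var_map = {'x@GRAD': 'x@GRAD@RENAMED'}  # seeded from target_grad_map
op_input_names = ['x@GRAD', 'y']
op_input_names = [var_map.get(n, n) for n in op_input_names]
assert op_input_names == ['x@GRAD@RENAMED', 'y']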
def _create_state(self, suffix, dtype, shape):
    """
    Create state variable.

    Args:
        suffix(str): the state suffix.
        dtype(str|core.VarDesc.VarType): the state data type
        shape(tuple|list): the shape of state

    Returns: State variable
    """
    state = self.helper.create_variable(
        name="_".join([unique_name.generate(self.helper.name), suffix]),
        persistable=True,
        dtype=dtype,
        shape=shape)
    self.states.append(state)
    return state
def _create_global_learning_rate(self):
    lr = self.global_learning_rate()
    if isinstance(lr, framework.Variable):
        return
    else:
        if not isinstance(self._learning_rate, float):
            raise TypeError(
                "learning rate variable is created outside optimizer; "
                "cannot create a new learning rate variable for a new program")

        # create learning rate in the current main program
        self._learning_rate_map[
            framework.default_main_program()] = layers.create_global_var(
                name=unique_name.generate("learning_rate"),
                shape=[1],
                value=float(self._learning_rate),
                dtype='float32',
                persistable=True)
def __init__(self,
             average_window_rate,
             params_grads=None,
             min_average_window=10000,
             max_average_window=10000,
             **kwargs):
    super(ModelAverage, self).__init__(0.0, **kwargs)
    self.average_window = average_window_rate
    self.min_average_window = min_average_window
    self.max_average_window = max_average_window

    self.params_grads = [] if params_grads is None else params_grads
    params = {}
    for param, grad in self.params_grads:
        if param.do_model_average != False:
            params[param.name] = (param, grad)
    for param in framework.default_main_program().global_block(
    ).all_parameters():
        if param.name not in params and param.do_model_average != False:
            grad = param.block.create_var(
                name=unique_name.generate(".".join([param.name, 'tmp'])),
                dtype=param.dtype,
                persistable=False,
                stop_gradient=True)
            params[param.name] = (param, grad)
    self.params_grads = params.values()

    for param, grad in self.params_grads:
        self._append_average_accumulate_op(param)

    self.apply_program = Program()
    block = self.apply_program.global_block()
    with program_guard(main_program=self.apply_program):
        for param_grad in self.params_grads:
            self._add_average_apply_op(block, param_grad)

    self.restore_program = Program()
    block = self.restore_program.global_block()
    with program_guard(main_program=self.restore_program):
        for param_grad in self.params_grads:
            self._add_average_restore_op(block, param_grad)
def create_state(self, suffix, dtype, shape):
    """
    Create state variable.

    NOTE: It is not a public API.

    Args:
        suffix(str): the state suffix.
        dtype(str|core.VarDesc.VarType): the state data type
        shape(tuple|list): the shape of state

    Returns: State variable
    """
    state = self.helper.create_variable(
        name="_".join([unique_name.generate(self.helper.name), suffix]),
        persistable=True,
        dtype=dtype,
        shape=shape)
    self.states.append(state)
    return state
def __init__(self,
             block,
             type=core.VarDesc.VarType.LOD_TENSOR,
             name=None,
             shape=None,
             dtype=None,
             lod_level=None,
             capacity=None,
             persistable=None,
             error_clip=None,
             stop_gradient=False,
             **kwargs):
    self.block = block
    self.error_clip = error_clip

    if name is None:
        name = unique_name.generate('_generated_var')
    is_new_var = False
    self.desc = self.block.desc.find_var(name)

    if self.desc is None:
        self.desc = self.block.desc.var(name)
        is_new_var = True

    if is_new_var:
        self.desc.set_type(type)
    elif self.desc.type() != type:
        raise ValueError("Variable {0} has been created before. The "
                         "previous type is {1}; the new type is {2}. They "
                         "are not matched.".format(self.name,
                                                   self.desc.type(), type))

    if shape is not None:
        if is_new_var:
            self.desc.set_shape(shape)
        else:
            old_shape = self.shape
            shape = tuple(shape)
            if shape != old_shape:
                raise ValueError(
                    "Variable {0} has been created before. The previous "
                    "shape is {1}; the new shape is {2}. They are not "
                    "matched.".format(self.name, old_shape, shape))
    if dtype is not None:
        if not isinstance(dtype, core.VarDesc.VarType):
            dtype = convert_np_dtype_to_dtype_(dtype)
        if is_new_var:
            self.desc.set_dtype(dtype)
        else:
            old_dtype = self.dtype
            if dtype != old_dtype:
                raise ValueError("Variable {0} has been created before. "
                                 "The previous data type is {1}; the new "
                                 "data type is {2}. They are not "
                                 "matched.".format(self.name, old_dtype,
                                                   dtype))

    if lod_level is not None:
        if is_new_var:
            self.desc.set_lod_level(lod_level)
        else:
            if lod_level != self.lod_level:
                raise ValueError("Variable {0} has been created before. "
                                 "The previous lod_level is {1}; the new "
                                 "lod_level is {2}. They are not "
                                 "matched.".format(self.name, self.lod_level,
                                                   lod_level))
    if persistable is not None:
        if is_new_var:
            self.desc.set_persistable(persistable)
        else:
            if persistable != self.persistable:
                raise ValueError(
                    "Variable {0} has been created before. "
                    "The previous persistable is {1}; the new "
                    "persistable is {2}. They are not matched.".format(
                        self.name, self.persistable, persistable))

    if capacity is not None:
        if is_new_var:
            self.desc.set_capacity(capacity)
        else:
            # TODO(abhinavarora): Compare with set capacity once
            # get_capacity is implemented.
            pass

    self.block.vars[name] = self
    self.op = None
    self.stop_gradient = stop_gradient
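# A minimal plain-Python sketch (hypothetical registry, not Paddle descriptors)
# of the find-or-create pattern above: reuse an existing descriptor when the
# requested attributes match, and fail loudly instead of silently redefining.
_registry = {}

def _get_or_create(name, shape):
    desc = _registry.get(name)
    if desc is None:
        desc = _registry[name] = {'shape': tuple(shape)}
    elif desc['shape'] != tuple(shape):
        raise ValueError("Variable {0} has been created before. The previous "
                         "shape is {1}; the new shape is {2}. They are not "
                         "matched.".format(name, desc['shape'], tuple(shape)))
    return desc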
def create_tmp_variable(self, dtype, stop_gradient=False):
    return self.main_program.current_block().create_var(
        name=unique_name.generate(".".join([self.name, 'tmp'])),
        dtype=dtype,
        persistable=False,
        stop_gradient=stop_gradient)
def __init__(self, layer_type, **kwargs):
    self.kwargs = kwargs
    self.layer_type = layer_type
    name = self.kwargs.get('name', None)
    if name is None:
        self.kwargs['name'] = unique_name.generate(self.layer_type)