def run(self, ops, batch=False, **kwargs):
    """Run `ops` in the session, lazily initializing variables first.

    When `batch` is set, estimator statistics are also evaluated,
    accumulated and logged alongside the requested `ops`.
    """
    # initialize any global variables we have not yet initialized
    pending = [
        v for v in self.global_variables()
        if v not in self.initialized_variables]
    if pending:
        names = '\n '.join(v.op.name for v in pending)
        log.warn('Variables are not initialized:\n {}'.format(names))
        self.raw_run(tf.variables_initializer(pending))
        self.initialized_variables += pending
    # push overrider hyperparameter values into the graph
    self._overrider_assign_parameters()
    if not batch:
        # plain run with no statistics bookkeeping
        return self.raw_run(ops, **kwargs)
    # batched run: evaluate estimator operations too, then update and
    # report the accumulated statistics
    results, statistics = self.raw_run(
        (ops, self.estimator.operations), **kwargs)
    self.estimator.append(statistics)
    text = self.estimator.format(batch_size=self.batch_size)
    log.info(text, update=True)
    if log.is_enabled('debug'):
        self.estimator.debug()
    return results
def subtract_channel_means(self, i):
    """Subtract per-channel mean values from image tensor `i`."""
    means = self.moment.get('mean')
    if not means:
        # no configured means: fall back to a neutral 0.5 per channel
        log.warn('Channel means not supplied, defaulting '
                 'to 0.5 for each channel.')
        means = [0.5] * i.shape[-1]
    # broadcastable (1, 1, C) constant, subtracted from every pixel
    offset = tf.constant(
        means, shape=[1, 1, len(means)], name='image_means')
    return i - offset
def _warn_ties(ties, num_ties, thresholds):
    """Warn (once per run) about tied top-k values; returns `num_ties`."""
    for index, (batch_ties, count, threshold) in enumerate(
            zip(ties, num_ties, thresholds)):
        if count == 1:
            # a single "tie" is just the k-th value itself, not a real tie
            continue
        log.warn(
            'Top-k of batch index {} has {} tie values {}.'
            .format(index, int(count), int(threshold)), once='ties')
    return num_ties
def _estimate_layer(self, node, in_info):
    """Estimate layer statistics, letting each overrider adjust them.

    Returns the estimated output statistics for `node` after every
    applicable overrider has transformed them.
    """
    out_info = super()._estimate_layer(node, in_info)
    log.debug('Estimated statistics for {!r}: {}.'.format(
        node.formatted_name(), out_info))
    for k, o in self.overriders.get(node, {}).items():
        # FIX: `k` is a single key string, but the original compared it
        # to the list itself (`k == ['gradient', 'normalization']`),
        # which is always False and made this skip branch unreachable.
        if k in ('gradient', 'normalization'):
            log.warn('Normalization/gradient estimation not supported.')
            continue
        out_info = o.estimate(out_info, in_info)
        log.debug('Overrider {!r} modified statistics: {}.'.format(
            o, out_info))
    return out_info
def save(self, key):
    """Save a checkpoint for `key` (an epoch number or a name string)."""
    cp_path = self._path(key, True)
    if isinstance(key, int):
        message = 'Saving checkpoint at epoch {} to {!r}...'.format(
            key, cp_path)
    else:
        message = 'Saving checkpoint to {!r}...'.format(cp_path)
    log.info(message)
    try:
        tf.train.Saver(self._global_variables()).save(
            self.tf_session, cp_path, write_meta_graph=False)
    except tf.errors.ResourceExhaustedError:
        # best-effort: a full disk should not abort the run
        log.warn('Unable to save a checkpoint because we have '
                 'no space left on device.')
def normalize_channels(self, i):
    """Standardize image `i`: subtract channel means, divide by stds."""
    # FIXME we pin this augmentation action to GPU because of
    # poor performance on CPU caused by this.
    with tf.device('/gpu:0'):
        i = self.subtract_channel_means(i)
        stds = self.moment.get('std')
        if not stds:
            # no std values configured: warn and skip the division
            log.warn('Channel std value not supplied, defaulting '
                     'to 1.0 for each channel.')
            return i
        # broadcastable (1, 1, C) constant divisor
        divisor = tf.constant(
            stds, shape=[1, 1, len(stds)], name='image_stds')
        return i / divisor
def _update_policy(self, tensor):
    """ simple brute-force, optimal result. """
    width = self.eval(self.width)
    # scan every binary point position, accept the first one whose
    # overflow rate fits within budget
    for point in range(-width, width + 1):
        overflow = self._quantize(
            tensor, point=point, width=width, compute_overflow_rate=True)
        if overflow <= self.overflow_rate:
            return point
    log.warn('Cannot find a binary point position that satisfies the '
             'overflow_rate budget, using integer (point at the right '
             'of LSB) instead.')
    return width
def _auto_select_gpus(num_gpus, memory_bound):
    """Pick up to `num_gpus` GPUs whose used memory is within bound.

    Returns a comma-separated string of GPU indices, suitable for
    CUDA_VISIBLE_DEVICES.
    """
    try:
        output = subprocess.check_output(
            'nvidia-smi', shell=True, stderr=subprocess.STDOUT)
        usages = re.findall(r'(\d+)MiB\s/', output.decode('utf-8'))
        log.debug('GPU memory usages (MB): {}'.format(', '.join(usages)))
        gpus = [
            index for index, used in enumerate(int(m) for m in usages)
            if used <= memory_bound]
    except subprocess.CalledProcessError:
        # nvidia-smi unavailable or failing: no GPUs to offer
        gpus = []
    if len(gpus) < num_gpus:
        log.warn('Number of GPUs available {} is less than the number of '
                 'GPUs requested {}.'.format(len(gpus), num_gpus))
    return ','.join(str(g) for g in gpus[:num_gpus])
def search(self, params):
    """Search for a binary point position whose range covers `max`.

    Args:
        params: mapping with a 'max' bound and a 'targets' collection
            that must contain 'point'.
    Returns:
        dict with the chosen 'point' value.
    Raises:
        ValueError: if 'max' or the 'point' target is missing.
    """
    max_bound = params.get('max')
    if max_bound is None:
        # FIX: the original passed the format string and its argument
        # as two separate ValueError arguments without ever calling
        # .format(), so the message was never interpolated.
        # NOTE(review): assumes `self.__name__` exists on this object —
        # confirm; `type(self).__name__` may have been intended.
        raise ValueError(
            'require max value to search for {}'.format(self.__name__))
    targets = params.get('targets')
    if targets is None or 'point' not in targets:
        raise ValueError('Required targets are not specified')
    w = self.eval(self.width)
    max_value = 2 ** (w - 1)
    # find the smallest shift whose representable range covers max_bound
    for p in range(-2 * w, w + 1):
        shift = 2.0 ** p
        if max_bound <= max_value * shift:
            return {'point': w + p}
    log.warn('Cannot find a binary point position that satisfies the '
             'overflow_rate budget, using integer (point at the right '
             'of LSB) instead.')
    return {'point': w}
def _add_var_scope(self, node, params, scope_list):
    """Append a variable scope for `node` that applies its overriders
    via a custom getter at variable-creation time."""
    path = '/'.join(node.module)
    if not path:
        raise ValueError('Module path is empty.')
    fwd_overriders = params.pop('overrider', None) or {}
    grad_overriders = fwd_overriders.pop('gradient', {})
    # regularizers clash with gradient overriding, so disable them
    for key, overrider in grad_overriders.items():
        if params.pop('{}_regularizer'.format(key), None):
            log.warn(
                'Regularizer for \'{}/{}\' is for now disabled as we '
                'override its gradient with {!r}.'
                .format(node.formatted_name(), key, overrider))

    def custom_getter(getter, name, *args, **kwargs):
        # intercept variable creation, applying forward and gradient
        # overriders keyed by the scope-relative variable name
        v = getter(name, *args, **kwargs)
        log.debug('Variable {} created.'.format(v))
        key = name.replace('{}/'.format(node.formatted_name()), '')
        fwd = fwd_overriders.get(key)
        if fwd:
            log.debug('Overriding {!r} with {!r}.'.format(name, fwd))
            v = fwd.apply(node, name, getter, v)
        # gradient overrider
        grad = grad_overriders.get(key)
        if grad and self.is_training:
            v = self._apply_gradient_overrider(node, name, grad, v)
        self.variables.setdefault(node, {})[key] = v
        return v

    @contextlib.contextmanager
    def custom_scope():
        # we do not have direct access to variable creation,
        # so scope must be used.
        # FIXME there is currently no possible workaround for
        # auto-generated `name_scope` from `variable_scope` with names
        # that are being uniquified. See #39.
        var_scope = tf.variable_scope(
            path, reuse=self.reuse, custom_getter=custom_getter)
        with var_scope as scope:
            yield scope

    scope_list.append(custom_scope())
def _update(self):
    """Recompute positive/negative masks, their means, and quantizers."""
    # update positives mask and mean values
    value = self.session.run(self.before)
    # divide them into two groups
    # mean = util.mean(value)
    mean = 0.0
    # find two central points
    positives = value > mean
    self.positives = positives
    self.positives_mean = util.mean(value[util.where(positives)])
    negatives = util.logical_and(util.logical_not(positives), value != 0)
    self.negatives_mean = util.mean(value[util.where(negatives)])
    # FIX: evaluate each mean once instead of up to four separate
    # `.eval()` session round-trips in the original
    pos_mean = self.positives_mean.eval()
    neg_mean = self.negatives_mean.eval()
    if pos_mean == 0 or neg_mean == 0:
        log.warn(
            'means are skewed, pos mean is {} and neg mean is {}'.format(
                pos_mean, neg_mean))
    # update internal quantizer
    self.quantizer.update()
    for quantizer in self.parameter_quantizers.values():
        quantizer.update()
def create_overrider(overriders):
    """Build overrider instances from params; chain when more than one.

    NOTE(review): `self` and `layer_node` are taken from an enclosing
    scope — this appears to be a nested function.
    """
    # every entry must declare a type
    for name, p in overriders.items():
        if p.get('type'):
            continue
        raise TypeError(
            'We expect a mapping of name-overrider pairs, overrider '
            'named {!r} does not have a type.'.format(name))
    if all(not p.get('_priority') for p in overriders.values()):
        log.warn(
            'Priority not specified for a sequence of overriders '
            'in layer {!r}, which may result in unexpected ordering.'
            .format(layer_node.formatted_name()))
    # highest priority first; reversed(sorted(...)) is kept instead of
    # reverse=True to preserve the original's tie ordering exactly
    ordered = list(reversed(sorted(
        overriders.values(), key=lambda p: p.get('_priority', 0))))
    instances = [
        cls(session=self.session, **p)
        for cls, p in multi_objects_from_params(ordered)]
    if len(instances) == 1:
        return instances[0]
    return ChainOverrider(session=self.session, overriders=instances)
def load(self, key=_checkpoint_latest):
    """Restore shape- and dtype-compatible variables from checkpoint `key`.

    Returns the list of variables actually restored; an empty list when
    loading is disabled or no checkpoint manifest is found.
    """
    # `key=False`, `None` or '' disables loading; epoch 0 is still valid
    if key is False or (key != 0 and not key):
        log.debug('Checkpoint loading disabled.')
        return []
    try:
        path = self._path(key, False)
    except CheckpointManifestNotFoundError as e:
        log.warn('{} Abort load.'.format(e))
        return []
    reader = tf.train.NewCheckpointReader(path)
    var_shape_map = reader.get_variable_to_shape_map()
    var_dtype_map = reader.get_variable_to_dtype_map()
    restore_vars = []
    missing_vars = []
    for v in self._global_variables():
        # strip the ':0' output-index suffix from the variable name
        base_name, _ = v.name.split(':')
        shape = var_shape_map.get(base_name, None)
        if shape is None:
            # present in the graph but absent from the checkpoint
            missing_vars.append(base_name)
            continue
        v_shape = v.shape.as_list()
        if shape != v_shape:
            # skip variables whose stored shape disagrees with the graph
            v_shape = format_shape(v_shape)
            shape = format_shape(shape)
            log.warn(
                'Variable named {!r} has shape ({}) mismatching '
                'the shape ({}) in checkpoint, not loading it.'.format(
                    base_name, v_shape, shape))
            continue
        # NOTE(review): assumes the dtype map has an entry whenever the
        # shape map does — `.get(...).base_dtype` would raise otherwise
        dtype = var_dtype_map.get(base_name, None).base_dtype
        v_dtype = v.dtype.base_dtype
        if dtype != v_dtype:
            log.warn(
                'Variable named {!r} has dtype {!r} mismatching '
                'the dtype {!r} in checkpoint, not loading it.'.format(
                    base_name, v_dtype.name, dtype.name))
            continue
        restore_vars.append(v)
    # variable not restored
    not_restore_vars = []
    restore_var_names = [v.name.split(':')[0] for v in restore_vars]
    for v in var_shape_map:
        if v not in restore_var_names:
            not_restore_vars.append(v)
    desc = 'Variables in checkpoint but not restored'
    print_variables(desc, not_restore_vars, 'warn')
    # variables missing
    desc = 'Variables to be restored but missing in checkpoint'
    print_variables(desc, missing_vars, 'warn')
    # variables to restore
    desc = 'Checkpoint variables to restore'
    print_variables(desc, (v.name for v in restore_vars), 'debug')
    # restore
    restorer = tf.train.Saver(restore_vars)
    restorer.restore(self.tf_session, path)
    log.debug('Checkpoint restored.')
    return restore_vars
def load(self, key=_checkpoint_latest):
    """Restore shape-compatible variables from checkpoint `key`.

    Returns the list of variables actually restored; an empty list when
    loading is disabled or no checkpoint manifest is found.
    """
    # `key=False`, `None` or '' disables loading; epoch 0 is still valid
    if key is False or (key != 0 and not key):
        log.debug('Checkpoint loading disabled.')
        return []
    try:
        path = self._path(key, False)
    except CheckpointManifestNotFoundError as e:
        log.warn('{} Abort load.'.format(e))
        return []
    reader = tf.train.NewCheckpointReader(path)
    shape_map = reader.get_variable_to_shape_map()
    restore_vars = []
    missing_vars = []
    for v in self._global_variables():
        # drop the ':0' output-index suffix from the variable name
        name = v.name.split(':')[0]
        ckpt_shape = shape_map.get(name, None)
        if ckpt_shape is None:
            # present in the graph but absent from the checkpoint
            missing_vars.append(name)
            continue
        local_shape = v.shape.as_list()
        if ckpt_shape != local_shape:
            # skip variables whose stored shape disagrees with the graph
            msg = ('Variable named {!r} has shape ({}) mismatch with the '
                   'shape ({}) in checkpoint, not loading it.')
            log.warn(msg.format(
                name, format_shape(local_shape),
                format_shape(ckpt_shape)))
            continue
        restore_vars.append(v)
    # report checkpoint entries we chose not to restore
    restored_names = [v.name.split(':')[0] for v in restore_vars]
    skipped = [n for n in shape_map if n not in restored_names]
    if skipped:
        log.debug(
            'Variables in checkpoint but not restored:\n {}'.format(
                '\n '.join(skipped)))
    # report graph variables absent from the checkpoint
    if missing_vars:
        log.warn('Variables missing in checkpoint:\n {}'.format(
            '\n '.join(missing_vars)))
    log.debug('Checkpoint variables to restore:\n {}'.format(
        '\n '.join(v.name for v in restore_vars)))
    tf.train.Saver(restore_vars).restore(self.tf_session, path)
    log.debug('Checkpoint restored.')
    return restore_vars
def _init_system_config(self):
    """Load bundled system defaults and neutralize CUDA_VISIBLE_DEVICES."""
    root = os.path.dirname(__file__)
    self.yaml_update(os.path.join(root, 'system.yaml'))
    # our own `system.visible_gpus` setting takes precedence, so the
    # environment variable is removed and a warning issued if it was set
    if os.environ.pop('CUDA_VISIBLE_DEVICES', None):
        log.warn('Ignoring "CUDA_VISIBLE_DEVICES", as it is overridden '
                 'by "system.visible_gpus".')