def _test_forward_automatic_d_dynamic_d_hyper(self, method, optimizer_hypers=None, **opt_kwargs):
    iris, x, y, model, w, out, error, accuracy = iris_logistic_regression(
        method.get_augmentation_multiplier())

    shape_w = w.get_shape()

    scalar_hyper = tf.Variable(1., name='scalar_hyper')
    vector_hyper = tf.Variable(tf.ones([3]), name='vector_hyper')

    tr_err = tf.identity(error + tf.reduce_sum(w ** 2) * scalar_hyper
                         + vector_hyper * tf.stack([tf.reduce_sum(w.tensor[:5]),
                                                    tf.reduce_sum(w.tensor[5:10]),
                                                    tf.reduce_sum(w.tensor[10:])]),
                         name='training_error')

    optimizer = method.create(w, loss=tr_err, **opt_kwargs)

    d_phi_d_scalar_hyper = optimizer.auto_d_dynamics_d_hyper(scalar_hyper)
    self.assertIsNotNone(d_phi_d_scalar_hyper.tensor)
    self.assertListEqual(d_phi_d_scalar_hyper.get_shape().as_list(),
                         [shape_w[0].value, 1])

    d_phi_d_vector_hyper = optimizer.auto_d_dynamics_d_hyper(vector_hyper)
    self.assertIsNotNone(d_phi_d_vector_hyper.tensor)
    self.assertListEqual(d_phi_d_vector_hyper.get_shape().as_list(),
                         [shape_w[0].value, vector_hyper.get_shape()[0].value])

    if optimizer_hypers:
        for hyp in as_list(optimizer_hypers):
            self.assertIsNotNone(optimizer.auto_d_dynamics_d_hyper(hyp))
def __init__(self, forward_hyper_grad, hyperparameter_optimizers, hyper_projections=None, hyper_step=None):
    """
    Helper class to perform Real Time Hyperparameter optimization.
    See section 3.3 of Forward and Reverse Gradient-Based Hyperparameter Optimization
    (https://arxiv.org/abs/1703.01785)

    :param forward_hyper_grad: instance of `ForwardHyperGradient`. Used to compute hyper-gradients.
    :param hyperparameter_optimizers: single `Optimizer` or list of `Optimizer`s for the hyperparameter
                                      descent procedure.
    :param hyper_projections: (optional) list of assign ops that perform projections onto a convex subset
                              of the hyperparameter space.
    :param hyper_step: (optional) instance of `GlobalStep` class that keeps track of the number of
                       hyper-batches performed so far.
    """
    assert isinstance(forward_hyper_grad, ForwardHyperGradient)
    self.direct_doh = forward_hyper_grad

    assert isinstance(hyperparameter_optimizers, (list, Optimizer)), \
        "hyperparameter_optimizers should be a single Optimizer or a list of Optimizer. " \
        "Instead is %s" % hyperparameter_optimizers
    self.hyper_opt_dicts = as_list(hyperparameter_optimizers)

    self.hyper_projections = hyper_projections or []

    self.hyper_step = hyper_step or GlobalStep()
def positivity(hyper_list):
    """
    Simple positivity constraint for a list of hyperparameters.

    :param hyper_list: single variable or list of variables (hyperparameters)
    :return: single assign op or list of assign ops, one for each variable in `hyper_list`
    """
    lst = [hyp.assign(tf.maximum(hyp, tf.zeros_like(hyp))) for hyp in as_list(hyper_list)]
    return lst if len(lst) > 1 else lst[0]
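# Usage sketch (not from the original source): the assign ops returned by `positivity` can be run
# after each hyperparameter update to clip negative values back to zero. `lr` and `rho` are
# hypothetical hyperparameter variables introduced only for this example.
def _example_positivity_usage():
    lr = tf.Variable(0.01, name='lr')            # hypothetical scalar hyperparameter
    rho = tf.Variable(tf.ones([3]), name='rho')  # hypothetical vector hyperparameter
    project_ops = positivity([lr, rho])
    with tf.Session() as ss:
        ss.run(tf.global_variables_initializer())
        ss.run(lr.assign(-0.1))  # simulate an update that leaves the feasible set
        ss.run(project_ops)      # clip back onto the positive orthant
        print(ss.run([lr, rho])) # lr is now 0.0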
def __init__(self, fig, plot_streams, prefix='', delay=60, additional_operations=None,
             start_from=0, stop_at=10000):
    super(ReadSaveDictThread, self).__init__(daemon=True)
    self._fig = fig
    self._stop = threading.Event()
    self._plot_streams = as_list(plot_streams)
    self._prefix = prefix
    self._delay = delay
    self._additional_operations = additional_operations or []
    self._start_from = start_from
    self._stop_at = stop_at
    self.read_count = start_from
    self.exc = []
def continuous_plot(fig, plot_streams, prefix='', delay=120, additional_operations=None, start_from=0):
    plot_streams = as_list(plot_streams)
    additional_operations = additional_operations or []
    read_count = start_from
    while threading.current_thread().is_alive():
        for op in additional_operations:
            op.run()
        updates = read_stream(prefix=prefix, start=read_count)
        read_count += len(updates)
        print(read_count)
        for upd in updates:
            for pls in plot_streams:
                pls.process_save_dict(upd)
        for pls in plot_streams:
            pls.plot()
        fig.canvas.draw()
        time.sleep(delay)
def generate_setting_dict(local_variables, excluded=None):
    """
    Generates a dictionary of (name, values) of local variables (typically obtained by vars()) that
    can be saved at the beginning of the experiment. Furthermore, if an object obj in local_variables
    implements the function setting(), it saves the result of obj.setting() as value in the dictionary.

    :param local_variables:
    :param excluded: (optional, default []) variable or list of variables to be excluded.
    :return: A dictionary
    """
    excluded = as_list(excluded) or []
    setting_dict = {k: v.setting() if hasattr(v, 'setting') else v
                    for k, v in local_variables.items() if v not in excluded}

    import datetime
    setting_dict['datetime'] = str(datetime.datetime.now())

    return setting_dict
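# Usage sketch (not from the original source): collecting the experiment configuration right after
# the settings are defined. `vars()` called with no arguments returns the local variables of the
# current scope; non-serializable helpers can be passed via `excluded`. All names below are
# hypothetical.
def _example_generate_setting_dict():
    lr = 0.01
    hidden_units = [128, 64]
    dataset_name = 'iris'
    model = object()  # stand-in for a non-serializable object to leave out
    settings = generate_setting_dict(vars(), excluded=model)
    # settings == {'lr': 0.01, 'hidden_units': [128, 64], 'dataset_name': 'iris', 'datetime': '...'}
    return settings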
def run(self, T, train_feed_dict_supplier=None, val_feed_dict_suppliers=None,
        hyper_constraints_ops=None, _debug_no_hyper_update=False):  # TODO add session parameter
    """
    :param _debug_no_hyper_update:
    :param T: number of steps
    :param train_feed_dict_supplier:
    :param val_feed_dict_suppliers:
    :param hyper_constraints_ops: (list of) either callable (no parameters) or tensorflow ops
    :return:
    """
    # idea: if steps == T then do full reverse, or forward, otherwise do trho and rtho
    # after all the main difference is that if we go with the full version, after the gradient has been
    # computed, the method `initialize()` is called.

    self.hyper_gradients.run_all(T, train_feed_dict_supplier=train_feed_dict_supplier,
                                 val_feed_dict_suppliers=val_feed_dict_suppliers,
                                 hyper_batch_step=self.hyper_batch_step.eval())

    if not _debug_no_hyper_update:
        for hod in self.hyper_optimizers:
            tf.get_default_session().run(hod.assign_ops)
        if hyper_constraints_ops:
            for op in as_list(hyper_constraints_ops):
                op() if callable(op) else op.eval()

        self.hyper_batch_step.increase.eval()
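# Usage sketch (not from the original source): one hyper-iteration with the helper class above.
# `real_time_ho` is an already constructed instance, `tr_supplier`/`val_supplier` are callables
# returning feed dictionaries, `val_err` is the validation error tensor and `lambdas` the
# regularization hyperparameters. The `{validation_error: supplier}` form of
# `val_feed_dict_suppliers` is an assumption.
def _example_hyper_iteration(real_time_ho, tr_supplier, val_supplier, val_err, lambdas):
    real_time_ho.run(100,  # T: number of inner (parameter) optimization steps
                     train_feed_dict_supplier=tr_supplier,
                     val_feed_dict_suppliers={val_err: val_supplier},
                     hyper_constraints_ops=positivity(lambdas))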
def redivide_data(datasets, partition_proportions=None, shuffle=False, filters=None, maps=None,
                  balance_classes=False):
    """
    Function that redivides datasets. Can also be used to shuffle, filter or map examples.

    :param datasets: original datasets, instances of class Dataset (works with get_data and get_targets
                     for compatibility with mnist datasets)
    :param partition_proportions: (optional, default None) list of fractions that can either sum up to 1
                                  or less than one, in which case one additional partition is created with
                                  proportion 1 - sum(partition proportions).
                                  If None it will retain the same proportion of samples found in datasets.
    :param shuffle: (optional, default False) if True shuffles the examples
    :param filters: (optional, default None) filter or list of filters: functions with signature
                    (data, target, sample_info, index) -> boolean (accept or reject the sample)
    :param maps: (optional, default None) map or list of maps: functions with signature
                 (data, target, sample_info, index) -> (new_data, new_target, new_sample_info)
                 (maps the old sample to a new one, possibly also to more than one sample,
                 for data augmentation)
    :param balance_classes: (optional, default False) if True, builds each partition (except the last one)
                            with a balanced number of examples per class
    :return: a list of datasets of length equal to the (possibly augmented) partition_proportion
    """

    all_data = vstack([get_data(d) for d in datasets])
    all_labels = stack_or_concat([get_targets(d) for d in datasets])
    all_infos = np.concatenate([d.sample_info for d in datasets])

    N = all_data.shape[0]

    if partition_proportions:  # argument check
        partition_proportions = list([partition_proportions] if isinstance(partition_proportions, float)
                                     else partition_proportions)
        sum_proportions = sum(partition_proportions)
        assert sum_proportions <= 1, "partition proportions must sum up to at most one: %f" % sum_proportions
        if sum_proportions < 1.:
            partition_proportions += [1. - sum_proportions]
    else:
        partition_proportions = [1. * get_data(d).shape[0] / N for d in datasets]

    if shuffle:
        if sp and isinstance(all_data, sp.sparse.csr.csr_matrix):
            raise NotImplementedError()
        # if sk_shuffle:  # TODO this does not work!!! find a way to shuffle these matrices while
        #                 # keeping compatibility with tensorflow!
        #     all_data, all_labels, all_infos = sk_shuffle(all_data, all_labels, all_infos)
        # else:
        permutation = np.arange(all_data.shape[0])
        np.random.shuffle(permutation)

        all_data = all_data[permutation]
        all_labels = np.array(all_labels[permutation])
        all_infos = np.array(all_infos[permutation])

    if filters:
        if sp and isinstance(all_data, sp.sparse.csr.csr_matrix):
            raise NotImplementedError()
        filters = as_list(filters)
        data_triple = [(x, y, d) for x, y, d in zip(all_data, all_labels, all_infos)]
        for fiat in filters:
            data_triple = [xy for i, xy in enumerate(data_triple) if fiat(xy[0], xy[1], xy[2], i)]
        all_data = np.vstack([e[0] for e in data_triple])
        all_labels = np.vstack([e[1] for e in data_triple])
        all_infos = np.vstack([e[2] for e in data_triple])

    if maps:
        if sp and isinstance(all_data, sp.sparse.csr.csr_matrix):
            raise NotImplementedError()
        maps = as_list(maps)
        data_triple = [(x, y, d) for x, y, d in zip(all_data, all_labels, all_infos)]
        for _map in maps:
            data_triple = [_map(xy[0], xy[1], xy[2], i) for i, xy in enumerate(data_triple)]
        all_data = np.vstack([e[0] for e in data_triple])
        all_labels = np.vstack([e[1] for e in data_triple])
        all_infos = np.vstack([e[2] for e in data_triple])

    N = all_data.shape[0]
    assert N == all_labels.shape[0]

    calculated_partitions = reduce(
        lambda v1, v2: v1 + [sum(v1) + v2],
        [int(N * prp) for prp in partition_proportions],
        [0]
    )
    calculated_partitions[-1] = N

    print('datasets.redivide_data: computed partitions numbers -',
          calculated_partitions, 'len all', N, end=' ')

    new_general_info_dict = {}
    for data in datasets:
        new_general_info_dict = {**new_general_info_dict, **data.info}

    if balance_classes:
        new_datasets = []
        forbidden_indices = np.empty(0, dtype=np.int64)
        for d1, d2 in zip(calculated_partitions[:-1], calculated_partitions[1:-1]):
            indices = np.array(get_indices_balanced_classes(d2 - d1, all_labels, forbidden_indices))
            dataset = Dataset(data=all_data[indices], target=all_labels[indices],
                              sample_info=all_infos[indices], info=new_general_info_dict)
            new_datasets.append(dataset)
            forbidden_indices = np.append(forbidden_indices, indices)
            test_if_balanced(dataset)
        remaining_indices = np.array(list(set(list(range(N))) - set(forbidden_indices)))
        new_datasets.append(Dataset(data=all_data[remaining_indices], target=all_labels[remaining_indices],
                                    sample_info=all_infos[remaining_indices], info=new_general_info_dict))
    else:
        new_datasets = [
            Dataset(data=all_data[d1:d2], target=all_labels[d1:d2], sample_info=all_infos[d1:d2],
                    info=new_general_info_dict)
            for d1, d2 in zip(calculated_partitions, calculated_partitions[1:])
        ]

    print('DONE')

    return new_datasets
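# Usage sketch (not from the original source): splitting a single Dataset into 70% training,
# 20% validation and an implicit 10% test partition (the remainder), with shuffling.
# `full_dataset` is an assumed, already built Dataset instance.
def _example_redivide_data(full_dataset):
    train, validation, test = redivide_data([full_dataset],
                                            partition_proportions=[.7, .2],
                                            shuffle=True)
    return train, validation, test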
def __init__(self, optimizer, hyper_dict, global_step=None):
    """
    Creates a new object that computes the hyper-gradient of validation errors in forward mode.
    See section 3.2 of Forward and Reverse Gradient-Based Hyperparameter Optimization
    (https://arxiv.org/abs/1703.01785)
    Note that this class only computes the hyper-gradient and does not perform hyperparameter optimization.

    :param optimizer: instance of Optimizer class, which represents the dynamics with which the model
                      parameters are updated
    :param hyper_dict: A dictionary of `{validation_error: hyper_pairs_list}` where
                       `validation_error` is a scalar tensor and `hyper_pairs_list` is a single pair or a
                       list of pairs (hyperparameter, derivative_of_dynamics_w.r.t_hyperparameter)
                       (matrix B_t in the paper). Unfortunately tensorflow does not compute Jacobians
                       efficiently yet (suggestions or pointers are welcome)
    :param global_step: (optional) instance of `GlobalStep` to keep track of the optimization step
    """
    assert isinstance(optimizer, Optimizer)

    self.w = optimizer.raw_w  # might be variable or MergedVariable (never tested on Variables actually) ...
    self.w_t = MergedVariable.get_tensor(self.w)  # this is always a tensor

    self.tr_dynamics = optimizer.dynamics
    assert isinstance(hyper_dict, dict), '%s not allowed type. Should be a dict of (tf.Tensor,' \
                                         'list[(hyper-parameter, d_dynamics_d_hyper-parameter)]' % hyper_dict
    self.hyper_list = []  # more comfortable to use
    self.d_dynamics_d_hypers = []
    self.hyper_dict = {}  # standardizes hyper_dict parameter
    for k, v in hyper_dict.items():
        list_v = as_list(v)
        assert isinstance(list_v[0], tuple), "Something's wrong in hyper_dict %s, at least in entry %s. Check!" \
                                             % (hyper_dict, list_v[0])
        self.hyper_dict[k] = list_v  # be sure values are lists!
        self.hyper_list += [pair[0] for pair in list_v]
        self.d_dynamics_d_hypers += [pair[1] for pair in list_v]

    self.val_errors = []  # will follow the same order as hyper_list
    for hyp in self.hyper_list:  # find the right validation error for hyp!
        for k, v in hyper_dict.items():
            all_hypers = [pair[0] for pair in as_list(v)]
            if hyp in all_hypers:
                self.val_errors.append(k)
                break

    for i, der in enumerate(self.d_dynamics_d_hypers):  # this automatic casting at the moment works only for SGD
        if not isinstance(der, ZMergedMatrix):
            print('Try casting d_dynamics_d_hyper to ZMergedMatrix')
            self.d_dynamics_d_hypers[i] = ZMergedMatrix(der)
            print('Successful')

    with self.w_t.graph.as_default():
        # global step
        self.global_step = global_step or GlobalStep()

        self.fw_ops = self.w.assign(self.tr_dynamics)  # TODO add here when hypers are sequence

        with tf.name_scope('direct_HO'):
            '''
            Creates one z per hyper-parameter and assumes that each hyper-parameter is a vector
            '''
            self.zs = [self._create_z(hyp) for hyp in self.hyper_list]

            self.zs_dynamics = [optimizer.jac_z(z) + dd_dh
                                for z, dd_dh in zip(self.zs, self.d_dynamics_d_hypers)]

            print('z dynamics', self.zs_dynamics[0])
            print('z', self.zs[0])

            self.zs_assigns = [z.assign(z_dyn) for z, z_dyn in zip(self.zs, self.zs_dynamics)]

            self.grad_val_err = [tf.gradients(v_e, self.w_t)[0] for v_e in self.val_errors]
            assert all([g is not None for g in self.grad_val_err]), \
                'Some gradient of the validation error is None!'

            self.grad_wrt_hypers = [dot(gve, z.tensor) for z, gve in zip(self.zs, self.grad_val_err)]

            with tf.name_scope('hyper_gradients'):  # ADDED 28/3/17 keeps track of hyper-gradients as tf.Variable
                self.hyper_gradient_vars = [tf.Variable(tf.zeros_like(hyp), name=simple_name(hyp))
                                            for hyp in self.hyper_list]
                self.hyper_gradients_dict = {hyp: hgv for hyp, hgv  # redundant.. just for comfort ..
                                             in zip(self.hyper_list, self.hyper_gradient_vars)}
                self._hyper_assign_ops = [v.assign(ght)
                                          for v, ght in zip(self.hyper_gradient_vars, self.grad_wrt_hypers)]
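# Usage sketch (not from the original source): building the `hyper_dict` expected above for a single
# L2-regularization weight. `val_error` (scalar validation-error tensor), `gd` (an Optimizer instance)
# and `lam` (the hyperparameter variable) are assumed to exist; the class above is assumed to be
# `ForwardHyperGradient`.
def _example_forward_hyper_dict(val_error, gd, lam):
    d_dyn_d_lam = gd.auto_d_dynamics_d_hyper(lam)  # matrix B_t, or a hand-built ZMergedMatrix
    return ForwardHyperGradient(gd, {val_error: [(lam, d_dyn_d_lam)]})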
def __init__(self, optimizer, hyper_dict, state_history=None, global_step=None):
    """
    Creates a new object that computes the hyper-gradient of validation errors in reverse mode.
    See section 3.1 of Forward and Reverse Gradient-Based Hyperparameter Optimization
    (https://arxiv.org/abs/1703.01785)
    Note that this class only computes the hyper-gradient and does not perform hyperparameter optimization.

    :param optimizer: instance of Optimizer class, which contains the dynamics with which the model
                      parameters are updated
    :param hyper_dict: A dictionary of `{validation_error: hyperparameter or list_of_hyperparameters}`
                       where `validation_error` is a scalar tensor and `list_of_hyperparameters` is a list
                       of tensorflow variables that represents the hyperparameters
    :param state_history: (default: empty list) state history manager:
                          should implement methods `clear`, `append`, `__getitem__`
    :param global_step: optional instance of GlobalStep class
    """
    assert isinstance(optimizer, Optimizer)

    self.w = optimizer.raw_w  # might be variable or MergedVariable
    # TODO check if it works also with w as simple Variable
    self.w_t = MergedVariable.get_tensor(self.w)  # this is always a tensor

    self.tr_dynamics = optimizer.dynamics
    assert isinstance(hyper_dict, dict), '%s not allowed type. Should be a dict of ' \
                                         '(tf.Tensor, hyperparameters)' % hyper_dict
    self.val_error_dict = hyper_dict

    self.hyper_list = []
    for k, v in hyper_dict.items():
        self.hyper_list += as_list(v)
        self.val_error_dict[k] = as_list(v)  # be sure that are all lists

    self.w_hist = state_history or []

    with self.w_t.graph.as_default():
        # global step
        self.global_step = global_step or GlobalStep()

        self._fw_ops = optimizer.assign_ops  # TODO add here when hyper-parameters are sequence

        # backward assign ops
        with tf.name_scope('backward'):
            # equation (9)
            p_T = {ve: tf.gradients(ve, self.w_t)[0]
                   for ve, hyp_list in self.val_error_dict.items()}  # deltaE(s_t)

            self.p_dict = {ve: tf.Variable(pt, name='p') for ve, pt in p_T.items()}

            # for nullity check
            self._abs_sum_p = tf.reduce_sum(tf.stack([tf.reduce_sum(tf.abs(p), name='l1_p')
                                                      for p in self.p_dict.values()]))

            # build Lagrangian function
            with tf.name_scope('lagrangian'):
                self.lagrangians_dict = {ve: dot(p, self.tr_dynamics) for ve, p in self.p_dict.items()}

            # TODO read below
            '''
            In the following {if else} block there are two ways of computing the dynamics of the update
            of the Lagrangian multipliers. The procedures SHOULD produce the same result, however,
            for some strange reason, if w is indeed a state variable that contains auxiliary components
            (e.g. velocity in Momentum algorithm, ...) there is a difference in the two methods and
            the right one is the first one. This is possibly due to the order in which the derivatives
            are taken by tensorflow, but further investigation is necessary.
            '''
            # detects if some auxiliary variables are used.
            if isinstance(self.w, MergedVariable) and \
                    any([isinstance(v, MergedVariable) for v in self.w.var_list(Vl_Mode.RAW)]):
                state_components = self.w.var_list(Vl_Mode.TENSOR)

                # equation (8)
                self.p_dynamics = {ve: tf.concat(tf.gradients(lagrangian, state_components), 0)
                                   for ve, lagrangian in self.lagrangians_dict.items()}
            else:
                # equation (8)
                self.p_dynamics = {ve: tf.gradients(lagrangian, self.w_t)[0]
                                   for ve, lagrangian in self.lagrangians_dict.items()}

            # equation (7)
            self._bk_ops = [self.p_dict[ve].assign(self.p_dynamics[ve])
                            for ve in self.val_error_dict]  # TODO add here when hp are sequ.

        with tf.name_scope('w_history_ops'):
            self._w_placeholder = tf.placeholder(self.w_t.dtype)

            self._back_hist_op = self.w.assign(self._w_placeholder)

        with tf.name_scope('hyper_derivatives'):
            # equation (10) without summation.
            self.hyper_derivatives = [
                (self.val_error_dict[ve], tf.gradients(lagrangian, self.val_error_dict[ve]))
                for ve, lagrangian in self.lagrangians_dict.items()
            ]  # list of couples (hyper_list, list of symbolic hyper_gradients) (lists are unhashable!)

        with tf.name_scope('hyper_gradients'):  # ADDED 28/3/17 keeps track of hyper-gradients as tf.Variable
            self._grad_wrt_hypers_placeholder = tf.placeholder(tf.float32, name='placeholder')
            # TODO this placeholder is not really necessary... just added to minimize the changes needed
            # (merge with RICCARDO)

            self.hyper_gradient_vars = [tf.Variable(tf.zeros_like(hyp), name=simple_name(hyp))
                                        for hyp in self.hyper_list]
            self.hyper_gradients_dict = {hyp: hgv for hyp, hgv  # redundant.. just for comfort ..
                                         in zip(self.hyper_list, self.hyper_gradient_vars)}
            self._hyper_assign_ops = {h: v.assign(self._grad_wrt_hypers_placeholder)
                                      for h, v in self.hyper_gradients_dict.items()}
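# Usage sketch (not from the original source): in reverse mode the dictionary only maps each
# validation error to its hyperparameters; the derivatives of the dynamics come from the Lagrangian,
# so no (hyperparameter, B_t) pairs are needed. `ReverseHyperGradient` is the assumed name of the
# class above; `val_error`, `gd`, `lam` and `eta` are assumed to exist.
def _example_reverse_hyper_dict(val_error, gd, lam, eta):
    return ReverseHyperGradient(gd, {val_error: [lam, eta]})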
def redivide_data(datasets, partition_proportions=None, shuffle=False, filters=None, maps=None):
    """
    Function that redivides datasets. Can also be used to shuffle, filter or map examples.

    :param datasets: original datasets, instances of class Dataset (works with get_data and get_targets
                     for compatibility with mnist datasets)
    :param partition_proportions: (optional, default None) list of fractions that can either sum up to 1
                                  or less than one, in which case one additional partition is created with
                                  proportion 1 - sum(partition proportions).
                                  If None it will retain the same proportion of samples found in datasets.
    :param shuffle: (optional, default False) if True shuffles the examples
    :param filters: (optional, default None) filter or list of filters: functions with signature
                    (data, target, sample_info, index) -> boolean (accept or reject the sample)
    :param maps: (optional, default None) map or list of maps: functions with signature
                 (data, target, sample_info, index) -> (new_data, new_target, new_sample_info)
                 (maps the old sample to a new one, possibly also to more than one sample,
                 for data augmentation)
    :return: a list of datasets of length equal to the (possibly augmented) partition_proportion
    """

    all_data = np.vstack([get_data(d) for d in datasets])
    all_labels = np.vstack([get_targets(d) for d in datasets])
    all_infos = np.concatenate([d.sample_info_dicts for d in datasets])

    N = len(all_data)

    if partition_proportions:  # argument check
        partition_proportions = list([partition_proportions] if isinstance(partition_proportions, float)
                                     else partition_proportions)
        sum_proportions = sum(partition_proportions)
        assert sum_proportions <= 1, "partition proportions must sum up to at most one: %f" % sum_proportions
        if sum_proportions < 1.:
            partition_proportions += [1. - sum_proportions]
    else:
        partition_proportions = [1. * len(get_data(d)) / N for d in datasets]

    if shuffle:
        permutation = list(range(N))
        np.random.shuffle(permutation)

        all_data = np.array(all_data[permutation])
        all_labels = np.array(all_labels[permutation])
        all_infos = np.array(all_infos[permutation])

    if filters:
        filters = as_list(filters)
        data_triple = [(x, y, d) for x, y, d in zip(all_data, all_labels, all_infos)]
        for fiat in filters:
            data_triple = [xy for i, xy in enumerate(data_triple) if fiat(xy[0], xy[1], xy[2], i)]
        all_data = np.vstack([e[0] for e in data_triple])
        all_labels = np.vstack([e[1] for e in data_triple])
        all_infos = np.vstack([e[2] for e in data_triple])

    if maps:
        maps = as_list(maps)
        data_triple = [(x, y, d) for x, y, d in zip(all_data, all_labels, all_infos)]
        for _map in maps:
            data_triple = [_map(xy[0], xy[1], xy[2], i) for i, xy in enumerate(data_triple)]
        all_data = np.vstack([e[0] for e in data_triple])
        all_labels = np.vstack([e[1] for e in data_triple])
        all_infos = np.vstack([e[2] for e in data_triple])

    N = len(all_data)
    assert N == len(all_labels)

    calculated_partitions = reduce(
        lambda v1, v2: v1 + [sum(v1) + v2],
        [int(N * prp) for prp in partition_proportions],
        [0]
    )
    calculated_partitions[-1] = N

    print('datasets.redivide_data: computed partitions numbers -',
          calculated_partitions, 'len all', len(all_data), end=' ')

    new_general_info_dict = {}
    for data in datasets:
        new_general_info_dict = {**new_general_info_dict, **data.general_info_dict}

    new_datasets = [
        Dataset(data=all_data[d1:d2], target=all_labels[d1:d2], sample_info_dicts=all_infos[d1:d2],
                general_info_dict=new_general_info_dict)
        for d1, d2 in zip(calculated_partitions, calculated_partitions[1:])
    ]

    print('DONE')

    return new_datasets
def __init__(self, optimizer, hyper_dict, global_step=None, devices=None):
    """
    Creates a new object that computes the hyper-gradient of validation errors in forward mode.
    See section 3.2 of Forward and Reverse Gradient-Based Hyperparameter Optimization
    (https://arxiv.org/abs/1703.01785)
    Note that this class only computes the hyper-gradient and does not perform hyperparameter optimization.

    :param optimizer: instance of Optimizer class, which represents the dynamics with which the model
                      parameters are updated
    :param hyper_dict: A dictionary of `{validation_error: hyper_pairs_list}` where
                       `validation_error` is a scalar tensor and `hyper_pairs_list` is a single pair or a
                       list of pairs (hyperparameter, derivative_of_dynamics_w.r.t_hyperparameter)
                       (matrix B_t in the paper). Unfortunately tensorflow does not compute Jacobians
                       efficiently yet (suggestions or pointers are welcome)
    :param global_step: (optional) instance of `GlobalStep` to keep track of the optimization step
    :param devices: (optional) single device or list of devices on which the z variables and their
                    dynamics are placed, in a round-robin fashion
    """
    assert isinstance(optimizer, Optimizer)

    self.w = optimizer.raw_w  # might be variable or MergedVariable (never tested on Variables actually) ...
    self.w_t = self.w  # MergedVariable.get_tensor(self.w)  # this is always a tensor

    self.tr_dynamics = optimizer.dynamics

    assert isinstance(hyper_dict, dict), '%s not allowed type. Should be a dict of (tf.Tensor,' \
                                         'list[(hyper-parameter, d_dynamics_d_hyper-parameter)]' % hyper_dict

    self.hyper_list = []  # more comfortable to use
    self.d_dynamics_d_hypers = []
    self.hyper_dict = {}  # standardizes hyper_dict parameter
    self._inverse_hyper_dict = {}  # hyperparameter-validation error pairs
    for k, v in hyper_dict.items():
        list_v = as_list(v)
        # assert isinstance(list_v[0], tuple), "Something's wrong in hyper_dict %s, at least in entry %s. Check!" \
        #                                      % (hyper_dict, list_v[0])
        self.hyper_dict[k] = list_v  # be sure values are lists!
        self._inverse_hyper_dict = {**self._inverse_hyper_dict, **{hyp: k for hyp in list_v}}
        self.hyper_list += [pair[0] if isinstance(pair, (tuple, list)) else pair for pair in list_v]
        self.d_dynamics_d_hypers += [pair[1] if isinstance(pair, (tuple, list))
                                     else optimizer.auto_d_dynamics_d_hyper(pair)  # try to compute it automatically
                                     for pair in list_v]

    self.val_errors = []  # will follow the same order as hyper_list
    for hyp in self.hyper_list:  # find the right validation error for hyp!
        for k, v in hyper_dict.items():
            all_hypers = [pair[0] if isinstance(pair, (list, tuple)) else pair for pair in as_list(v)]
            if hyp in all_hypers:
                self.val_errors.append(k)
                break

    for i, der in enumerate(self.d_dynamics_d_hypers):  # this automatic casting at the moment works only for SGD
        if not isinstance(der, ZMergedMatrix):
            print('Try casting d_dynamics_d_hyper to ZMergedMatrix')
            self.d_dynamics_d_hypers[i] = ZMergedMatrix(der)
            print('Successful')

    devices = as_list(devices)  # at most will be [None]

    with self.w_t.graph.as_default():
        # global step
        self.global_step = global_step or GlobalStep()

        self.fw_ops = optimizer.assign_ops  # add here when hypers are sequence (...)

        with tf.name_scope('ForwardHG'):
            '''
            Creates one z per hyper-parameter and assumes that each hyper-parameter is a vector
            '''
            self.grad_wrt_hypers, self.zs, self.zs_dynamics, self._zs_assigns = [], [], [], []
            self.hyper_gradient_vars, self._hyper_assign_ops = [], []

            self.grad_val_err = {
                ve: tf.identity(tf.gradients(ve, self.w_t)[0], name='grad_val_err_%s' % simple_name(ve.name))
                for ve in self.hyper_dict.keys()
            }
            self._gve_inv_dict = {hyp: self.grad_val_err[ve] for hyp, ve in self._inverse_hyper_dict.items()}

            for k, hyp in enumerate(self.hyper_list):
                with tf.device(devices[k % len(devices)]):
                    self.zs.append(self._create_z(hyp))

                    with tf.name_scope('Z_dynamics'):
                        self.zs_dynamics.append(optimizer.jac_z(self.zs[k]) + self.d_dynamics_d_hypers[k])
                        self._zs_assigns.append(self.zs[k].assign(self.zs_dynamics[k]))

                    self.grad_wrt_hypers.append(dot(self._gve_inv_dict[hyp], self.zs[k], name='hyper_grad_wrt_h'))

                    with tf.name_scope('hyper_gradients'):
                        self.hyper_gradient_vars.append(tf.Variable(tf.zeros_like(hyp), name=simple_name(hyp)))
                        self._hyper_assign_ops.append(self.hyper_gradient_vars[k].assign(self.grad_wrt_hypers[k]))

            # final operations
            self.hyper_gradients_dict = {hyp: hgv for hyp, hgv  # redundant.. just for comfort ..
                                         in zip(self.hyper_list, self.hyper_gradient_vars)}

            # hyper-gradient check
            assert all([g is not None for g in self.grad_val_err.values()]), \
                'Some gradient of the validation error is None!'