def init_from_orbit(self, period, lighttime, tref=0.0, eccen=1e-5, varpi=0.0):
    r"""Initialize the parameters based on an orbit estimate

    Args:
        period: The orbital period in units of ``time``.
        lighttime: The projected light travel time in units of ``time``
            (:math:`a_1\,\sin(i)/c`).
        tref: The reference time in units of ``time``.
        eccen: The orbital eccentricity.
        varpi: The angle of the ascending node in radians.

    """
    ops = []
    ops.append(tf.assign(self.period, period))
    ops.append(
        tf.assign(self.lighttime,
                  lighttime + tf.zeros_like(self.lighttime)))
    ops.append(tf.assign(self.tref, tref))
    if self.with_eccen:
        ops.append(
            tf.assign(self.eccen_param,
                      np.log(eccen) - np.log(1.0 - eccen)))
        ops.append(tf.assign(self.varpi, varpi))
    self.run(ops)
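A minimal usage sketch for the initializer above, assuming a hypothetical `model` instance of the class that defines it; all numeric values are illustrative, not taken from the source.

# Hedged usage sketch: `model` is a hypothetical instance of the class above,
# and the numbers are illustrative only (period and lighttime in the model's
# ``time`` units, varpi in radians).
model.init_from_orbit(
    period=100.0,
    lighttime=50.0 / 86400.0,   # e.g. 50 seconds expressed in days
    tref=0.0,
    eccen=0.1,                  # only used when self.with_eccen is True
    varpi=0.0)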
def _init_references(self):
    # print ('DeepLIFT: computing references...')
    sys.stdout.flush()
    self._deeplift_ref.clear()
    ops = []
    # Original: full set of operations in the graph
    if self.init_ref == 'default':
        g = tf.get_default_graph()
        ops_check = g.get_operations()
    elif self.init_ref == 'custom':
        # Custom: limit operations to those downstream of the input
        print('Custom: limit operations to those downstream of the input')
        ops_check = [descendants(op) for op in self.X.consumers()]
        ops_check = list(set(chain.from_iterable(ops_check)))
    for op in ops_check:
        if len(op.inputs) > 0 and not op.name.startswith('gradients'):
            if op.type in SUPPORTED_ACTIVATIONS:
                ops.append(op)
    YR = self._session_run([o.inputs[0] for o in ops], self.baseline)
    for (r, op) in zip(YR, ops):
        self._deeplift_ref[op.name] = r
    # print('DeepLIFT: references ready')
    sys.stdout.flush()
def states2states(self, states, to_states):
    ops = []
    for i in xrange(len(states)):
        copy_c = to_states[i].c.assign(states[i].c)
        copy_h = to_states[i].h.assign(states[i].h)
        ops.append(copy_c)
        ops.append(copy_h)
    return ops
def tranform2transform(self, top_status, top_status_transform, to_statuses):
    ops = []
    copy_s = to_statuses[0].assign(top_status)
    copy_st = to_statuses[1].assign(top_status_transform)
    ops.append(copy_s)
    ops.append(copy_st)
    return ops
def _setup_graph(self):
    vars = tf.trainable_variables()
    ops = []
    for v in vars:
        n = v.op.name
        if not n.startswith('discrim/'):
            continue
        logger.info("Clip {}".format(n))
        ops.append(tf.assign(v, tf.clip_by_value(v, -0.01, 0.01)))
    self._op = tf.group(*ops, name='clip')
def output_after2before(self, beam_parent):
    # beam_parent : [beam_size]
    ops = []
    for i in xrange(len(self.after_output)):
        o = self.after_output
        new_o = tf.nn.embedding_lookup(o, beam_parent)
        copy_o = self.before_state[i].c.assign(new_o)
        ops.append(copy_o)
    return ops
def setup_as_moving_average_of(self, src_net, beta=0.99, beta_nontrainable=0.0):
    assert isinstance(src_net, Network)
    with absolute_name_scope(self.scope):
        with tf.name_scope('MovingAvg'):
            ops = []
            for name, var in self.vars.items():
                if name in src_net.vars:
                    cur_beta = beta if name in self.trainables else beta_nontrainable
                    new_value = lerp(src_net.vars[name], var, cur_beta)
                    ops.append(var.assign(new_value))
            return tf.group(*ops)
def states2states_shuffle(self, states, to_states, beam_parent):
    ops = []
    for i in xrange(len(states)):
        copy_c = self.state2state_shuffle(to_states[i].c, states[i].c, beam_parent)
        copy_h = self.state2state_shuffle(to_states[i].h, states[i].h, beam_parent)
        ops.append(copy_c)
        ops.append(copy_h)
    return ops
def relaxed_distance(rx_step):
    """Distance between relaxed variables and their average."""
    res, ops, rx_done = [], [], {}
    for v in tf.trainable_variables():
        if v.name[0:2] == "RX":
            rx_name = v.op.name[v.name.find("/") + 1:]
            if rx_name not in rx_done:
                avg, dist_loss = relaxed_average(rx_name, rx_step)
                res.append(dist_loss)
                rx_done[rx_name] = avg
            ops.append(v.assign(rx_done[rx_name]))
    return tf.add_n(res), tf.group(*ops)
def get_parents(grad, op_type):
    ops = list()
    wave = set([grad.op])
    while wave:
        new_wave = set()
        for op in wave:
            for op in (t.op for t in op.inputs):
                if op.type == op_type:
                    ops.append(op)
                else:
                    new_wave.add(op)
        wave = new_wave
    return ops
def set_vars(var_to_value_dict):
    ops = []
    feed_dict = {}
    for var, value in var_to_value_dict.items():
        assert is_tf_expression(var)
        try:
            setter = tf.get_default_graph().get_tensor_by_name(
                var.name.replace(':0', '/setter:0'))  # look for existing op
        except KeyError:
            with absolute_name_scope(var.name.split(':')[0]):
                with tf.control_dependencies(None):  # ignore surrounding control_dependencies
                    setter = tf.assign(var, tf.placeholder(var.dtype, var.shape, 'new_value'),
                                       name='setter')  # create new setter
        ops.append(setter)
        feed_dict[setter.op.inputs[1]] = value
    run(ops, feed_dict)
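A hedged usage sketch for `set_vars`, assuming the surrounding utility module provides `is_tf_expression`, `absolute_name_scope`, and `run` as used above; the variables named here are hypothetical.

# Hypothetical example: overwrite two existing tf.Variable objects in one call.
# `lr_var` and `beta_var` are assumed to already exist in the default graph.
set_vars({
    lr_var: 1e-4,    # new scalar learning rate
    beta_var: 0.5,   # new scalar decay coefficient
})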
def after2before(self, beam_parent):
    # beam_parent : [beam_size]
    ops = []
    for i in xrange(len(self.after_state)):
        c = self.after_state[i].c
        h = self.after_state[i].h
        new_c = tf.nn.embedding_lookup(c, beam_parent)
        new_h = tf.nn.embedding_lookup(h, beam_parent)
        copy_c = self.before_state[i].c.assign(new_c)
        copy_h = self.before_state[i].h.assign(new_h)
        ops.append(copy_c)
        ops.append(copy_h)
    return ops
def check(self, *args, **keys):
    if not self.frozen:
        heartbeat()
    ops = []
    seen = set()
    for name, action in self.commands:
        full = self.full_path(name)
        if not os.path.isdir(full):
            if name not in seen:
                seen.add(name)
                ops.append(name)
    for op in ops:
        self.run(op, *args, **keys)
    return ops
def restore(self, sess, path, index):
    ops = []
    feed = {}
    for k, net in self._nets.items():
        filename = os.path.join(path, "{}.l2l-{}".format(k, index))
        data = pickle.load(open(filename, "rb"))
        vars = snt.get_variables_in_module(net)
        for v in vars:
            split = v.name.split(":")[0].split("/")
            module_name = split[-2]
            variable_name = split[-1]
            feed[self.restore_pl[k][module_name][variable_name]] = \
                data[module_name][variable_name]
            ops.append(self.assigns[k][module_name][variable_name])
    sess.run(ops, feed_dict=feed)
def sync_variables_op(mpi_rank, num_comms=2, prereduce=0):
    ops = list()
    prev = []
    with tf.device("/gpu:0"):
        for var in tf.trainable_variables():
            with tf.control_dependencies(prev):
                op = tf.assign(var, allreduce(var if mpi_rank == 0 else var * 0.0,
                                              num_comms=num_comms, prereduce=prereduce))
            prev = [op]
            ops.append(op)
    return tf.group(*ops)
def _init_references(self):
    # print ('DeepLIFT: computing references...')
    sys.stdout.flush()
    self._deeplift_ref.clear()
    ops = []
    g = tf.get_default_graph()
    for op in g.get_operations():
        if len(op.inputs) > 0 and not op.name.startswith('gradients'):
            if op.type in SUPPORTED_ACTIVATIONS:
                ops.append(op)
    YR = self.session_run([o.inputs[0] for o in ops], self.baseline)
    for (r, op) in zip(YR, ops):
        self._deeplift_ref[op.name] = r
    # print('DeepLIFT: references ready')
    sys.stdout.flush()
def ft_optimizer_list(cost, opt_vars, optimizer, lrs, grad_clip=False):
    """Efficient optimization for fine tuning a net."""
    ops = []
    gvs = []
    for v, l in zip(opt_vars, lrs):
        if grad_clip:
            optim = optimizer(l)
            gvs = optim.compute_gradients(cost, var_list=v)
            capped_gvs = [(tf.clip_by_norm(grad, 10.), var)
                          if grad is not None else (grad, var)
                          for grad, var in gvs]
            ops.append(optim.apply_gradients(capped_gvs))
        else:
            ops.append(optimizer(l).minimize(cost, var_list=v))
    return tf.group(*ops), gvs
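A hedged usage sketch for `ft_optimizer_list`, assuming a scalar `cost` tensor and two variable groups collected by scope; the scope names and learning rates are illustrative only.

# Illustrative only: fine-tune a pretrained backbone slowly and a new head quickly.
backbone_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='backbone')
head_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='head')
train_op, gvs = ft_optimizer_list(
    cost,                                # assumed scalar loss tensor
    opt_vars=[backbone_vars, head_vars],
    optimizer=tf.train.AdamOptimizer,
    lrs=[1e-5, 1e-3],
    grad_clip=True)                      # clip each gradient to norm 10, as above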
def _init_references(self):
    # print ('DeepLIFT: computing references...')
    sys.stdout.flush()
    self._deeplift_ref.clear()
    ops = []
    g = tf.get_default_graph()
    for op in g.get_operations():
        if len(op.inputs) > 0 and not op.name.startswith('gradients'):
            if op.type in SUPPORTED_ACTIVATIONS:
                ops.append(op)
    for op in ops:
        r = self.session_run(op.inputs[0], self.train_x[0:0 + 1, :])
        for i in range(1, self.train_x.shape[0]):
            r += self.session_run(op.inputs[0], self.train_x[i:i + 1, :])
        self._deeplift_ref[op.name] = r / self.train_x.shape[0]
    # print('DeepLIFT: references ready')
    sys.stdout.flush()
def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    ops = []
    if not self.stage_weights:
        return tf.train.MomentumOptimizer(
            self.lr, momentum=self.momentum).apply_gradients(
                grads_and_vars, name=name)
    for stage, weights in self.stage_weights.items():
        lr_decay = self.stage_lr_decay[stage]
        mom_decay = self.stage_mom_decay[stage]
        lr = self.lr * lr_decay
        mom = self.momentum * mom_decay
        grads_and_vars_opt = [(g, v) for g, v in grads_and_vars if v.name in weights]
        ops.append(
            tf.train.MomentumOptimizer(lr, momentum=mom).apply_gradients(
                grads_and_vars_opt, name=name))
    return tf.group(*ops)
def cast_variables(variables, graph=None, cache_ops=None):
    if graph is None:
        graph = get_default_graph()
    if cache_ops is None:
        cache_ops = state.cache_ops
    if graph not in cache_ops:
        cache_ops[graph] = {}
    cache = cache_ops[graph]
    ops = []
    for variable in variables:
        if variable in cache:
            op = cache[variable]
        elif variable.dtype == dtypes.bfloat16_ref or variable.dtype == tf.bfloat16:
            op = tf.cast(variable, tf.float32)
        else:
            op = variable
        cache[variable] = op
        ops.append(op)
    return ops
def get_parents(grad, op_type):
    if grad.op.type == op_type:
        return [grad.op]
    ops = list()
    wave = set([grad.op])
    while wave:
        new_wave = set()
        for op in wave:
            # print(op.name)
            # for i in op.inputs:
            #     print("    ", i.name)
            # print()
            for op in (t.op for t in op.inputs):
                if op.type == op_type:
                    ops.append(op)
                else:
                    new_wave.add(op)
        wave = new_wave
    return ops
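A small sketch of how `get_parents` might be called, assuming `loss` and `weights` already exist in the graph; the op type string is just an example.

# Illustrative only: collect the MatMul ops reachable backwards from a gradient.
grad = tf.gradients(loss, [weights])[0]        # `loss` and `weights` are assumed to exist
matmul_parents = get_parents(grad, "MatMul")   # the walk stops at each matching op
for parent in matmul_parents:
    print(parent.name)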
def _init_references(self):
    sys.stdout.flush()
    self._deeplift_ref.clear()
    ops = []
    g = self.session.graph
    # get subgraph starting from the target node down
    subgraph = tf.graph_util.extract_sub_graph(g.as_graph_def(),
                                               [self.T.name.split(':')[0]])
    for n in subgraph.node:
        op = g.get_operation_by_name(n.name)
        if len(op.inputs) > 0 and not op.name.startswith('gradients'):
            if op.type in SUPPORTED_ACTIVATIONS:
                ops.append(op)
                print(op.name)
    ins = [o.inputs[0] for o in ops]
    print('ins', ins)
    YR = self.session_run(ins, self.baseline)
    for (r, op) in zip(YR, ops):
        self._deeplift_ref[op.name] = r
    sys.stdout.flush()
def sync_globals_zero_init_op(num_comms=2, prereduce=0):
    ops = list()
    prev = []
    with tf.device("/gpu:0"):
        for var in tf.global_variables():
            if var.dtype.base_dtype not in [tf.float32, tf.float16]:
                cast_back = True
                to_reduce = tf.cast(var, tf.float32)
            else:
                to_reduce = var
                cast_back = False
            with tf.control_dependencies(prev):
                reduced = allreduce(to_reduce, num_comms=num_comms, prereduce=prereduce)
            if cast_back:
                reduced = tf.cast(reduced, var.dtype.base_dtype)
            op = tf.assign(var, reduced)
            prev = [op]
            ops.append(op)
    return tf.group(*ops)
def _apply_dense(self, grad, var):
    var_dtype = var.dtype.base_dtype
    lr = math_ops.cast(self._lr_t, var_dtype)
    beta = self._beta
    epsilon = self._epsilon
    t = math_ops.cast(self.iterations + 1, var_dtype)
    ops = []

    # Update running sum
    s = self.get_slot(var, 'sum')
    grad_sq = math_ops.square(grad)
    s_new = s + grad_sq
    ops.append(state_ops.assign(s, s_new, use_locking=self._use_locking))

    # Update running counter
    if self._sparse_counter:
        n = self.get_slot(var, 'counter')
        n_new = n + math_ops.sign(grad_sq)
        ops.append(state_ops.assign(n, n_new, use_locking=self._use_locking))
    else:
        # Counter is not sparse; just use the current timestep instead
        n_new = t

    # Compute step size
    average = math_ops.div_no_nan(s_new, n_new)
    step = grad / (epsilon + math_ops.sqrt(average))

    # Update momentum
    if self._use_momentum:
        m = self.get_slot(var, 'momentum')
        m_new = beta * m + (1.0 - beta) * step
        ops.append(state_ops.assign(m, m_new, use_locking=self._use_locking))
        # Bias correction
        lr = lr / (1.0 - pow(beta, t))
    else:
        # No momentum; just use the current step instead
        m_new = step

    # Update parameters
    ops.append(state_ops.assign_sub(var, lr * m_new, use_locking=self._use_locking))
    return control_flow_ops.group(*ops)
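For reference, the dense update above appears to implement the rule below (a reading of the code, not an official statement of the optimizer; here \(\eta\) is the learning rate and \(\beta\) the momentum decay, and without momentum \(m_t = \Delta_t\) with no bias correction):

\[
s_t = s_{t-1} + g_t^2, \qquad
n_t = \begin{cases} n_{t-1} + \operatorname{sign}(g_t^2) & \text{sparse counter} \\ t & \text{otherwise,} \end{cases}
\]
\[
\Delta_t = \frac{g_t}{\epsilon + \sqrt{s_t / n_t}}, \qquad
m_t = \beta\, m_{t-1} + (1 - \beta)\, \Delta_t, \qquad
\theta_t \leftarrow \theta_{t-1} - \frac{\eta}{1 - \beta^{\,t}}\, m_t.
\]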
def hatt_after2before(self, beam_parent):
    ops = []
    new_h_att = tf.nn.embedding_lookup(self.after_h_att, beam_parent)
    copy_op = self.before_h_att.assign(new_h_att)
    ops.append(copy_op)
    return ops
def group_lstm_grads(grads, params, scope="grouped_lstm", group_size=None):

    grad = None
    grad_idx = None
    for i, (g, p) in enumerate(zip(grads, params)):
        if scope in p.name and "kernel" in p.name:
            grad = g
            grad_idx = i
            break
    assert grad is not None

    # backward walk param grad to find dw MatMul ops
    # walk should terminate with each MatMul op
    ops = list()
    wave = set([grad.op])
    while wave:
        new_wave = set()
        for op in wave:
            for op in (t.op for t in op.inputs):
                # TN MatMul ops
                if op.type == "MatMul" and op.get_attr("transpose_a") and not op.get_attr("transpose_b"):
                    ops.append(op)
                else:
                    new_wave.add(op)
        wave = new_wave

    # sort op names descending and split out the lstms (if weights are shared)
    last_lstm = None
    lstms = list()
    ops.sort(key=lambda op: op.name, reverse=True)
    for op in ops:
        # gradients/grouped_lstm/lstm_2/step_00_grad/MatMul_1 => lstm_2
        lstm = op.name.split("/")[-3]
        if last_lstm != lstm:
            lstms.insert(0, list())
            last_lstm = lstm
        lstms[0].append(op)

    # we're going to be using absolute names, so clear name_scope
    with tf.name_scope(None):
        lstm_grads = list()
        for lstm_ops in lstms:
            # default dw op to one big matmul per lstm
            if group_size is None:
                group_size = len(lstm_ops)

            # use the lstm scope for the new ops
            # gradients/grouped_lstm/lstm_2/step_00_grad/MatMul_1 => gradients/grouped_lstm/lstm_2
            scope = lstm_ops[-1].name.split('/')
            scope = '/'.join(scope[0:-2])

            offset = 0
            while offset < len(lstm_ops):
                xs = tf.concat([op.inputs[0] for op in lstm_ops[offset:offset + group_size]], axis=0)
                gs = tf.concat([op.inputs[1] for op in lstm_ops[offset:offset + group_size]], axis=0)

                mmop = tf.matmul(xs, gs, transpose_a=True, transpose_b=False,
                                 name="%s/dw_%04d" % (scope, offset))

                grad = mmop if offset == 0 else ew.add(grad, mmop,
                                                       name="%s/add_%04d" % (scope, offset))
                offset += group_size

            lstm_grads.append(grad)

        if len(lstms) > 1:
            from blocksparse.ewops import add_n
            # gradients/grouped_lstm/lstm_2/step_00_grad/MatMul_1 => gradients/grouped_lstm
            scope = lstms[0][-1].name.split('/')
            scope = '/'.join(scope[0:-3])
            grads[grad_idx] = tf.add_n(lstm_grads, name="%s/add_n" % scope)
        else:
            grads[grad_idx] = lstm_grads[0]

    # grads modified in place

    # lstm_scopes = dict()
    # # ridiculous amount of code just to be able to re-enter a variable scope without its name being re-numbered.
    # # https://github.com/tensorflow/tensorflow/pull/14390
    # global lstm_scopes
    # if scope not in lstm_scopes:
    #     with tf.variable_scope(scope) as lstm_scope:
    #         lstm_scopes[scope] = lstm_scope
    # lstm_scope = lstm_scopes[scope]
    # with tf.variable_scope(lstm_scope, auxiliary_name_scope=False), tf.name_scope(lstm_scope.original_name_scope):
    #     with tf.variable_scope(weights_scope, reuse=weights_reuse):
    #         w = tf.get_variable('kernel', shape=[in_width + width, 4 * width])
    #         if bias_scope is None:
    #             b = tf.get_variable('bias', shape=[4 * width])
    #             if layernorm:
    #                 g = tf.get_variable('gain', shape=[4 * width])
    #     if bias_scope is not None:
    #         with tf.variable_scope(bias_scope, reuse=bias_reuse):
    #             b = tf.get_variable('bias', shape=[4 * width])
    #             if layernorm:
    #                 g = tf.get_variable('gain', shape=[4 * width])
def output2output(self, output, to_output):
    ops = []
    copy_o = to_output.assign(output)
    ops.append(copy_o)
    return ops
def apply_updates(self):
    assert not self._updates_applied
    self._updates_applied = True
    devices = list(self._dev_grads.keys())
    total_grads = sum(len(grads) for grads in self._dev_grads.values())
    assert len(devices) >= 1 and total_grads >= 1
    ops = []
    with absolute_name_scope(self.scope):

        # Cast gradients to FP32 and calculate partial sum within each device.
        dev_grads = OrderedDict()  # device => [(grad, var), ...]
        for dev_idx, dev in enumerate(devices):
            with tf.name_scope('ProcessGrads%d' % dev_idx), tf.device(dev):
                sums = []
                for gv in zip(*self._dev_grads[dev]):
                    assert all(v is gv[0][1] for g, v in gv)
                    g = [tf.cast(g, tf.float32) for g, v in gv]
                    g = g[0] if len(g) == 1 else tf.add_n(g)
                    sums.append((g, gv[0][1]))
                dev_grads[dev] = sums

        # Sum gradients across devices.
        if len(devices) > 1:
            with tf.name_scope('SumAcrossGPUs'), tf.device(None):
                for var_idx, grad_shape in enumerate(self._grad_shapes):
                    g = [dev_grads[dev][var_idx][0] for dev in devices]
                    if np.prod(grad_shape):  # nccl does not support zero-sized tensors
                        g = tf.contrib.nccl.all_sum(g)
                    for dev, gg in zip(devices, g):
                        dev_grads[dev][var_idx] = (gg, dev_grads[dev][var_idx][1])

        # Apply updates separately on each device.
        for dev_idx, (dev, grads) in enumerate(dev_grads.items()):
            with tf.name_scope('ApplyGrads%d' % dev_idx), tf.device(dev):

                # Scale gradients as needed.
                if self.use_loss_scaling or total_grads > 1:
                    with tf.name_scope('Scale'):
                        coef = tf.constant(np.float32(1.0 / total_grads), name='coef')
                        coef = self.undo_loss_scaling(coef)
                        grads = [(g * coef, v) for g, v in grads]

                # Check for overflows.
                with tf.name_scope('CheckOverflow'):
                    grad_ok = tf.reduce_all(
                        tf.stack([tf.reduce_all(tf.is_finite(g)) for g, v in grads]))

                # Update weights and adjust loss scaling.
                with tf.name_scope('UpdateWeights'):
                    opt = self._dev_opt[dev]
                    ls_var = self.get_loss_scaling_var(dev)
                    if not self.use_loss_scaling:
                        ops.append(
                            tf.cond(grad_ok, lambda: opt.apply_gradients(grads), tf.no_op))
                    else:
                        ops.append(
                            tf.cond(grad_ok,
                                    lambda: tf.group(
                                        tf.assign_add(ls_var, self.loss_scaling_inc),
                                        opt.apply_gradients(grads)),
                                    lambda: tf.group(
                                        tf.assign_sub(ls_var, self.loss_scaling_dec))))

                # Report statistics on the last device.
                if dev == devices[-1]:
                    with tf.name_scope('Statistics'):
                        ops.append(autosummary(self.id + '/learning_rate', self.learning_rate))
                        ops.append(autosummary(self.id + '/overflow_frequency',
                                               tf.where(grad_ok, 0, 1)))
                        if self.use_loss_scaling:
                            ops.append(autosummary(self.id + '/loss_scaling_log2', ls_var))

        # Initialize variables and group everything into a single op.
        self.reset_optimizer_state()
        init_uninited_vars(list(self._dev_ls_var.values()))
        return tf.group(*ops, name='TrainingOp')
def assign_vars(distribution, variables, values):
    ops = []
    for variable, value in zip(variables, values):
        ops.append(variable.assign(value))
    return ops
context = create_test_xla_compile_context()
context.Enter()
o = a.assign(2)
context.Exit()
return o

op = lambda x: tpu_ops.tpu_ops.collective_permute(
    x, [[0, 1], [1, 0], [2, 3], [3, 2], [4, 5], [5, 4], [6, 7], [7, 6]])
zz = tpu_ops.shard(op, outputs_from_all_shards=True, num_shards=8,
                   inputs=[[tf.constant([x + 1], dtype=tf.float32) for x in range(8)]])
sess.run(zz)

ops = []
for core in range(8):
    for step in range(8):
        with tf.control_dependencies(ops):
            ops.append(tpu_ops.tpu_ops.infeed_enqueue([tf.constant(step, tf.float32)],
                                                      shape=[1], device_ordinal=core))

topology = tpu_topology
topology_rank = len(topology.mesh_shape)
mesh_shape = topology.mesh_shape
computation_shape = None
computation_stride = None
num_replicas = 3

if computation_shape is None:
    computation_shape = np.array([1] * topology_rank, dtype=np.int32)
else: