def initial_state(self,
                  inputs,
                  time_major,
                  batch_size=None,
                  dtype=tf.float32,
                  trainable=False,
                  trainable_initializers=None,
                  trainable_regularizers=None,
                  state_initializer=tf.zeros_initializer()):
    """Build the initial state tensor for this cell.

    The state shape is
    ``[batch_size] + self._spatial_size + [2 * self._num_channels]``.

    Args:
        inputs: Forwarded to ``get_batch_size`` when the batch size has to be
            inferred.
        time_major: Forwarded to ``get_batch_size`` together with ``inputs``.
        batch_size: Optional batch size. ``None`` (or a Python ``0``, kept for
            backward compatibility) means "infer it from ``inputs``".
        dtype: dtype handed to ``state_initializer``.
        trainable: Accepted for interface compatibility; not used here.
        trainable_initializers: Accepted for interface compatibility; not
            used here.
        trainable_regularizers: Accepted for interface compatibility; not
            used here.
        state_initializer: Callable invoked as
            ``state_initializer(shape, dtype=dtype)``.

    Returns:
        Whatever ``state_initializer`` returns for the computed shape.
    """
    # The previous `if not batch_size` truth-tested the argument, which fails
    # when the caller supplies a tf.Tensor (graph-mode TF refuses to coerce a
    # tensor to a Python bool). Only plain Python values are truth-tested now.
    must_infer = batch_size is None or (
        isinstance(batch_size, int) and batch_size == 0)
    if must_infer:
        batch_size = get_batch_size(inputs, time_major)

    state_shape = [batch_size] + self._spatial_size + [2 * self._num_channels]
    return state_initializer(state_shape, dtype=dtype)
def test(self,
         visual: utils.TensorOrSequence,
         selfbboxes: utils.TensorOrSequence,
         bboxes: utils.TensorOrSequence,
         max_len: int,
         eos_idx: int,
         **kwargs) -> utils.Tuple[torch.Tensor, torch.Tensor]:
    """Greedily decode ``max_len`` steps, feeding each argmax back in.

    Args:
        visual: Input used to derive the batch size and device; forwarded to
            ``self.step``.
        selfbboxes: Forwarded unchanged to ``self.step``.
        bboxes: Forwarded unchanged to ``self.step``.
        max_len: Number of decoding steps.
        eos_idx: Token index that ends a sequence.
        **kwargs: Extra keyword arguments forwarded to ``self.step``.

    Returns:
        ``(tokens, log_probs)``: the per-step argmax indices and the per-step
        step outputs, each concatenated along dimension 1. Step outputs are
        multiplied by a running mask that becomes zero from the step at which
        ``eos_idx`` is selected onward.
    """
    batch = utils.get_batch_size(visual)
    dev = utils.get_device(visual)

    tokens, masked_scores = [], []
    # 1.0 while a sequence is still running, 0.0 once it has emitted EOS.
    alive = torch.ones((batch, ), device=dev)

    with self.statefulness(batch):
        previous = None
        for step_idx in range(max_len):
            scores = self.step(step_idx, previous, visual, selfbboxes, bboxes,
                               None, mode='feedback', **kwargs)
            _, previous = torch.max(scores, -1)
            not_eos = (previous.squeeze(-1) != eos_idx).float()
            # The mask is updated before it is applied, so the EOS step
            # itself is already zeroed.
            alive = alive * not_eos
            masked_scores.append(scores * alive.unsqueeze(-1).unsqueeze(-1))
            tokens.append(previous)

    return torch.cat(tokens, 1), torch.cat(masked_scores, 1)
def sample_rl(self,
              visual: utils.TensorOrSequence,
              selfbboxes: utils.TensorOrSequence,
              bboxes: utils.TensorOrSequence,
              max_len: int,
              **kwargs) -> utils.Tuple[torch.Tensor, torch.Tensor]:
    """Sample a sequence step by step and record each sample's log-probability.

    At every step the first position of the step output is used as the logits
    of a categorical distribution; one token per batch element is drawn and
    fed back as the next input.

    Args:
        visual: Input used to derive the batch size; forwarded to
            ``self.step``.
        selfbboxes: Forwarded unchanged to ``self.step``.
        bboxes: Forwarded unchanged to ``self.step``.
        max_len: Number of sampling steps.
        **kwargs: Extra keyword arguments forwarded to ``self.step``.

    Returns:
        ``(tokens, log_probs)``, each the per-step results concatenated along
        dimension 1.
    """
    b_s = utils.get_batch_size(visual)
    outputs = []
    log_probs = []

    with self.statefulness(b_s):
        out = None
        for t in range(max_len):
            step_out = self.step(t, out, visual, selfbboxes, bboxes, None,
                                 mode='feedback', **kwargs)
            # assumes step_out[:, 0] is (batch, vocab) - TODO confirm
            distr = distributions.Categorical(logits=step_out[:, 0])
            sample = distr.sample()
            out = sample.unsqueeze(1)
            outputs.append(out)
            # Score the sample BEFORE adding the length-1 axis. Passing the
            # (batch, 1) tensor to log_prob broadcasts it against the
            # distribution's (batch,) batch shape and yields a (batch, batch)
            # matrix of every sample under every row's distribution.
            log_probs.append(distr.log_prob(sample).unsqueeze(1))

    return torch.cat(outputs, 1), torch.cat(log_probs, 1)
def mmd_penalty(self, sample_pz, sample_qz):
    """Sum over several scales of an MMD statistic between two samples.

    Uses the kernel ``k(x, y) = C / (C + ||x - y||^2)`` with
    ``C = 2 * opts['zdim'] * scale`` (the prior variance is fixed to 1 here).

    Args:
        sample_pz: First sample; reduced over axis 1 and used in a matmul, so
            presumably ``[n, zdim]`` - TODO confirm.
        sample_qz: Second sample, same layout; also supplies the batch size.

    Returns:
        Scalar tensor: the statistic summed over the seven scales.
    """
    settings = self.opts
    prior_var = 1.
    batch = tf.cast(utils.get_batch_size(sample_qz), tf.int32)
    batch_f = tf.cast(batch, tf.float32)

    def _sq_norms(points):
        return tf.reduce_sum(tf.square(points), axis=1, keepdims=True)

    def _pairwise_sq_dist(a, a_norms, b, b_norms):
        # ||a_i - b_j||^2 = ||a_i||^2 + ||b_j||^2 - 2 <a_i, b_j>
        return (a_norms + tf.transpose(b_norms)
                - 2. * tf.matmul(a, b, transpose_b=True))

    pz_norms = _sq_norms(sample_pz)
    dist_pz = _pairwise_sq_dist(sample_pz, pz_norms, sample_pz, pz_norms)
    qz_norms = _sq_norms(sample_qz)
    dist_qz = _pairwise_sq_dist(sample_qz, qz_norms, sample_qz, qz_norms)
    dist_cross = _pairwise_sq_dist(sample_qz, qz_norms, sample_pz, pz_norms)

    base_c = 2. * settings['zdim'] * prior_var
    total = 0.
    for scale in (.1, .2, .5, 1., 2., 5., 10.):
        c = base_c * scale
        # Within-sample terms, with the diagonal (self-pairs) masked out.
        within = c / (c + dist_qz)
        within = within + c / (c + dist_pz)
        within = tf.multiply(within, 1. - tf.eye(batch))
        within = tf.reduce_sum(within) / (batch_f * batch_f - batch_f)
        # Cross-sample term.
        cross = tf.reduce_sum(c / (c + dist_cross)) * 2. / (batch_f * batch_f)
        total = total + (within - cross)
    return total
def mmd_penalty(self, sample_qz, sample_pz):
    """Sum over several scales of an MMD statistic between two samples.

    Uses the kernel ``k(x, y) = C / (C + ||x - y||^2)`` with
    ``C = 2 * opts['zdim'] * opts['pz_scale']**2 * scale``.

    Args:
        sample_qz: First sample; also supplies the batch size. Reduced over
            axis 1 and used in a matmul, so presumably ``[n, zdim]`` - TODO
            confirm.
        sample_pz: Second sample, same layout.

    Returns:
        Scalar tensor: the statistic summed over the seven scales.
    """
    opts = self.opts
    sigma2_p = opts['pz_scale']**2
    n = utils.get_batch_size(sample_qz)
    n = tf.cast(n, tf.int32)
    nf = tf.cast(n, tf.float32)

    # `keepdims` is the supported spelling; the older `keep_dims` alias is
    # deprecated and is not accepted by newer TensorFlow releases.
    norms_pz = tf.reduce_sum(tf.square(sample_pz), axis=1, keepdims=True)
    dotprods_pz = tf.matmul(sample_pz, sample_pz, transpose_b=True)
    distances_pz = norms_pz + tf.transpose(norms_pz) - 2. * dotprods_pz

    norms_qz = tf.reduce_sum(tf.square(sample_qz), axis=1, keepdims=True)
    dotprods_qz = tf.matmul(sample_qz, sample_qz, transpose_b=True)
    distances_qz = norms_qz + tf.transpose(norms_qz) - 2. * dotprods_qz

    dotprods = tf.matmul(sample_qz, sample_pz, transpose_b=True)
    distances = norms_qz + tf.transpose(norms_pz) - 2. * dotprods

    # k(x, y) = C / (C + ||x - y||^2)
    Cbase = 2. * opts['zdim'] * sigma2_p
    stat = 0.
    for scale in [.1, .2, .5, 1., 2., 5., 10.]:
        C = Cbase * scale
        # Within-sample terms; the diagonal (self-pairs) is masked out.
        res1 = C / (C + distances_qz)
        res1 += C / (C + distances_pz)
        res1 = tf.multiply(res1, 1. - tf.eye(n))
        res1 = tf.reduce_sum(res1) / (nf * nf - nf)
        # Cross-sample term.
        res2 = C / (C + distances)
        res2 = tf.reduce_sum(res2) * 2. / (nf * nf)
        stat += res1 - res2
    return stat
def mmd_penalty(self, sample_pz, sample_qz):
    """MMD statistic between two samples with an RBF or IMQ kernel.

    ``opts['mmd_kernel']`` selects the kernel:

    * ``'RBF'``: Gaussian kernel whose width comes from a median heuristic on
      the pairwise squared distances.
    * ``'IMQ'``: ``k(x, y) = C / (C + ||x - y||^2)`` summed over seven scales,
      with the base ``C`` chosen from ``opts['pz']`` (``'normal'``,
      ``'sphere'`` or ``'uniform'``).

    Args:
        sample_pz: First sample; reduced over axis 1 and used in a matmul, so
            presumably ``[n, zdim]`` - TODO confirm.
        sample_qz: Second sample, same layout; also supplies the batch size.

    Returns:
        Scalar tensor holding the statistic.

    Raises:
        ValueError: If ``opts['mmd_kernel']`` or (for IMQ) ``opts['pz']`` is
            not one of the supported values.
    """
    opts = self.opts
    sigma2_p = opts['pz_scale']**2
    kernel = opts['mmd_kernel']
    n = utils.get_batch_size(sample_qz)
    n = tf.cast(n, tf.int32)
    nf = tf.cast(n, tf.float32)
    # Floor division keeps this an int32 tensor. `/` on an int32 tensor is a
    # true division that yields a float, which is valid neither as `k` for
    # tf.nn.top_k nor as an index. n*n - n is always even, so the value is
    # unchanged.
    half_size = (n * n - n) // 2

    norms_pz = tf.reduce_sum(tf.square(sample_pz), axis=1, keepdims=True)
    dotprods_pz = tf.matmul(sample_pz, sample_pz, transpose_b=True)
    distances_pz = norms_pz + tf.transpose(norms_pz) - 2. * dotprods_pz

    norms_qz = tf.reduce_sum(tf.square(sample_qz), axis=1, keepdims=True)
    dotprods_qz = tf.matmul(sample_qz, sample_qz, transpose_b=True)
    distances_qz = norms_qz + tf.transpose(norms_qz) - 2. * dotprods_qz

    dotprods = tf.matmul(sample_qz, sample_pz, transpose_b=True)
    distances = norms_qz + tf.transpose(norms_pz) - 2. * dotprods

    if kernel == 'RBF':
        # Median heuristic for the sigma^2 of Gaussian kernel
        sigma2_k = tf.nn.top_k(
            tf.reshape(distances, [-1]), half_size).values[half_size - 1]
        sigma2_k += tf.nn.top_k(
            tf.reshape(distances_qz, [-1]), half_size).values[half_size - 1]
        if opts['verbose']:
            sigma2_k = tf.Print(sigma2_k, [sigma2_k], 'Kernel width:')
        # Within-sample terms; the diagonal (self-pairs) is masked out.
        res1 = tf.exp(-distances_qz / 2. / sigma2_k)
        res1 += tf.exp(-distances_pz / 2. / sigma2_k)
        res1 = tf.multiply(res1, 1. - tf.eye(n))
        res1 = tf.reduce_sum(res1) / (nf * nf - nf)
        # Cross-sample term.
        res2 = tf.exp(-distances / 2. / sigma2_k)
        res2 = tf.reduce_sum(res2) * 2. / (nf * nf)
        stat = res1 - res2
    elif kernel == 'IMQ':
        # k(x, y) = C / (C + ||x - y||^2)
        if opts['pz'] == 'normal':
            Cbase = 2. * opts['zdim'] * sigma2_p
        elif opts['pz'] == 'sphere':
            Cbase = 2.
        elif opts['pz'] == 'uniform':
            # E ||x - y||^2 = E[sum (xi - yi)^2]
            #               = zdim E[(xi - yi)^2]
            #               = const * zdim
            Cbase = opts['zdim']
        else:
            # Previously this fell through and failed later on an unbound
            # `Cbase`.
            raise ValueError('%s Unknown prior' % opts['pz'])
        stat = 0.
        for scale in [.1, .2, .5, 1., 2., 5., 10.]:
            C = Cbase * scale
            res1 = C / (C + distances_qz)
            res1 += C / (C + distances_pz)
            res1 = tf.multiply(res1, 1. - tf.eye(n))
            res1 = tf.reduce_sum(res1) / (nf * nf - nf)
            res2 = C / (C + distances)
            res2 = tf.reduce_sum(res2) * 2. / (nf * nf)
            stat += res1 - res2
    else:
        # Previously this fell through and failed on an unbound `stat`.
        raise ValueError('%s Unknown kernel' % kernel)
    return stat
def mmd_penalty(self, sample_qz, sample_pz):
    """MMD statistic between two samples with an RBF, IMQ or RQ kernel.

    ``opts['mmd_kernel']`` selects the kernel:

    * ``'RBF'``: Gaussian kernel whose width comes from a median heuristic on
      the pairwise squared distances.
    * ``'IMQ'``: ``k(d) = C / (C + d)`` with
      ``C = 2 * opts['zdim'] * opts['pz_scale']**2 * scale``, summed over
      seven scales.
    * ``'RQ'``: ``k(d) = (1 + d / scale / 2) ** (-scale)``, summed over the
      same seven scales.

    Here ``d`` is the pairwise squared distance returned by
    ``square_dist_broadcast``.

    Args:
        sample_qz: First sample; also supplies the batch size.
        sample_pz: Second sample.

    Returns:
        Scalar tensor holding the statistic.

    Raises:
        ValueError: If ``opts['mmd_kernel']`` is not a supported kernel.
    """
    opts = self.opts
    sigma2_p = opts['pz_scale']**2
    kernel = opts['mmd_kernel']
    n = utils.get_batch_size(sample_qz)
    n = tf.cast(n, tf.int32)
    nf = tf.cast(n, tf.float32)
    half_size = tf.cast((n * n - n) / 2, tf.int32)

    distances_pz = square_dist_broadcast(sample_pz, sample_pz)
    distances_qz = square_dist_broadcast(sample_qz, sample_qz)
    distances = square_dist_broadcast(sample_qz, sample_pz)

    scales = [.1, .2, .5, 1., 2., 5., 10.]

    def _statistic(kernel_fn):
        """Evaluate the statistic for one kernel given as a function of d."""
        # Within-sample terms; the diagonal (self-pairs) is masked out.
        res1 = kernel_fn(distances_qz)
        res1 += kernel_fn(distances_pz)
        res1 = tf.multiply(res1, 1. - tf.eye(n))
        res1 = tf.reduce_sum(res1) / (nf * nf - nf)
        # Cross-sample term.
        res2 = kernel_fn(distances)
        res2 = tf.reduce_sum(res2) * 2. / (nf * nf)
        return res1 - res2

    if kernel == 'RBF':
        # Median heuristic for the sigma^2 of Gaussian kernel
        sigma2_k = tf.nn.top_k(
            tf.reshape(distances, [-1]), half_size).values[half_size - 1]
        sigma2_k += tf.nn.top_k(
            tf.reshape(distances_qz, [-1]), half_size).values[half_size - 1]
        if opts['verbose']:
            sigma2_k = tf.Print(sigma2_k, [sigma2_k], 'Kernel width:')
        stat = _statistic(lambda d: tf.exp(-d / 2. / sigma2_k))
    elif kernel == 'IMQ':
        Cbase = 2 * opts['zdim'] * sigma2_p
        stat = 0.
        for scale in scales:
            C = Cbase * scale
            stat += _statistic(lambda d, C=C: C / (C + d))
    elif kernel == 'RQ':
        stat = 0.
        for scale in scales:
            stat += _statistic(
                lambda d, s=scale: (1. + d / s / 2.)**(-s))
    else:
        # Previously this fell through and failed on an unbound `stat`.
        raise ValueError('%s Unknown kernel' % kernel)
    return stat
def apply(self,
          visual: utils.TensorOrSequence,
          selfbboxes: utils.TensorOrSequence,
          bboxes: utils.TensorOrSequence,
          out_size=1,
          return_probs=False,
          **kwargs):
    """Run beam search for ``self.max_len`` steps and return the best beams.

    Resets the per-search attributes (``b_s``, ``device``, ``seq_mask``,
    ``seq_logprob``, ``log_probs``, ``selected_words`` and, when requested,
    ``all_log_probs``), calls ``self.iter`` once per step, then reorders the
    beams by descending ``self.seq_logprob``.

    Args:
        visual: Input used to derive batch size and device; threaded through
            ``self.iter``.
        selfbboxes: Threaded through ``self.iter``.
        bboxes: Threaded through ``self.iter``.
        out_size: Number of top beams to keep. When it is 1 the beam axis is
            squeezed away.
        return_probs: When true, also return the gathered
            ``self.all_log_probs`` (which is not truncated to ``out_size``).
        **kwargs: Extra keyword arguments forwarded to ``self.iter``.

    Returns:
        ``(outputs, log_probs)`` or, with ``return_probs``,
        ``(outputs, log_probs, all_log_probs)``.
    """
    # Reset per-search state.
    self.b_s = utils.get_batch_size(visual)
    self.device = utils.get_device(visual)
    self.seq_mask = torch.ones((self.b_s, self.beam_size, 1),
                               device=self.device)
    self.seq_logprob = torch.zeros((self.b_s, 1, 1), device=self.device)
    self.log_probs = []
    self.selected_words = None
    if return_probs:
        self.all_log_probs = []

    outputs = []
    with self.model.statefulness(self.b_s):
        state = None
        for step_idx in range(self.max_len):
            visual, selfbboxes, bboxes, state, outputs = self.iter(
                step_idx, visual, selfbboxes, bboxes, state, outputs,
                return_probs, **kwargs)

    # Sort beams by descending sequence log-probability.
    _, order = torch.sort(self.seq_logprob, 1, descending=True)
    per_token_index = order.expand(self.b_s, self.beam_size, self.max_len)

    outputs = torch.gather(torch.cat(outputs, -1), 1, per_token_index)
    log_probs = torch.gather(torch.cat(self.log_probs, -1), 1,
                             per_token_index)

    if return_probs:
        all_log_probs = torch.cat(self.all_log_probs, 2)
        vocab_index = order.unsqueeze(-1).expand(
            self.b_s, self.beam_size, self.max_len, all_log_probs.shape[-1])
        all_log_probs = torch.gather(all_log_probs, 1, vocab_index)

    outputs = outputs.contiguous()[:, :out_size]
    log_probs = log_probs.contiguous()[:, :out_size]
    if out_size == 1:
        outputs = outputs.squeeze(1)
        log_probs = log_probs.squeeze(1)

    if return_probs:
        return outputs, log_probs, all_log_probs
    return outputs, log_probs
def sinkhorn_it(opts, C):
    """Run log-domain Sinkhorn updates and record the cost after each one.

    Args:
        opts: Mapping providing ``'epsilon'`` (the divisor applied to ``-C``)
            and ``'L'`` (number of recorded costs when ``L >= 1``).
        C: Cost matrix; reduced along axes 0 and 1.

    Returns:
        List of scalar tensors, one per ``log_u`` update, each
        ``sum(exp(log_u + log_K + log_v) * C)``.
    """
    # The batch size is queried as before, although its value is not used.
    _ = utils.get_batch_size(C)
    # Kernel in the log domain.
    log_kernel = -C / opts['epsilon']

    def _next_log_u(current_log_v):
        return -logsumexp(log_kernel + current_log_v, axis=0, keepdims=True)

    def _next_log_v(current_log_u):
        return -logsumexp(log_kernel + current_log_u, axis=1, keepdims=True)

    def _cost(current_log_u, current_log_v):
        plan = tf.exp(current_log_u + log_kernel + current_log_v)
        return tf.reduce_sum(plan * C)

    # Initialization
    log_v = -logsumexp(log_kernel, axis=1, keepdims=True)
    costs = []
    # L - 1 full (u, v) sweeps, followed by one final u update.
    for _sweep in range(opts['L'] - 1):
        log_u = _next_log_u(log_v)
        costs.append(_cost(log_u, log_v))
        log_v = _next_log_v(log_u)
    log_u = _next_log_u(log_v)
    costs.append(_cost(log_u, log_v))
    return costs
def sinkhorn_it_v2(opts, C):
    """Run Sinkhorn updates on the potentials ``u``/``v`` and record costs.

    Args:
        opts: Mapping providing ``'epsilon'`` and ``'L'`` (number of recorded
            costs when ``L >= 1``).
        C: Cost matrix; reduced along axes 0 and 1.

    Returns:
        List of scalar tensors ``sum(exp((-C + u + v) / epsilon) * C)``: one
        after initialization and one after each of the ``L - 1`` sweeps.
    """
    # Batch size
    batch = utils.get_batch_size(C)
    eps = opts['epsilon']
    log_batch = tf.compat.v1.log(batch)

    def _scaled_lse(values, axis):
        return logsumexp(values / eps, axis=axis, keepdims=True)

    def _cost(pot_u, pot_v):
        return tf.reduce_sum(tf.exp((-C + pot_u + pot_v) / eps) * C)

    # Initialization
    u = eps * (log_batch - _scaled_lse(-C, 1))
    v = eps * (log_batch - _scaled_lse(-C + u, 0))
    costs = [_cost(u, v)]

    # Sinkhorn iterations
    for _sweep in range(opts['L'] - 1):
        u = u - eps * (log_batch + _scaled_lse(-C + u + v, 1))
        v = v - eps * (log_batch + _scaled_lse(-C + u + v, 0))
        costs.append(_cost(u, v))
    return costs
def total_correlation(self, z, z_mean, z_logvar):
    """Batch estimates of the log-density terms used for total correlation.

    Based on ICML paper.

    Args:
        z: Latent samples for the batch.
        z_mean: Posterior means for the batch.
        z_logvar: Posterior log-variances for the batch.

    Returns:
        Tuple of three scalar tensors, the batch means of ``log q(z)``,
        ``log prod_l q(z_l)`` and ``log prod_l p(z_l)``.
    """
    batch = utils.get_batch_size(z)
    dataset_size = self.opts['dataset_size']
    log_norm = tf.math.log(dataset_size * batch)

    # log q(z(x_j) | x_i) for every pair in the batch: a tensor of size
    # [batch_size, batch_size, num_latents], indexed below as [j, i, l].
    pair_log_q = utils.gaussian_log_density(
        tf.expand_dims(z, 1),
        tf.expand_dims(z_mean, 0),
        tf.expand_dims(z_logvar, 0))

    # log prod_l q(z(x_j)_l) = sum_l(log(sum_i(q(z(x_j)_l | x_i))) + constant)
    # for each sample in the batch: a vector of size [batch_size,].
    per_dim_log_q = tf.reduce_logsumexp(pair_log_q, axis=1, keepdims=False)
    log_qz_product = tf.reduce_sum(
        per_dim_log_q - log_norm, axis=1, keepdims=False)

    # log q(z(x_j)) = log(sum_i(prod_l q(z(x_j)_l | x_i))) + constant.
    joint_log_q = tf.reduce_sum(pair_log_q, axis=2, keepdims=False)
    log_qz = tf.reduce_logsumexp(
        joint_log_q, axis=1, keepdims=False) - log_norm

    # log prod_l p(z_l) = sum_l(log(p(z_l))) with p ~ N(0, 1), for each
    # sample in the batch: a vector of size [batch_size,].
    pi = tf.constant(math.pi)
    log_pz_product = tf.reduce_sum(
        -0.5 * (tf.math.log(2 * pi) + tf.square(z)), axis=1, keepdims=False)

    return (tf.reduce_mean(log_qz),
            tf.reduce_mean(log_qz_product),
            tf.reduce_mean(log_pz_product))
def mmd_penalty(self, sample_qz, sample_pz):
    """MMD statistic between two samples with an RBF or IMQ kernel.

    ``opts['mmd_kernel']`` selects the kernel:

    * ``'RBF'``: Gaussian kernel whose width comes from a median heuristic on
      the pairwise squared distances.
    * ``'IMQ'``: ``k(x, y) = C / (C + ||x - y||^2)`` with
      ``C = 2 * opts['zdim'] * opts['pz_scale']**2 * scale``, summed over
      seven scales.

    Args:
        sample_qz: First sample; also supplies the batch size. Reduced over
            axis 1 and used in a matmul, so presumably ``[n, zdim]`` - TODO
            confirm.
        sample_pz: Second sample, same layout.

    Returns:
        Scalar tensor holding the statistic.

    Raises:
        ValueError: If ``opts['mmd_kernel']`` is not a supported kernel.
    """
    opts = self.opts
    sigma2_p = opts['pz_scale']**2
    kernel = opts['mmd_kernel']
    n = utils.get_batch_size(sample_qz)
    n = tf.cast(n, tf.int32)
    nf = tf.cast(n, tf.float32)
    # Floor division keeps this an int32 tensor. `/` on an int32 tensor is a
    # true division that yields a float, which is valid neither as `k` for
    # tf.nn.top_k nor as an index. n*n - n is always even, so the value is
    # unchanged.
    half_size = (n * n - n) // 2

    # `keepdims` is the supported spelling; the older `keep_dims` alias is
    # deprecated and is not accepted by newer TensorFlow releases.
    norms_pz = tf.reduce_sum(tf.square(sample_pz), axis=1, keepdims=True)
    dotprods_pz = tf.matmul(sample_pz, sample_pz, transpose_b=True)
    distances_pz = norms_pz + tf.transpose(norms_pz) - 2. * dotprods_pz

    norms_qz = tf.reduce_sum(tf.square(sample_qz), axis=1, keepdims=True)
    dotprods_qz = tf.matmul(sample_qz, sample_qz, transpose_b=True)
    distances_qz = norms_qz + tf.transpose(norms_qz) - 2. * dotprods_qz

    dotprods = tf.matmul(sample_qz, sample_pz, transpose_b=True)
    distances = norms_qz + tf.transpose(norms_pz) - 2. * dotprods

    if kernel == 'RBF':
        # Median heuristic for the sigma^2 of Gaussian kernel
        sigma2_k = tf.nn.top_k(
            tf.reshape(distances, [-1]), half_size).values[half_size - 1]
        sigma2_k += tf.nn.top_k(
            tf.reshape(distances_qz, [-1]), half_size).values[half_size - 1]
        if opts['verbose']:
            sigma2_k = tf.Print(sigma2_k, [sigma2_k], 'Kernel width:')
        # Within-sample terms; the diagonal (self-pairs) is masked out.
        res1 = tf.exp(-distances_qz / 2. / sigma2_k)
        res1 += tf.exp(-distances_pz / 2. / sigma2_k)
        res1 = tf.multiply(res1, 1. - tf.eye(n))
        res1 = tf.reduce_sum(res1) / (nf * nf - nf)
        # Cross-sample term.
        res2 = tf.exp(-distances / 2. / sigma2_k)
        res2 = tf.reduce_sum(res2) * 2. / (nf * nf)
        stat = res1 - res2
    elif kernel == 'IMQ':
        # k(x, y) = C / (C + ||x - y||^2)
        Cbase = 2 * opts['zdim'] * sigma2_p
        stat = 0.
        for scale in [.1, .2, .5, 1., 2., 5., 10.]:
            C = Cbase * scale
            res1 = C / (C + distances_qz)
            res1 += C / (C + distances_pz)
            res1 = tf.multiply(res1, 1. - tf.eye(n))
            res1 = tf.reduce_sum(res1) / (nf * nf - nf)
            res2 = C / (C + distances)
            res2 = tf.reduce_sum(res2) * 2. / (nf * nf)
            stat += res1 - res2
    else:
        # Previously this fell through and failed on an unbound `stat`.
        raise ValueError('%s Unknown kernel' % kernel)
    return stat
def model_fn(features, labels, mode, params):
    """Build the Mesh-TensorFlow graph and return a TPUEstimatorSpec.

    Args:
        features: Input tokens, or a dict holding them under ``"feature"``.
        labels: Target tokens (same handling as ``features``); skipped when
            ``None``.
        mode: One of ``tf.estimator.ModeKeys.PREDICT``, ``TRAIN`` or ``EVAL``.
        params: Configuration dict. It is replaced by the result of
            ``add_mode_to_params`` and then mutated here
            (``"num_microbatches"`` is added, and in PREDICT mode a ``None``
            ``"remove_partial_sequences"`` is set to ``False``).

    Returns:
        A ``tpu_estimator.TPUEstimatorSpec`` for the requested mode.

    Raises:
        AssertionError: If ``mode`` is not PREDICT, TRAIN or EVAL.
        Exception: If ``params["model"]`` is not ``"GPT"``.
    """
    # Get global step
    global_step = tf.train.get_global_step()

    # Construct mtf graph + mesh from params
    graph = mtf.Graph()
    mesh_shape = mtf.convert_to_shape(params["mesh_shape"])
    layout_rules = mtf.convert_to_layout_rules(params["layout"])

    # Mesh setup
    if params["use_tpu"]:
        var_placer, mesh_impl = simd_mesh_setup(params, mesh_shape, layout_rules)
    else:
        var_placer = None
        gpu_ids = params["gpu_ids"]
        mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
            mesh_shape, layout_rules, gpu_ids)

    # Trainable variable precision
    # Store to checkpoints in master type, train in slice type, compute in activation type
    if params["precision"] == "bfloat16":
        variable_dtype = mtf.VariableDType(master_dtype=tf.bfloat16, slice_dtype=tf.float32,
                                           activation_dtype=tf.bfloat16)
    else:
        variable_dtype = mtf.VariableDType(master_dtype=tf.float32, slice_dtype=tf.float32,
                                           activation_dtype=tf.float32)

    # Build mtf mesh object
    mesh = mtf.Mesh(graph, "my_mesh", var_placer)

    # Build mtf_features & seq length dict for getting number of microbatches
    # We need to pack inputs into a dict to pass into serialize_training_step
    features_dict = {"inputs": features, "labels": labels}
    sequence_length_dict = {
        "inputs": params["n_ctx"],
        "labels": params["n_ctx"]
    }

    params = add_mode_to_params(params, mode)
    batch_size = get_batch_size(params)

    batch_dim = mtf.Dimension("batch", batch_size)
    batch_dims = [batch_dim]
    feature_length = sequence_length_dict["inputs"]
    length_dim = mtf.Dimension("sequence", feature_length)

    # Import every non-None entry as a fully replicated int32 mtf tensor of
    # shape [batch, sequence].
    mtf_features = {}
    for key, x in features_dict.items():
        if x is not None:
            feature_shape = mtf.Shape(batch_dims + [length_dim])
            # Dict-valued inputs carry the actual tensor under "feature".
            if type(features_dict[key]) == dict:
                features_dict[key] = features_dict[key]["feature"]
            x = tf.cast(features_dict[key], tf.int32)
            x = tf.reshape(x, feature_shape.to_integer_list)
            mtf_features[key] = mtf.import_fully_replicated(
                mesh, x, feature_shape, name=key)

    # Instantiate dict for dimensions, bias, etc that can be calculated here once then passed into model
    other_features = {}
    memory_length_dim = mtf.Dimension("memory_length", length_dim.size)

    attn_bias = biasmask_attn_weights(
        mesh, length_dim, memory_length_dim,
        variable_dtype) if params["causal"] else None

    # Add attn_bias into mtf_features
    other_features["attn_bias"] = attn_bias

    # Define other Dimensions that we'll need inside the model
    embd_dim = mtf.Dimension("embd", params["n_embd"])
    vocab_dim = mtf.Dimension("vocab", params["n_vocab"])
    # We need this because gathering when both the args have the same dimension in them breaks things
    # This dim is specifically for the weights
    # This prevents the "Einsum has lhs dimension without corresponding rhs or output dimension." error
    embed_sequence_dim = mtf.Dimension("embed_sequence", params["n_ctx"])

    other_features["embd_dim"] = embd_dim
    other_features["vocab_dim"] = vocab_dim
    other_features["embed_sequence_dim"] = embed_sequence_dim
    other_features["memory_length_dim"] = memory_length_dim

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Set up the model for prediction
        inputs = mtf_features["inputs"]
        if params["remove_partial_sequences"] is None:
            params["remove_partial_sequences"] = False

        export = params.get("export", False)

        if not export:
            mtf_samples = sample_autoregressive(
                inputs,
                other_features=other_features,
                params=params,
                variable_dtype=variable_dtype,
                remove_partial_sequences=params["remove_partial_sequences"],
                stop_at_token=params["eos_id"],
                sampling_use_entmax=params['sampling_use_entmax'])
        else:
            # Export path: run the model once and use its first output as the
            # samples instead of sampling autoregressively.
            with mtf.utils.outside_all_rewrites():
                with tf.variable_scope('gpt2'):
                    mtf_samples, loss, loss_batch = gpt2.model(
                        mtf_features,
                        other_features,
                        params,
                        mesh,
                        variable_dtype=variable_dtype,
                        context=None)

        mtf_samples = mtf.anonymize(mtf_samples)
        inputs = mtf.anonymize(inputs)
        lowering = mtf.Lowering(graph, {mesh: mesh_impl}, autostack=True)
        inputs = lowering.export_to_tf_tensor(inputs)
        outputs = lowering.export_to_tf_tensor(mtf_samples)
        predictions = {"inputs": inputs, "outputs": outputs}

        def scaffold_fn():
            """Return a Scaffold whose local init also copies masters to slices."""
            return tf.train.Scaffold(
                local_init_op=tf.group(
                    tf.train.Scaffold.default_local_init_op(),
                    lowering.copy_masters_to_slices(),
                    name="mtf_local_init_op"),
                ready_op=tf.concat([
                    tf.report_uninitialized_variables(),
                    resources.report_uninitialized_resources()
                ],
                    axis=0,
                    name="mtf_ready_op"))

        return tpu_estimator.TPUEstimatorSpec(
            mode=tf.estimator.ModeKeys.PREDICT,
            predictions=predictions,
            scaffold_fn=scaffold_fn,
            prediction_hooks=[mtf.MtfRestoreHook(lowering)])

    # We're not predicting, so we better be training or evaluating
    assert (mode == tf.estimator.ModeKeys.TRAIN
            or mode == tf.estimator.ModeKeys.EVAL)

    if mode == tf.estimator.ModeKeys.TRAIN:
        # Gets number of microbatches per batch for serialized training
        # if param tokens_per_mb_per_replica = None, this defaults to 1 and no microbatching is performed
        num_microbatches = int(
            mtf_transformer.utils.serialize_num_microbatches(
                batch_dim=batch_dim,
                sequence_length=sequence_length_dict,
                mesh_shape=mesh_shape,
                layout_rules=layout_rules,
                tokens_per_microbatch_per_replica=params[
                    "tokens_per_mb_per_replica"]))
    else:
        num_microbatches = 1

    params[
        "num_microbatches"] = num_microbatches  # Add num microbatches to params

    if num_microbatches > 1:

        # For serialize_training_step we need to modify the model to output results in a dict
        def serialized_fn(mtf_features):
            """Run the model on one microbatch and return its outputs as a dict."""
            if params["model"] == "GPT":
                with tf.variable_scope('gpt2'):
                    logits, loss, loss_batch = gpt2.model(
                        mtf_features,
                        other_features,
                        params,
                        mesh,
                        variable_dtype=variable_dtype)
                return {
                    "logits": logits,
                    "loss": loss,
                    "loss_batch": loss_batch
                }
            else:
                raise Exception(
                    f"'{params['model']}' is not a valid model - please select from [GPT]"
                )

        # Serialize the training step - Gradients are accumulated locally and reduced once.
        var_grads, output_dict = mtf.serialize_training_step(
            mtf_features, serialized_fn, batch_dim, num_microbatches)
        loss = output_dict["loss"]
        loss_batch = output_dict["loss_batch"]
        logits = output_dict["logits"]
    else:
        # If we're not splitting into microbatches, return logits & loss as is
        if params["model"] == "GPT":
            with mtf.utils.outside_all_rewrites():
                with tf.variable_scope('gpt2'):
                    logits, loss, loss_batch = gpt2.model(
                        mtf_features,
                        other_features,
                        params,
                        mesh,
                        variable_dtype=variable_dtype,
                        context=None)
        else:
            raise Exception(
                f"'{params['model']}' is not a valid model - please select from [GPT]"
            )

    # Auto layout generation
    if params["auto_layout"]:
        auto_layout(graph, mesh_shape, logits, loss)
    if params["auto_layout_and_mesh_shape"]:
        auto_layout_and_mesh_shape(graph, params["num_cores"], logits, loss)

    if mode == tf.estimator.ModeKeys.TRAIN:
        # In TRAIN mode, get optimizer
        if params["num_microbatches"] > 1:
            # If we are splitting the batch into microbatches, var grads are created in the serialize_training_step fn
            # So we pass them in here
            _, update_ops, var_grads = get_optimizer(
                mesh,
                loss,
                params,
                variable_dtype=variable_dtype,
                inp_var_grads=var_grads)
        else:
            # Otherwise, they are created in the get_optimizer fn, so we leave inp_var_grads blank
            _, update_ops, var_grads = get_optimizer(
                mesh, loss, params, variable_dtype=variable_dtype)
        # Log summaries to tensorboard
        mtf.scalar_summary("loss", loss)
        # Log gradients if in params
        if params["log_grads"] not in [None, False]:
            for g in var_grads:
                grad_norm = mtf.sqrt(mtf.reduce_sum(mtf.square(g)))
                mtf.scalar_summary("grads/norm" + g.name[:-2], grad_norm)
    else:
        # For now, we can only export fully-replicated tensors.
        # This has to be done before lowering or they will not be included in the graph
        mean_logits = mtf.reduce_mean(logits, reduced_dim=vocab_dim)
        max_logits = mtf.argmax(logits, vocab_dim)
        del logits
        fully_replicated_mean_logits = mtf.anonymize(mean_logits)
        fully_replicated_max_logits = mtf.anonymize(max_logits)
        fully_replicated_loss_batch = mtf.anonymize(loss_batch)

    # Gets & prints info about no. trainable vars in the model & dimension names
    get_graph_info(graph)

    # 'lowers' mtf tensors into a tf graph - this enables us to export results as tf tensors
    lowering = mtf.Lowering(graph, {mesh: mesh_impl}, autostack=True)
    tf_loss = lowering.export_to_tf_tensor(loss)
    tf_loss = tf.cast(tf_loss, tf.float32)

    if mode == tf.estimator.ModeKeys.TRAIN:
        # Use our patched version until mtf updates theirs
        host_call = create_host_call(params['model_path'])
        mtf.utils.remove_summaries()

        # Creates train_op
        tf_update_ops = [lowering.lowered_operation(op) for op in update_ops]
        tf_update_ops.append(tf.assign_add(
            global_step, 1))  # Need to manually increment global_step
        tf.logging.info(f"tf_update_ops: {tf_update_ops}")
        train_op = tf.group(tf_update_ops)
    else:
        tf_mean_logits = lowering.export_to_tf_tensor(
            fully_replicated_mean_logits)
        tf_max_logits = lowering.export_to_tf_tensor(
            fully_replicated_max_logits)
        tf_loss_batch = tf.to_float(
            lowering.export_to_tf_tensor(fully_replicated_loss_batch))

    with mtf.utils.outside_all_rewrites():
        # Copy master variables to slices. Must be called first.
        restore_hook = mtf.MtfRestoreHook(lowering)
        if mode == tf.estimator.ModeKeys.TRAIN:
            # Set up the checkpoint server and return the TPUEstimatorSpec
            saver = tf.train.Saver(tf.global_variables(),
                                   sharded=True,
                                   max_to_keep=10,
                                   keep_checkpoint_every_n_hours=2,
                                   defer_build=False,
                                   save_relative_paths=True)
            tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
            saver_listener = mtf.MtfCheckpointSaverListener(lowering)
            saver_hook = tf.train.CheckpointSaverHook(
                params["model_path"],
                save_steps=params["steps_per_checkpoint"],
                saver=saver,
                listeners=[saver_listener])

            return tpu_estimator.TPUEstimatorSpec(
                tf.estimator.ModeKeys.TRAIN,
                loss=tf_loss,
                host_call=host_call,
                train_op=train_op,
                training_hooks=[restore_hook, saver_hook])

        elif mode == tf.estimator.ModeKeys.EVAL:
            # Evaluation metrics
            def _perplexity(loss):
                """Return a mean metric over exp(loss)."""
                perplexity = tf.exp(loss)
                return tf.metrics.mean(perplexity)

            def _bits_per_byte(loss):
                """Return a mean metric over loss * (0.29335 / ln 2)."""
                # NOTE(review): 0.29335 is presumably a tokens-per-byte ratio
                # for the evaluation data - confirm its origin.
                bpb = loss * (0.29335 / math.log(2))
                return tf.metrics.mean(bpb)

            def _metric_fn(tf_mean_logits, tf_loss_batch):
                """Default eval metrics: mean logits, perplexity, bits per byte."""
                mean_logits = tf.metrics.mean(tf_mean_logits)
                loss = tf.reduce_mean(tf_loss_batch)
                perp = _perplexity(loss)
                bpb = _bits_per_byte(loss)
                return {
                    "mean_logits": mean_logits,
                    "perplexity": perp,
                    "bits per byte": bpb
                }

            def _lambada_metric_fn(labels, tf_max_logits, tf_loss_batch):
                """Accuracy and mean loss over positions whose label is not EOS."""
                eos_token = params["eos_id"]
                answer_positions = tf.where(
                    tf.math.not_equal(labels, eos_token))

                correct_answers = tf.gather_nd(
                    tf.math.equal(tf_max_logits, labels), answer_positions)
                accuracy = tf.metrics.mean(tf.cast(correct_answers,
                                                   tf.float32))

                # I guess tf_loss_batch has z_loss and maybe other stuff added to it
                # so maybe this should be calculated separately in the future
                answer_loss = tf.gather_nd(tf_loss_batch, answer_positions)
                log_perplexity = tf.metrics.mean(answer_loss)

                return {
                    "lambada_acc": accuracy,
                    "lambada_log_ppl": log_perplexity
                }

            eval_task = params["eval_task"]
            if eval_task == "lambada":
                eval_metrics = (_lambada_metric_fn,
                                [labels, tf_max_logits, tf_loss_batch])
            else:
                eval_metrics = (_metric_fn, [tf_mean_logits, tf_loss_batch])

            return tpu_estimator.TPUEstimatorSpec(
                tf.estimator.ModeKeys.EVAL,
                evaluation_hooks=[restore_hook],
                loss=tf_loss,
                eval_metrics=eval_metrics)
def mmd(opts, pi0, pi, sample_pz, sample_qz):
    """Compute MMD between prior and aggregated posterior.

    Only the ``'IMQ'`` kernel, ``k(x, y) = C / (C + ||x - y||^2)``, is
    implemented; it is summed over seven scales with
    ``C = 2 * opts['zdim'] * opts['pz_scale']**2 * scale``.

    Args:
        opts: Mapping providing ``'pz_scale'``, ``'mmd_kernel'`` and
            ``'zdim'``.
        pi0: prior weights [K]
        pi: variational weights [batch,K]
        sample_pz: Samples from the prior; also supplies the batch size.
        sample_qz: Samples from the aggregated posterior.

    Returns:
        Scalar tensor holding the statistic.

    Raises:
        NotImplementedError: If ``opts['mmd_kernel']`` is ``'RBF'``.
        ValueError: If ``opts['mmd_kernel']`` is any other unsupported value.
    """
    sigma2_p = opts['pz_scale'] ** 2
    kernel = opts['mmd_kernel']
    if kernel == 'RBF':
        # This used to be `assert False, 'To implement'` in front of an
        # unfinished draft that referenced an undefined `self`. Under
        # `python -O` the assert vanished and the draft ran; raise
        # explicitly instead.
        raise NotImplementedError('To implement')
    if kernel != 'IMQ':
        raise ValueError('%s Unknown kernel' % kernel)

    n = utils.get_batch_size(sample_pz)
    n = tf.cast(n, tf.int32)
    nf = tf.cast(n, tf.float32)

    norms_pz = tf.reduce_sum(tf.square(sample_pz), axis=-1, keepdims=True)
    norms_qz = tf.reduce_sum(tf.square(sample_qz), axis=-1, keepdims=True)
    distances_pz = square_dist(sample_pz, norms_pz, sample_pz, norms_pz)
    distances_qz = square_dist(sample_qz, norms_qz, sample_qz, norms_qz)
    distances = square_dist(sample_qz, norms_qz, sample_pz, norms_pz)

    # Weight tensors do not depend on the kernel scale, so build them once.
    pi_col = tf.expand_dims(pi, axis=-1)
    pi_t = tf.transpose(pi)
    pi0_col = tf.expand_dims(pi0, axis=-1)
    pi0_sq = tf.square(pi0)
    pi0_sq_col = tf.expand_dims(pi0_sq, axis=-1)

    # k(x, y) = C / (C + ||x - y||^2)
    Cbase = 2 * opts['zdim'] * sigma2_p
    res = 0.
    for scale in [.1, .2, .5, 1., 2., 5., 10.]:
        C = Cbase * scale
        # First 2 terms of the MMD
        res1_qz = C / (C + distances_qz)
        res1_qz = tf.multiply(pi_col, tf.multiply(res1_qz, pi_t))
        res1_pz = C / (C + distances_pz)
        res1_pz = tf.multiply(res1_pz, pi0_sq_col)
        res1 = res1_qz + res1_pz
        # Correcting for diagonal terms
        res1_diag = tf.trace(tf.transpose(res1, perm=[1, 0, 2]))
        res1 = (tf.reduce_sum(res1, axis=[0, -1]) - res1_diag) / (nf * nf - nf)
        # Cross term of the MMD
        res2 = C / (C + distances)
        res2 = tf.multiply(pi_col, tf.multiply(res2, pi0_col))
        res2 = tf.reduce_sum(res2, axis=[0, -1]) / (nf * nf)
        res += tf.reduce_sum(tf.div(res1 - 2. * res2, pi0_sq))
    return res
def _predict_with_tta(model, plain_generator, augmented_generator, n_samples,
                      n_labels, n_rounds):
    """Average one plain prediction pass with the mean of augmented passes."""
    plain_pred = model.predict_generator(plain_generator,
                                         use_multiprocessing=True,
                                         workers=8)
    augmented_sum = np.zeros((n_samples, n_labels), dtype=np.float32)
    for _ in range(n_rounds):
        augmented_sum += model.predict_generator(augmented_generator,
                                                 use_multiprocessing=True,
                                                 workers=8)
    return 0.5 * plain_pred + 0.5 * augmented_sum / n_rounds


def main():
    """Predict on the held-out fold and on the test set and save both.

    Loads the trained weights for the experiment chosen on the command line,
    predicts with and without test-time augmentation, blends the two 50/50
    and writes the results as ``.npz`` files to the training-prediction and
    test-prediction directories.

    Raises:
        FileNotFoundError: If the weights file for the experiment is missing.
    """
    args = parse_args()

    print("load the model configuration...", file=sys.stderr)
    print("=======================================================", file=sys.stderr)
    exp_config = generate_exp_config(args.net_name, args.pre_trained,
                                     args.include_fc, args.k_fold)
    weights_path = get_weights_path(net_name=args.net_name)
    net = importlib.import_module("Nets." + args.net_name)
    batch_size = get_batch_size(args.net_name, args.pre_trained)
    input_shape = get_input_shape(args.net_name, args.pre_trained)
    preprocessing_function = net.preprocess_input if args.pre_trained else None

    weights_filename = os.path.join(weights_path, "{}.h5".format(exp_config))
    # An explicit check instead of `assert cond, print(...)`: asserts are
    # stripped under `python -O`, and print() returns None so the assertion
    # carried no message.
    if not os.path.exists(weights_filename):
        print("the model doesn't exist...", file=sys.stderr)
        raise FileNotFoundError(weights_filename)
    model = load_model(weights_filename)

    # Keyword arguments shared by every generator.
    common_kwargs = {
        "batch_size": batch_size,
        "shuffle": False,
        "output_shape": (input_shape[0], input_shape[1]),
        "n_channels": input_shape[2],
        "preprocessing_function": preprocessing_function,
    }
    # Extra keyword arguments for the augmenting generators.
    augment_kwargs = {
        "rotation_range": AUGMENT_PARAMETERS.get('rotation_range', 0.),
        "width_shift_range": AUGMENT_PARAMETERS.get('width_shift_range', 0.),
        "height_shift_range": AUGMENT_PARAMETERS.get('height_shift_range', 0.),
        "shear_range": AUGMENT_PARAMETERS.get('shear_range', 0.),
        "zoom_range": AUGMENT_PARAMETERS.get('zoom_range', 0.),
        "fill_mode": AUGMENT_PARAMETERS.get('fill_mode', 'nearest'),
        "cval": AUGMENT_PARAMETERS.get('cval', 0.),
        "horizontal_flip": AUGMENT_PARAMETERS.get('horizontal_flip', True),
        "vertical_flip": AUGMENT_PARAMETERS.get('vertical_flip', True),
        "augment_prob": 1.0,
    }

    # output path
    training_predict_path = get_training_predict_path(args.net_name)
    test_predict_path = get_test_predict_path(args.net_name)

    print("load training data...", file=sys.stderr)
    print("=======================================================", file=sys.stderr)
    img, label = load_data(dataset="train")

    split_filename = os.path.join(DATA_DIR, "KFold_{}.npz".format(args.k_fold))
    # Close the archive once the indexes have been read out.
    with np.load(split_filename) as split:
        test_indexes = split['test_indexes']

    print("validate the model on {} samples".format(test_indexes.shape[0]),
          file=sys.stderr)

    valid_generator = ImageDataGenerator(
        x=img[test_indexes], y=None, augment=False, **common_kwargs)
    valid_generator_aug = ImageDataGenerator(
        x=img[test_indexes], y=None, augment=True,
        **common_kwargs, **augment_kwargs)
    valid_pred = _predict_with_tta(model, valid_generator, valid_generator_aug,
                                   test_indexes.shape[0], N_LABELS,
                                   TEST_TIME_AUGMENT)

    filename = os.path.join(training_predict_path, "{}.npz".format(exp_config))
    np.savez(file=filename, pred=valid_pred, label=label[test_indexes])

    print("load test data...", file=sys.stderr)
    print("=======================================================", file=sys.stderr)
    x_test = load_data(dataset="test")

    test_generator = ImageDataGenerator(
        x=x_test, augment=False, **common_kwargs)
    test_generator_aug = ImageDataGenerator(
        x=x_test, augment=True, **common_kwargs, **augment_kwargs)
    test_pred = _predict_with_tta(model, test_generator, test_generator_aug,
                                  x_test.shape[0], N_LABELS,
                                  TEST_TIME_AUGMENT)

    filename = os.path.join(test_predict_path, "{}.npz".format(exp_config))
    np.savez(file=filename, pred=test_pred)
def MMD(opts, resp_qz, sample_qz, resp_pz, sample_pz):
    """Compute the MMD between the prior and the aggregated posterior.

    Args:
        opts: options dict. Reads ``opts['mmd_kernel']`` (``'RBF'`` or
            ``'IMQ'``) and, for the IMQ kernel, ``opts['x_var']``.
        resp_qz: variational mixture responsibilities, [batch, K].
        sample_qz: latent samples from the variational posterior,
            [batch, K, zdim].
        resp_pz: prior mixture responsibilities, [K].
        sample_pz: latent samples from the prior, [batch, K, zdim].

    Returns:
        A scalar tensor holding the MMD estimate. For the IMQ kernel this
        is the sum of the estimates over a fixed list of kernel scales.

    Raises:
        ValueError: if ``opts['mmd_kernel']`` is neither 'RBF' nor 'IMQ'.
    """
    kernel = opts['mmd_kernel']
    # Fail fast, before any graph ops are created.
    if kernel not in ('RBF', 'IMQ'):
        raise ValueError('%s Unknown kernel' % kernel)

    # Unpacking two values also enforces that sample_qz is rank 3.
    _, zdim = sample_qz.get_shape().as_list()[1:]
    nf = tf.cast(utils.get_batch_size(sample_qz), tf.float32)
    half_size = tf.cast((nf * nf - nf) / 2, tf.int32)

    # Reshape resp_pz so that it broadcasts along the batch dimension.
    resp_pz = tf.expand_dims(resp_pz, axis=0)  # [1,K]

    # Pairwise squared distances; per the original annotations each one is
    # [batch,K,K,batch] (square_dist is defined elsewhere in this file).
    distances_pz = square_dist(sample_pz, sample_pz)
    distances_qz = square_dist(sample_qz, sample_qz)
    distances = square_dist(sample_qz, sample_pz)

    # Responsibility weights. They do not depend on the kernel or on its
    # scale, so they are built once instead of once per IMQ scale.
    resp_qz_broadcast_1 = tf.expand_dims(
        tf.expand_dims(resp_qz, axis=2), axis=2)  # [batch,K,1,1]
    resp_qz_broadcast_2 = tf.expand_dims(
        tf.expand_dims(tf.transpose(resp_qz), axis=0), axis=0)  # [1,1,K,batch]
    resp_pz_broadcast_1 = tf.expand_dims(
        tf.expand_dims(resp_pz, axis=2), axis=2)  # [1,K,1,1]
    resp_pz_broadcast_2 = tf.expand_dims(
        tf.expand_dims(tf.transpose(resp_pz), axis=0), axis=0)  # [1,1,K,1]
    weights_qq = resp_qz_broadcast_1 * resp_qz_broadcast_2
    weights_pp = resp_pz_broadcast_1 * resp_pz_broadcast_2
    weights_qp = resp_qz_broadcast_1 * resp_pz_broadcast_2

    def _weighted_mmd(kernel_fn):
        # MMD estimate for one kernel, given as a map over squared distances.
        res_q = kernel_fn(distances_qz) * weights_qq
        res_p = kernel_fn(distances_pz) * weights_pp
        within = res_q + res_p
        # Correction term: drop the same-sample pairs, i.e. the diagonal of
        # the [batch,batch] matrix left after summing out both K axes.
        res1 = tf.reduce_sum(within) - tf.linalg.trace(
            tf.reduce_sum(within, axis=[1, 2]))
        res1 /= nf * nf - nf
        # Cross term between posterior and prior samples.
        res_qp = kernel_fn(distances) * weights_qp
        res2 = tf.reduce_sum(res_qp) / (nf * nf)
        return res1 - 2. * res2

    if kernel == 'RBF':
        # Median-style heuristic for the (scalar) sigma^2 of the Gaussian
        # kernel. top_k over the flattened distances returns a rank-1
        # tensor, so it must be indexed with a single index.
        # NOTE(review): half_size is derived from a batch x batch matrix;
        # if the distances really are [batch,K,K,batch] this picks the
        # half_size-th largest entry, not a true median -- confirm intent.
        sigma2_k = tf.nn.top_k(
            tf.reshape(distances, [-1]), half_size).values[half_size - 1]
        sigma2_k += tf.nn.top_k(
            tf.reshape(distances_qz, [-1]), half_size).values[half_size - 1]
        return _weighted_mmd(lambda d: tf.exp(-d / 2. / sigma2_k))

    # IMQ kernel: k(x, y) = C / (C + ||x - y||^2), summed over several scales.
    # NOTE(review): (x_var + x_var) / 2 is just x_var; this looks like it was
    # meant to average two different variances -- confirm before simplifying.
    Cbase = 2 * zdim * ((opts['x_var'] + opts['x_var']) / 2.)**2
    res = 0.
    for scale in [.1, .2, .5, 1., 2., 5., 10.]:
        C = Cbase * scale
        res += _weighted_mmd(lambda d, C=C: C / (C + d))
    return res