def resnet_model_fn_w_pruning(features, labels, mode, params):
    """The model_fn for ResNet-50 with pruning.

    Args:
      features: A float32 batch of images, or a dict holding that batch under
        the key 'feature'.
      labels: A int32 batch of labels.
      mode: Specifies whether training, evaluation or prediction.
      params: Dictionary of parameters passed to the model. The keys read here
        are 'pruning_method', 'use_tpu', 'log_alpha_threshold' and (outside of
        PREDICT mode) 'output_dir'.

    Returns:
      An EstimatorSpec in PREDICT mode, otherwise a TPUEstimatorSpec.

    Raises:
      ValueError: If FLAGS.precision is neither 'bfloat16' nor 'float32', or
        if the 'scratch' pruning method is requested without a mask directory.
    """
    width = 1. if FLAGS.width <= 0 else FLAGS.width

    if isinstance(features, dict):
        features = features['feature']

    channels_first = FLAGS.data_format == 'channels_first'
    if channels_first:
        assert not FLAGS.transpose_input  # channels_first only for GPU
        features = tf.transpose(features, [0, 3, 1, 2])

    if FLAGS.transpose_input and mode != tf_estimator.ModeKeys.PREDICT:
        features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC

    # Normalize the image to zero mean and unit variance. The per-channel
    # statistics must broadcast along the channel axis, which is axis 1 after
    # the channels_first transpose above and the last axis otherwise.
    stats_shape = [3, 1, 1] if channels_first else [1, 1, 3]
    features -= tf.constant(MEAN_RGB, shape=stats_shape, dtype=features.dtype)
    features /= tf.constant(
        STDDEV_RGB, shape=stats_shape, dtype=features.dtype)

    pruning_method = params['pruning_method']
    use_tpu = params['use_tpu']
    log_alpha_threshold = params['log_alpha_threshold']

    def build_network():
        """Construct the network in the graph."""
        model_pruning_method = pruning_method
        if pruning_method == 'scratch':
            model_pruning_method = 'threshold'

        network = resnet_model.resnet_v1_(
            resnet_depth=FLAGS.resnet_depth,
            num_classes=FLAGS.num_label_classes,
            # we need to construct the model with the pruning masks, but they
            # won't be updated if we're doing scratch training
            pruning_method=model_pruning_method,
            init_method=FLAGS.init_method,
            width=width,
            prune_first_layer=FLAGS.prune_first_layer,
            prune_last_layer=FLAGS.prune_last_layer,
            data_format=FLAGS.data_format,
            end_sparsity=FLAGS.end_sparsity,
            clip_log_alpha=FLAGS.clip_log_alpha,
            log_alpha_threshold=log_alpha_threshold,
            weight_decay=FLAGS.weight_decay)
        return network(
            inputs=features,
            is_training=(mode == tf_estimator.ModeKeys.TRAIN))

    if FLAGS.precision == 'bfloat16':
        with contrib_tpu.bfloat16_scope():
            logits = build_network()
        logits = tf.cast(logits, tf.float32)
    elif FLAGS.precision == 'float32':
        logits = build_network()
    else:
        # Fail here with a clear message instead of hitting an unbound
        # `logits` further down.
        raise ValueError(
            'Unsupported precision: {}'.format(FLAGS.precision))

    if mode == tf_estimator.ModeKeys.PREDICT:
        predictions = {
            'classes': tf.argmax(logits, axis=1),
            'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
        }
        return tf_estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classify': tf_estimator.export.PredictOutput(predictions)
            })

    output_dir = params['output_dir']  # pylint: disable=unused-variable

    # Calculate loss, which includes softmax cross entropy and L2
    # regularization.
    one_hot_labels = tf.one_hot(labels, FLAGS.num_label_classes)

    # make sure we reuse the same label smoothing parameter if we're doing
    # scratch / lottery ticket experiments.
    label_smoothing = FLAGS.label_smoothing
    if FLAGS.pruning_method == 'scratch':
        # NOTE(review): the smoothing value is taken from the 16th
        # '/'-separated component of the mask directory path, so this only
        # works for one specific directory layout - confirm with callers.
        label_smoothing = float(FLAGS.load_mask_dir.split('/')[15])
    loss = tf.losses.softmax_cross_entropy(
        logits=logits,
        onehot_labels=one_hot_labels,
        label_smoothing=label_smoothing)

    # Add regularization loss term
    loss += tf.losses.get_regularization_loss()

    if pruning_method == 'variational_dropout':
        reg_loss = utils.variational_dropout_dkl_loss(
            reg_scalar=FLAGS.reg_scalar,
            start_reg_ramp_up=FLAGS.sparsity_begin_step,
            end_reg_ramp_up=FLAGS.sparsity_end_step,
            warm_up=FLAGS.is_warm_up,
            use_tpu=use_tpu)
        loss += reg_loss
        tf.losses.add_loss(reg_loss, loss_collection=tf.GraphKeys.LOSSES)
    elif pruning_method == 'l0_regularization':
        reg_loss = utils.l0_regularization_loss(
            reg_scalar=FLAGS.reg_scalar,
            start_reg_ramp_up=FLAGS.sparsity_begin_step,
            end_reg_ramp_up=FLAGS.sparsity_end_step,
            warm_up=FLAGS.is_warm_up,
            use_tpu=use_tpu)
        loss += reg_loss
        tf.losses.add_loss(reg_loss, loss_collection=tf.GraphKeys.LOSSES)

    host_call = None
    if mode == tf_estimator.ModeKeys.TRAIN:
        host_call, train_op = train_function(pruning_method, loss, output_dir,
                                             use_tpu)
    else:
        train_op = None

    eval_metrics = None
    if mode == tf_estimator.ModeKeys.EVAL:

        def metric_fn(labels, logits):
            """Calculate eval metrics."""
            logging.info('In metric function')
            eval_metrics = {}
            predictions = tf.cast(tf.argmax(logits, axis=1), tf.int32)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            eval_metrics['top_5_eval_accuracy'] = tf.metrics.mean(in_top_5)
            eval_metrics['eval_accuracy'] = tf.metrics.accuracy(
                labels=labels, predictions=predictions)
            return eval_metrics

        def vd_metric_fn(labels, logits, global_sparsity):
            """Eval metrics plus the mean global sparsity."""
            eval_metrics = metric_fn(labels, logits)
            eval_metrics['global_sparsity'] = tf.metrics.mean(global_sparsity)
            return eval_metrics

        tensors = [labels, logits]
        metric_function = metric_fn

        if FLAGS.pruning_method == 'variational_dropout':
            batch_size = labels.shape[0]
            ones = tf.ones([batch_size, 1])
            mask_metrics = utils.add_vd_pruning_summaries(
                threshold=FLAGS.log_alpha_threshold)
            # Multiply by a [batch_size, 1] tensor of ones so the sparsity
            # value is passed to the metric function with a batch dimension.
            tensors.append(mask_metrics['global_sparsity'] * ones)
            metric_function = vd_metric_fn

        eval_metrics = (metric_function, tensors)

    def training_already_started():
        """True when FLAGS.output_dir already contains a checkpoint."""
        model_dir = FLAGS.output_dir
        return bool(model_dir) and (
            tf.train.latest_checkpoint(model_dir) is not None)

    # define a custom scaffold function to enable initializing the mask from
    # an already trained checkpoint.
    def initialize_mask_from_ckpt(ckpt_path):
        """Load mask from an existing checkpoint."""
        if training_already_started():
            tf.logging.info(
                'Training already started on this model, not loading masks '
                'from previously trained model')
            return

        reader = tf.train.NewCheckpointReader(ckpt_path)
        # A set keeps the per-variable membership test below cheap.
        mask_names = {
            name for name in reader.get_variable_to_shape_map()
            if name.endswith('mask')
        }

        variable_map = {}
        for var in tf.global_variables():
            var_name = var.name.split(':')[0]
            if var_name in mask_names:
                tf.logging.info('Loading mask variable from checkpoint: %s',
                                var_name)
                variable_map[var_name] = var
            elif 'mask' in var_name:
                tf.logging.info(
                    'Cannot find mask variable in checkpoint, skipping: %s',
                    var_name)
        tf.train.init_from_checkpoint(ckpt_path, variable_map)

    def initialize_parameters_from_ckpt(ckpt_path):
        """Load parameters from an existing checkpoint."""
        if training_already_started():
            tf.logging.info(
                'Training already started on this model, not loading '
                'parameters from previously trained model')
            return

        reader = tf.train.NewCheckpointReader(ckpt_path)
        param_names = {
            name for name in reader.get_variable_to_shape_map()
            if not name.endswith('mask')
        }

        variable_map = {}
        for var in tf.global_variables():
            var_name = var.name.split(':')[0]
            if var_name in param_names:
                tf.logging.info(
                    'Loading parameter variable from checkpoint: %s',
                    var_name)
                variable_map[var_name] = var
            elif 'mask' not in var_name:
                tf.logging.info(
                    'Cannot find parameter variable in checkpoint, '
                    'skipping: %s', var_name)
        tf.train.init_from_checkpoint(ckpt_path, variable_map)

    if FLAGS.pruning_method == 'scratch':
        if not FLAGS.load_mask_dir:
            raise ValueError(
                'Must supply a mask directory to use scratch method')

        def scaffold_fn():
            initialize_mask_from_ckpt(FLAGS.load_mask_dir)
            if FLAGS.initial_value_checkpoint:
                initialize_parameters_from_ckpt(
                    FLAGS.initial_value_checkpoint)
            return tf.train.Scaffold()
    else:
        scaffold_fn = None

    return contrib_tpu.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        host_call=host_call,
        eval_metrics=eval_metrics,
        scaffold_fn=scaffold_fn)
def transformer_xl(inp_k, n_token, n_layer, d_model, n_head, d_head, d_inner,
                   dropout, dropatt, attn_type, bi_data, initializer,
                   is_training, mem_len=None, inp_q=None, mems=None,
                   same_length=False, clamp_len=-1, untie_r=False,
                   use_tpu=True, input_mask=None, perm_mask=None, seg_id=None,
                   reuse_len=None, ff_activation='relu', target_mapping=None,
                   use_bfloat16=False, scope='transformer', **kwargs):
    """Builds a Transformer-XL graph with the extra inputs XLNet needs.

    All attention layers are created under one shared variable scope
    ('layer_shared') that is reused from the second layer on.

    Args:
      inp_k: int32 Tensor in shape [len, bsz], the input token IDs.
      n_token: int, the vocab size.
      n_layer: int, the number of layers.
      d_model: int, the hidden size.
      n_head: int, the number of attention heads.
      d_head: int, the dimension size of each attention head.
      d_inner: int, the hidden size in feed-forward layers.
      dropout: float, dropout rate.
      dropatt: float, dropout rate on attention probabilities.
      attn_type: str, 'uni' (causal mask) or 'bi' (no causal mask).
      bi_data: bool, whether to use bidirectional input pipeline. Usually set
        to True during pretraining and False during finetuning.
      initializer: A tf initializer.
      is_training: bool, whether in training mode.
      mem_len: int, the number of tokens to cache.
      inp_q: float32 Tensor in shape [len, bsz]. 1 for tokens with losses and
        0 for tokens without losses. Only used during pretraining for
        two-stream attention. Set to None during finetuning.
      mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
        from previous batches. The length of the list equals n_layer. If
        None, no memory is used.
      same_length: bool, whether to use the same attention length for each
        token.
      clamp_len: int, clamp all relative distances larger than clamp_len. -1
        means no clamping.
      untie_r: bool, whether to untie the biases in attention.
      use_tpu: bool, whether TPUs are used.
      input_mask: float32 Tensor in shape [len, bsz], the input mask. 0 for
        real tokens and 1 for padding.
      perm_mask: float32 Tensor in shape [len, len, bsz]. If
        perm_mask[i, j, k] = 0, i attend to j in batch k; if
        perm_mask[i, j, k] = 1, i does not attend to j in batch k. If None,
        each position attends to all the others.
      seg_id: int32 Tensor in shape [len, bsz], the input segment IDs.
      reuse_len: int, the number of tokens in the current batch to be cached
        and reused in the future.
      ff_activation: str, "relu" or "gelu".
      target_mapping: float32 Tensor in shape [num_predict, len, bsz]. If
        target_mapping[i, j, k] = 1, the i-th predict in batch k is on the
        j-th token. Only used during pretraining for partial prediction. Set
        to None during finetuning.
      use_bfloat16: bool, use bfloat16 instead of float32.
      scope: scope name for the computation graph.
      **kwargs: accepted but not read by this function.

    Returns:
      A tuple (output, new_mems, lookup_table, lookup_table_2): the final
      hidden states after dropout (query stream when inp_q is given, content
      stream otherwise), the per-layer memory list, and the two tables
      returned by embedding_lookup.

    Raises:
      ValueError: If attn_type is neither 'uni' nor 'bi'.
    """
    tf.logging.info('memory input {}'.format(mems))
    float_dtype = tf.bfloat16 if use_bfloat16 else tf.float32
    tf.logging.info('Use float type {}'.format(float_dtype))

    two_stream = inp_q is not None
    has_segments = seg_id is not None
    # Untied biases carry a leading per-layer axis; tied ones are shared.
    bias_shape = [n_layer, n_head, d_head] if untie_r else [n_head, d_head]

    new_mems = []
    with tf.variable_scope(scope):
        r_w_bias = tf.get_variable(
            'r_w_bias', bias_shape, dtype=float_dtype,
            initializer=initializer)
        r_r_bias = tf.get_variable(
            'r_r_bias', bias_shape, dtype=float_dtype,
            initializer=initializer)

        bsz = tf.shape(inp_k)[1]
        qlen = tf.shape(inp_k)[0]
        mlen = 0 if mems is None else tf.shape(mems[0])[0]
        klen = mlen + qlen

        # ---- Attention mask ----
        # causal attention mask
        if attn_type == 'uni':
            causal_mask = _create_mask(qlen, mlen, float_dtype, same_length)
            attn_mask = causal_mask[:, :, None, None]
        elif attn_type == 'bi':
            attn_mask = None
        else:
            raise ValueError(
                'Unsupported attention type: {}'.format(attn_type))

        # data mask: input mask & perm mask
        if input_mask is None:
            data_mask = perm_mask
        elif perm_mask is None:
            data_mask = input_mask[None]
        else:
            data_mask = input_mask[None] + perm_mask

        if data_mask is not None:
            # all mems can be attended to
            mems_mask = tf.zeros(
                [tf.shape(data_mask)[0], mlen, bsz], dtype=float_dtype)
            data_mask = tf.concat([mems_mask, data_mask], 1)
            if attn_mask is None:
                attn_mask = data_mask[:, :, :, None]
            else:
                attn_mask += data_mask[:, :, :, None]

        if attn_mask is None:
            non_tgt_mask = None
        else:
            attn_mask = tf.cast(attn_mask > 0, dtype=float_dtype)
            # Subtracting the identity clears the diagonal of the query part
            # of the mask before it is thresholded again.
            neg_eye = -tf.eye(qlen, dtype=float_dtype)
            neg_eye = tf.concat(
                [tf.zeros([qlen, mlen], dtype=float_dtype), neg_eye],
                axis=-1)
            non_tgt_mask = tf.cast(
                (attn_mask + neg_eye[:, :, None, None]) > 0,
                dtype=float_dtype)

        # ---- Word embedding ----
        word_emb_k, lookup_table, lookup_table_2 = embedding_lookup(
            x=inp_k,
            n_token=n_token,
            d_embed=128,
            hidden_size=d_model,
            initializer=initializer,
            use_tpu=use_tpu,
            dtype=float_dtype,
            scope='word_embedding')

        if two_stream:
            with tf.variable_scope('mask_emb'):
                mask_emb = tf.get_variable(
                    'mask_emb', [1, 1, d_model], dtype=float_dtype)
                if target_mapping is None:
                    inp_q_ext = inp_q[:, :, None]
                    word_emb_q = (inp_q_ext * mask_emb +
                                  (1 - inp_q_ext) * word_emb_k)
                else:
                    word_emb_q = tf.tile(
                        mask_emb, [tf.shape(target_mapping)[0], bsz, 1])

        output_h = tf.layers.dropout(
            word_emb_k, dropout, training=is_training)
        if two_stream:
            output_g = tf.layers.dropout(
                word_emb_q, dropout, training=is_training)

        # ---- Segment embedding ----
        if has_segments:
            r_s_bias = tf.get_variable(
                'r_s_bias', bias_shape, dtype=float_dtype,
                initializer=initializer)
            seg_embed = tf.get_variable(
                'seg_embed', [n_layer, 2, n_head, d_head],
                dtype=float_dtype, initializer=initializer)

            # Convert `seg_id` to one-hot `seg_mat`
            mem_pad = tf.zeros([mlen, bsz], dtype=tf.int32)
            cat_ids = tf.concat([mem_pad, seg_id], 0)

            # `1` indicates not in the same segment [qlen x klen x bsz]
            different_segment = tf.logical_not(
                tf.equal(seg_id[:, None], cat_ids[None, :]))
            seg_mat = tf.one_hot(
                tf.cast(different_segment, tf.int32), 2, dtype=float_dtype)
        else:
            seg_mat = None

        # ---- Positional encoding ----
        pos_emb = relative_positional_encoding(
            qlen, klen, d_model, clamp_len, attn_type, bi_data,
            bsz=bsz, dtype=float_dtype)
        pos_emb = tf.layers.dropout(pos_emb, dropout, training=is_training)

        # ---- Attention layers ----
        if mems is None:
            mems = [None] * n_layer

        for i in range(n_layer):
            # cache new mems
            new_mems.append(
                _cache_mem(output_h, mems[i], mem_len, reuse_len))

            # segment bias
            if has_segments:
                r_s_bias_i = r_s_bias[i] if untie_r else r_s_bias
                seg_embed_i = seg_embed[i]
            else:
                r_s_bias_i = None
                seg_embed_i = None

            with tf.variable_scope('layer_shared', reuse=i > 0):
                r_w_bias_i = r_w_bias[i] if untie_r else r_w_bias
                r_r_bias_i = r_r_bias[i] if untie_r else r_r_bias

                # `reuse` is forwarded to the content-stream layers below.
                reuse = two_stream
                if two_stream:
                    output_h, output_g = two_stream_rel_attn(
                        h=output_h,
                        g=output_g,
                        r=pos_emb,
                        r_w_bias=r_w_bias_i,
                        r_r_bias=r_r_bias_i,
                        seg_mat=seg_mat,
                        r_s_bias=r_s_bias_i,
                        seg_embed=seg_embed_i,
                        attn_mask_h=non_tgt_mask,
                        attn_mask_g=attn_mask,
                        mems=mems[i],
                        target_mapping=target_mapping,
                        d_model=d_model,
                        n_head=n_head,
                        d_head=d_head,
                        dropout=dropout,
                        dropatt=dropatt,
                        is_training=is_training,
                        kernel_initializer=initializer)
                else:
                    output_h = rel_multihead_attn(
                        h=output_h,
                        r=pos_emb,
                        r_w_bias=r_w_bias_i,
                        r_r_bias=r_r_bias_i,
                        seg_mat=seg_mat,
                        r_s_bias=r_s_bias_i,
                        seg_embed=seg_embed_i,
                        attn_mask=non_tgt_mask,
                        mems=mems[i],
                        d_model=d_model,
                        n_head=n_head,
                        d_head=d_head,
                        dropout=dropout,
                        dropatt=dropatt,
                        is_training=is_training,
                        kernel_initializer=initializer,
                        reuse=reuse)

                if two_stream:
                    output_g = positionwise_ffn(
                        inp=output_g,
                        d_model=d_model,
                        d_inner=d_inner,
                        dropout=dropout,
                        kernel_initializer=initializer,
                        activation_type=ff_activation,
                        is_training=is_training)

                output_h = positionwise_ffn(
                    inp=output_h,
                    d_model=d_model,
                    d_inner=d_inner,
                    dropout=dropout,
                    kernel_initializer=initializer,
                    activation_type=ff_activation,
                    is_training=is_training,
                    reuse=reuse)

        final_stream = output_g if two_stream else output_h
        output = tf.layers.dropout(
            final_stream, dropout, training=is_training)

        return output, new_mems, lookup_table, lookup_table_2
def __init__(self, simulation, rnn_dim, rnn_cell, my_scope, num_actions,
             internal_states=2, learning_rate=0.0001, extra_layer=False):
    """Builds the dueling recurrent Q-network graph.

    The network receives the observation from both eyes, processes it through
    convolutional layers, concatenates it with the previous action and the
    internal state and feeds it to the RNN. A second, "reflected" copy of the
    network is built on the mirrored input with shared weights, and the
    outputs of the two copies are averaged.

    Args:
      simulation: object whose `fish.left_eye.vis_angles` length gives the
        number of rays per eye.
      rnn_dim: int, width of the dense layer feeding the RNN and of the RNN
        output as reshaped here.
      rnn_cell: RNN cell passed to `tf.nn.dynamic_rnn`; also used to create
        the zero initial state.
      my_scope: str prefix used for all layer and variable names.
      num_actions: int, number of discrete actions.
      internal_states: int, width of the internal state placeholder.
      learning_rate: float, learning rate of the Adam optimizer.
      extra_layer: bool, whether to add a second dense + recurrent stage.
    """
    self.num_arms = len(
        simulation.fish.left_eye.vis_angles)  # Rays for each eye
    self.rnn_dim = rnn_dim
    self.rnn_output_size = self.rnn_dim

    self.actions = tf.placeholder(shape=[None], dtype=tf.int32,
                                  name='actions')
    self.actions_one_hot = tf.one_hot(self.actions, num_actions,
                                      dtype=tf.float32)

    self.prev_actions = tf.placeholder(shape=[None], dtype=tf.int32,
                                       name='prev_actions')
    self.prev_actions_one_hot = tf.one_hot(self.prev_actions, num_actions,
                                           dtype=tf.float32)

    self.internal_state = tf.placeholder(shape=[None, internal_states],
                                         dtype=tf.float32,
                                         name='internal_state')

    self.observation = tf.placeholder(shape=[None, 3, 2], dtype=tf.float32,
                                      name='obs')
    self.reshaped_observation = tf.reshape(
        self.observation, shape=[-1, self.num_arms, 3, 2])
    self.left_eye = self.reshaped_observation[:, :, :, 0]
    self.right_eye = self.reshaped_observation[:, :, :, 1]

    # ------------ Common to Both ------------ #

    self.exp_keep = tf.placeholder(shape=None, dtype=tf.float32)
    self.Temp = tf.placeholder(shape=None, dtype=tf.float32)
    self.trainLength = tf.placeholder(dtype=tf.int32)
    self.batch_size = tf.placeholder(dtype=tf.int32, shape=[])
    self.state_in = rnn_cell.zero_state(self.batch_size, tf.float32)

    def conv_tower(eye, side, reuse=None):
        """Stacks the four conv1d layers for one eye; returns all outputs."""
        # (filters, kernel_size, strides) for conv1..conv4.
        layer_specs = ((16, 16, 4), (8, 8, 2), (8, 4, 1), (64, 4, 1))
        outputs = []
        current = eye
        for idx, (filters, kernel_size, strides) in enumerate(layer_specs,
                                                              start=1):
            current = tf.layers.conv1d(
                inputs=current,
                filters=filters,
                kernel_size=kernel_size,
                strides=strides,
                padding='valid',
                activation=tf.nn.relu,
                name=my_scope + '_conv%d%s' % (idx, side),
                reuse=reuse)
            outputs.append(current)
        return outputs

    # ------------ Normal network ------------ #

    (self.conv1l, self.conv2l, self.conv3l,
     self.conv4l) = conv_tower(self.left_eye, 'l')
    (self.conv1r, self.conv2r, self.conv3r,
     self.conv4r) = conv_tower(self.right_eye, 'r')

    # We take the output from the final convolutional layer and send it to a
    # recurrent layer. The input must be reshaped into [batch x trace x units]
    # for rnn processing, and then returned to [batch x units] when sent
    # through the upper levels.
    self.conv4l_flat = tf.layers.flatten(self.conv4l)
    self.conv4r_flat = tf.layers.flatten(self.conv4r)

    self.conv_with_states = tf.concat([
        self.conv4l_flat, self.conv4r_flat, self.prev_actions_one_hot,
        self.internal_state
    ], 1)
    self.rnn_in = tf.layers.dense(
        self.conv_with_states,
        self.rnn_dim,
        activation=tf.nn.relu,
        kernel_initializer=tf.orthogonal_initializer,
        trainable=True,
        name=my_scope + '_rnn_in')
    self.convFlat = tf.reshape(
        self.rnn_in, [self.batch_size, self.trainLength, self.rnn_dim])

    self.rnn, self.rnn_state = tf.nn.dynamic_rnn(
        inputs=self.convFlat,
        cell=rnn_cell,
        dtype=tf.float32,
        initial_state=self.state_in,
        scope=my_scope + '_rnn')
    self.rnn = tf.reshape(self.rnn, shape=[-1, self.rnn_dim])
    self.rnn_output = self.rnn

    if extra_layer:
        self.rnn_in2 = tf.layers.dense(
            self.rnn_output,
            self.rnn_dim,
            activation=tf.nn.relu,
            kernel_initializer=tf.orthogonal_initializer,
            trainable=True,
            name=my_scope + "_rnn_in_2")
        self.rnnFlat = tf.reshape(
            self.rnn_in2, [self.batch_size, self.trainLength, self.rnn_dim])

        # tf.nn.dynamic_rnn has no `name` argument; the variable scope is
        # selected through `scope` only.
        # NOTE(review): the same `rnn_cell` object is used as in the first
        # recurrent stage, so the two stages presumably share weights -
        # confirm this is intended.
        self.rnn2, self.rnn_state2 = tf.nn.dynamic_rnn(
            inputs=self.rnnFlat,
            cell=rnn_cell,
            dtype=tf.float32,
            initial_state=self.state_in,
            scope=my_scope + '_rnn2')
        self.rnn2 = tf.reshape(self.rnn2, shape=[-1, self.rnn_dim])
        self.rnn2_output = self.rnn2
        # The output from the recurrent layer is then split into separate
        # Value and Advantage streams
        self.streamA, self.streamV = tf.split(self.rnn2_output, 2, 1)
    else:
        self.rnn_state2 = self.rnn_state
        self.streamA, self.streamV = tf.split(self.rnn_output, 2, 1)

    self.AW = tf.Variable(
        tf.random_normal([self.rnn_output_size // 2, num_actions]),
        name=my_scope + "aw")
    self.VW = tf.Variable(
        tf.random_normal([self.rnn_output_size // 2, 1]),
        name=my_scope + "vw")
    self.Advantage = tf.matmul(self.streamA, self.AW)
    self.Value = tf.matmul(self.streamV, self.VW)

    # ------------ Reflected network ------------ #

    # Each eye's rays are reversed and the eyes are swapped.
    self.ref_left_eye = tf.reverse(self.right_eye, [1])
    self.ref_right_eye = tf.reverse(self.left_eye, [1])

    (self.conv1l_ref, self.conv2l_ref, self.conv3l_ref,
     self.conv4l_ref) = conv_tower(self.ref_left_eye, 'l', reuse=True)
    (self.conv1r_ref, self.conv2r_ref, self.conv3r_ref,
     self.conv4r_ref) = conv_tower(self.ref_right_eye, 'r', reuse=True)

    self.conv4l_flat_ref = tf.layers.flatten(self.conv4l_ref)
    self.conv4r_flat_ref = tf.layers.flatten(self.conv4r_ref)
    self.prev_actions_one_hot_rev = tf.reverse(self.prev_actions_one_hot,
                                               [1])
    self.internal_state_rev = tf.reverse(self.internal_state, [1])

    self.conv_with_states_ref = tf.concat([
        self.conv4l_flat_ref, self.conv4r_flat_ref,
        self.prev_actions_one_hot_rev, self.internal_state_rev
    ], 1)
    self.rnn_in_ref = tf.layers.dense(
        self.conv_with_states_ref,
        self.rnn_dim,
        activation=tf.nn.relu,
        kernel_initializer=tf.orthogonal_initializer,
        trainable=True,
        name=my_scope + '_rnn_in',
        reuse=True)
    self.convFlat_ref = tf.reshape(
        self.rnn_in_ref, [self.batch_size, self.trainLength, self.rnn_dim])

    self.rnn_ref, self.rnn_state_ref = tf.nn.dynamic_rnn(
        inputs=self.convFlat_ref,
        cell=rnn_cell,
        dtype=tf.float32,
        initial_state=self.state_in,
        scope=my_scope + '_rnn')
    self.rnn_ref = tf.reshape(self.rnn_ref, shape=[-1, self.rnn_dim])
    self.rnn_output_ref = self.rnn_ref

    if extra_layer:
        self.rnn_in2_ref = tf.layers.dense(
            self.rnn_output_ref,
            self.rnn_dim,
            activation=tf.nn.relu,
            kernel_initializer=tf.orthogonal_initializer,
            trainable=True,
            name=my_scope + "_rnn_in_2",
            reuse=True)
        self.rnnFlat_ref = tf.reshape(
            self.rnn_in2_ref,
            [self.batch_size, self.trainLength, self.rnn_dim])
        # tf.nn.dynamic_rnn accepts neither `name` nor `reuse`; it is called
        # here the same way as the reflected first recurrent stage above.
        self.rnn2_ref, self.rnn_state2_ref = tf.nn.dynamic_rnn(
            inputs=self.rnnFlat_ref,
            cell=rnn_cell,
            dtype=tf.float32,
            initial_state=self.state_in,
            scope=my_scope + '_rnn2')
        self.rnn2_ref = tf.reshape(self.rnn2_ref, shape=[-1, self.rnn_dim])
        self.rnn2_output_ref = self.rnn2_ref
        # The output from the recurrent layer is then split into separate
        # Value and Advantage streams
        self.streamA_ref, self.streamV_ref = tf.split(
            self.rnn2_output_ref, 2, 1)
    else:
        self.rnn_state2_ref = self.rnn_state_ref
        self.streamA_ref, self.streamV_ref = tf.split(
            self.rnn_output_ref, 2, 1)

    self.Value_ref = tf.matmul(self.streamV_ref, self.VW)
    self.Advantage_ref = tf.matmul(self.streamA_ref, self.AW)

    # Swapping columns in advantage (pairs 1<->2, 4<->5, 7<->8) - Note that
    # this is specific to the current action space and order
    adv_ref = self.Advantage_ref
    self.Advantage_ref = tf.concat([
        adv_ref[:, :1],
        adv_ref[:, 2:3],
        adv_ref[:, 1:2],
        adv_ref[:, 3:4],
        adv_ref[:, 5:6],
        adv_ref[:, 4:5],
        adv_ref[:, 6:7],
        adv_ref[:, 8:9],
        adv_ref[:, 7:8],
        adv_ref[:, 9:],
    ], axis=1)

    # ------------ Integrating Normal and Reflected ------------ #

    self.Value_final = tf.divide(tf.add(self.Value, self.Value_ref), 2)
    self.Advantage_final = tf.divide(
        tf.add(self.Advantage, self.Advantage_ref), 2)

    self.salience = tf.gradients(self.Advantage_final, self.observation)
    # Then combine them together to get our final Q-values.
    self.Q_out = self.Value_final + tf.subtract(
        self.Advantage_final,
        tf.reduce_mean(self.Advantage_final, axis=1, keep_dims=True))
    self.predict = tf.argmax(self.Q_out, 1)
    self.Q_dist = tf.nn.softmax(self.Q_out / self.Temp)

    # Below we obtain the loss by taking the sum of squares difference between
    # the target and prediction Q values.
    self.targetQ = tf.placeholder(shape=[None], dtype=tf.float32)

    self.Q = tf.reduce_sum(tf.multiply(self.Q_out, self.actions_one_hot),
                           axis=1)
    self.td_error = tf.square(self.targetQ - self.Q)

    # In order to only propagate accurate gradients through the network, we
    # will mask the first half of the losses for each trace as per Lample &
    # Chaplot 2016. The second half takes the remainder so that the mask has
    # exactly trainLength entries per trace even when trainLength is odd.
    masked_steps = self.trainLength // 2
    self.maskA = tf.zeros([self.batch_size, masked_steps])
    self.maskB = tf.ones(
        [self.batch_size, self.trainLength - masked_steps])
    self.mask = tf.concat([self.maskA, self.maskB], 1)
    self.mask = tf.reshape(self.mask, [-1])
    self.loss = tf.reduce_mean(self.td_error * self.mask)

    self.trainer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    self.updateModel = self.trainer.minimize(self.loss)
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
    """Performs various post-processing on a word embedding tensor.

    Optionally adds token-type and position embeddings to the input, then
    applies layer normalization and dropout.

    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length,
        embedding_size].
      use_token_type: bool. Whether to add embeddings for `token_type_ids`.
      token_type_ids: (optional) int32 Tensor of shape [batch_size,
        seq_length]. Must be specified if `use_token_type` is True.
      token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
      token_type_embedding_name: string. The name of the embedding table
        variable for token type ids.
      use_position_embeddings: bool. Whether to add position embeddings for
        the position of each token in the sequence.
      position_embedding_name: string. The name of the embedding table
        variable for positional embeddings.
      initializer_range: float. Range of the weight initialization.
      max_position_embeddings: int. Maximum sequence length that might ever be
        used with this model. This can be longer than the sequence length of
        input_tensor, but cannot be shorter.
      dropout_prob: float. Dropout probability applied to the final output
        tensor.

    Returns:
      float tensor with same shape as `input_tensor`.

    Raises:
      ValueError: One of the tensor shapes or input values is invalid.
    """
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    width = input_shape[2]

    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            raise ValueError("`token_type_ids` must be specified if "
                             "`use_token_type` is True.")
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        # This vocab will be small so we always do one-hot here, since it is
        # always faster for a small vocabulary.
        flat_token_type_ids = tf.reshape(token_type_ids, [-1])
        one_hot_ids = tf.one_hot(flat_token_type_ids,
                                 depth=token_type_vocab_size)
        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(token_type_embeddings,
                                           [batch_size, seq_length, width])
        output += token_type_embeddings

    if use_position_embeddings:
        # Create the variable outside the assertion to avoid TF2 compatibility
        # issues.
        full_position_embeddings = tf.get_variable(
            name=position_embedding_name,
            shape=[max_position_embeddings, width],
            initializer=create_initializer(initializer_range))

        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        with tf.control_dependencies([assert_op]):
            # Since the position embedding table is a learned variable, we
            # create it using a (long) sequence length
            # `max_position_embeddings`. The actual sequence length might be
            # shorter than this, for faster training of tasks that do not have
            # long sequences.
            #
            # So `full_position_embeddings` is effectively an embedding table
            # for position [0, 1, 2, ..., max_position_embeddings-1], and the
            # current sequence has positions [0, 1, 2, ... seq_length-1], so
            # we can just perform a slice.
            position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                           [seq_length, -1])
            num_dims = len(output.shape.as_list())

            # Only the last two dimensions are relevant (`seq_length` and
            # `width`), so we broadcast among the first dimensions, which is
            # typically just the batch size.
            position_broadcast_shape = (
                [1] * (num_dims - 2) + [seq_length, width])
            position_embeddings = tf.reshape(position_embeddings,
                                             position_broadcast_shape)
            output += position_embeddings

    output = layer_norm_and_dropout(output, dropout_prob)
    return output
def build():
    """Constructs the event-sequence RNN graph in the default TF graph.

    This is a closure: `mode`, `hparams`, `encoder_decoder`, `num_classes`,
    `no_event_label`, `input_size` and `sequence_example_file_paths` are all
    read from the enclosing scope.

    * 'train' / 'eval': reads padded batches, builds the loss and metrics and
      registers every summarized value both as a scalar summary and in a
      graph collection named after the metric. 'train' also stores the train
      op in the 'train_op' collection; 'eval' stores the metric update ops in
      the 'eval_ops' collection.
    * 'generate': feeds inputs through a placeholder and fills the 'inputs',
      'temperature', 'softmax', 'initial_state' and 'final_state'
      collections.
    """
    supervised = mode in ('train', 'eval')
    # A scalar `num_classes` means one softmax head; otherwise it is a
    # sequence with one entry per head.
    single_head = isinstance(num_classes, numbers.Number)

    labels = None
    lengths = None
    if supervised:
        label_shape = (
            [] if isinstance(no_event_label, numbers.Number)
            else [len(no_event_label)])
        inputs, labels, lengths = magenta.common.get_padded_batch(
            sequence_example_file_paths,
            hparams.batch_size,
            input_size,
            label_shape=label_shape,
            shuffle=(mode == 'train'))
    elif mode == 'generate':
        inputs = tf.placeholder(
            tf.float32, [hparams.batch_size, None, input_size])
    else:
        inputs = None

    if isinstance(encoder_decoder,
                  note_seq.OneHotIndexEventSequenceEncoderDecoder):
        # Inputs carry class indices in their last axis; expand to one-hot.
        event_ids = tf.cast(tf.squeeze(inputs, axis=-1), tf.int64)
        expanded_inputs = tf.one_hot(event_ids, encoder_decoder.input_depth)
    else:
        expanded_inputs = inputs

    # Dropout is disabled while generating.
    if mode == 'generate':
        keep_prob = 1.0
    else:
        keep_prob = hparams.dropout_keep_prob

    cell = make_rnn_cell(
        hparams.rnn_layer_sizes,
        dropout_keep_prob=keep_prob,
        attn_length=hparams.attn_length,
        residual_connections=hparams.residual_connections)
    initial_state = cell.zero_state(hparams.batch_size, tf.float32)
    outputs, final_state = tf.nn.dynamic_rnn(
        cell,
        expanded_inputs,
        sequence_length=lengths,
        initial_state=initial_state,
        swap_memory=True)

    outputs_flat = magenta.common.flatten_maybe_padded_sequences(
        outputs, lengths)
    num_logits = num_classes if single_head else sum(num_classes)
    logits_flat = tf_slim.layers.linear(outputs_flat, num_logits)

    if supervised:
        labels_flat = magenta.common.flatten_maybe_padded_sequences(
            labels, lengths)

        if single_head:
            softmax_cross_entropy = (
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=labels_flat, logits=logits_flat))
            predictions_flat = tf.argmax(logits_flat, axis=1)
        else:
            # Each head owns a contiguous column range of `logits_flat`;
            # the cumulative sum gives the range boundaries.
            bounds = np.cumsum([0] + num_classes)
            softmax_cross_entropy = []
            head_predictions = []
            for head, (lo, hi) in enumerate(zip(bounds[:-1], bounds[1:])):
                head_xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=labels_flat[:, head],
                    logits=logits_flat[:, lo:hi])
                softmax_cross_entropy.append(head_xent)
                head_predictions.append(
                    tf.argmax(logits_flat[:, lo:hi], axis=1))
            predictions_flat = tf.stack(head_predictions, 1)

        correct_predictions = tf.to_float(
            tf.equal(labels_flat, predictions_flat))
        event_positions = tf.to_float(
            tf.not_equal(labels_flat, no_event_label))
        no_event_positions = tf.to_float(
            tf.equal(labels_flat, no_event_label))

        # The number of time steps the labels stand for is computed by the
        # encoder/decoder in Python (via py_func), summed over the unpadded
        # part of every sequence in the batch.
        def count_steps(label_batch, length_batch):
            total = sum(
                encoder_decoder.labels_to_num_steps(sequence[:sequence_len])
                for sequence, sequence_len in zip(label_batch, length_batch))
            return np.float32(total)

        num_steps = tf.py_func(count_steps, [labels, lengths], tf.float32)

        if mode == 'train':
            loss = tf.reduce_mean(softmax_cross_entropy)
            perplexity = tf.exp(loss)
            accuracy = tf.reduce_mean(correct_predictions)
            event_accuracy = (
                tf.reduce_sum(correct_predictions * event_positions) /
                tf.reduce_sum(event_positions))
            no_event_accuracy = (
                tf.reduce_sum(correct_predictions * no_event_positions) /
                tf.reduce_sum(no_event_positions))
            loss_per_step = tf.reduce_sum(softmax_cross_entropy) / num_steps
            perplexity_per_step = tf.exp(loss_per_step)

            optimizer = tf.train.AdamOptimizer(
                learning_rate=hparams.learning_rate)
            train_op = tf_slim.learning.create_train_op(
                loss, optimizer, clip_gradient_norm=hparams.clip_norm)
            tf.add_to_collection('train_op', train_op)

            vars_to_summarize = {
                'loss': loss,
                'metrics/perplexity': perplexity,
                'metrics/accuracy': accuracy,
                'metrics/event_accuracy': event_accuracy,
                'metrics/no_event_accuracy': no_event_accuracy,
                'metrics/loss_per_step': loss_per_step,
                'metrics/perplexity_per_step': perplexity_per_step,
            }
        elif mode == 'eval':
            streaming_metrics = {
                'loss':
                    tf.metrics.mean(softmax_cross_entropy),
                'metrics/accuracy':
                    tf.metrics.accuracy(labels_flat, predictions_flat),
                'metrics/per_class_accuracy':
                    tf.metrics.mean_per_class_accuracy(
                        labels_flat, predictions_flat, num_classes),
                'metrics/event_accuracy':
                    tf.metrics.recall(event_positions, correct_predictions),
                'metrics/no_event_accuracy':
                    tf.metrics.recall(no_event_positions,
                                      correct_predictions),
                'metrics/loss_per_step':
                    tf.metrics.mean(
                        tf.reduce_sum(softmax_cross_entropy) / num_steps,
                        weights=num_steps),
            }
            vars_to_summarize, update_ops = (
                tf_slim.metrics.aggregate_metric_map(streaming_metrics))
            for update_op in update_ops.values():
                tf.add_to_collection('eval_ops', update_op)

            # Perplexities are derived from the aggregated losses, so they
            # need no update op of their own.
            vars_to_summarize['metrics/perplexity'] = tf.exp(
                vars_to_summarize['loss'])
            vars_to_summarize['metrics/perplexity_per_step'] = tf.exp(
                vars_to_summarize['metrics/loss_per_step'])

        for summary_name, summary_value in vars_to_summarize.items():
            tf.summary.scalar(summary_name, summary_value)
            tf.add_to_collection(summary_name, summary_value)

    elif mode == 'generate':
        temperature = tf.placeholder(tf.float32, [])
        if single_head:
            softmax_flat = tf.nn.softmax(
                tf.div(logits_flat, tf.fill([num_classes], temperature)))
            softmax = tf.reshape(
                softmax_flat, [hparams.batch_size, -1, num_classes])
        else:
            bounds = np.cumsum([0] + num_classes)
            softmax = []
            for head, head_size in enumerate(num_classes):
                head_softmax = tf.nn.softmax(
                    tf.div(logits_flat[:, bounds[head]:bounds[head + 1]],
                           tf.fill([head_size], temperature)))
                head_softmax = tf.reshape(
                    head_softmax, [hparams.batch_size, -1, head_size])
                softmax.append(head_softmax)

        tf.add_to_collection('inputs', inputs)
        tf.add_to_collection('temperature', temperature)
        tf.add_to_collection('softmax', softmax)
        # State tuples are stored flattened, one tensor per collection
        # entry, for metagraph compatibility.
        for state_tensor in tf.nest.flatten(initial_state):
            tf.add_to_collection('initial_state', state_tensor)
        for state_tensor in tf.nest.flatten(final_state):
            tf.add_to_collection('final_state', state_tensor)
def batch_gather_by_one_hot(params: tf.Tensor,
                            indices: tf.Tensor,
                            batch_dims: Optional[int] = None,
                            name: Optional[Text] = None) -> tf.Tensor:
    """Batched gather implemented as a one-hot matrix multiplication.

    `params` and `indices` must agree on their leading `batch_dims`
    dimensions. The lookup is expressed as `one_hot(indices) @ params`, which
    is meant to be TPU friendly at the price of extra work: the one-hot
    tensor that gets materialized holds
    `lookup_size * indices.shape.num_elements()` elements, and the running
    time grows by the same `lookup_size` factor.

    Args:
      params: <float32>[...some_batch_dims, lookup_size, ...] Tensor holding
        the values to pick from.
      indices: <int>[...some_batch_dims, ...index_dims...] Tensor of
        positions along the `lookup_size` axis of `params`. Positions outside
        [0, lookup_size) produce zeros in the output.
      batch_dims: How many leading dimensions are batch dimensions. Must be
        positive. When None, `len(indices.shape) - 1` is used.
      name: Optional name for the operation.

    Returns:
      [indices.shape, params.shape[(batch_dims+1):]] Tensor.

    Raises:
      ValueError: If the batch dimension count is not positive, if either
        argument has too few dimensions for it, or if the batch dimensions
        of `params` and `indices` are incompatible.
    """
    with tf.name_scope(name or 'batch_gather_by_one_hot'):
        params = tf.convert_to_tensor(params)
        indices = tf.convert_to_tensor(indices)

        # `batch_dims` is a count, not a list of axes (the argument name just
        # mirrors `tf.gather`), so a clearer local name is used from here on.
        if batch_dims is None:
            num_batch_dims = len(indices.shape) - 1
        else:
            num_batch_dims = batch_dims

        if num_batch_dims <= 0:
            raise ValueError('`num_batch_dims` must be positive.')
        if num_batch_dims >= len(params.shape):
            raise ValueError('`params` has too few dimensions.')
        if num_batch_dims > len(indices.shape):
            raise ValueError('`indices` has too few dimensions.')
        params_batch_shape = params.shape[:num_batch_dims]
        indices_batch_shape = indices.shape[:num_batch_dims]
        if not params_batch_shape.is_compatible_with(indices_batch_shape):
            raise ValueError('`params` and `indices` must have compatible batch '
                             'dimensions.')

        lookup_size = tf.shape(params)[num_batch_dims]

        # Collapse every "index_dims" axis of `indices` into one axis.
        collapsed_indices = tf.reshape(
            indices,
            tf.concat([tf.shape(indices)[:num_batch_dims], [-1]], 0))
        selectors = tf.one_hot(
            collapsed_indices, lookup_size, dtype=params.dtype)

        # Collapse every `params` axis after the lookup axis into one axis
        # (this adds a trailing axis when there is none).
        collapsed_params = tf.reshape(
            params,
            tf.concat([tf.shape(params)[:(num_batch_dims + 1)], [-1]], 0))

        gathered = tf.matmul(selectors, collapsed_params)

        # Restore the original index dims followed by the per-item dims.
        result_shape = tf.concat(
            [tf.shape(indices),
             tf.shape(params)[(num_batch_dims + 1):]], 0)
        return tf.reshape(gathered, result_shape)
def call(self,
         yesno_logits,
         yesno_labels,
         supporting_fact_logits,
         supporting_fact_labels,
         block_ids,
         num_replicas=None,
         eps=0):
    """Computes the yes/no answer loss and the supporting-fact loss.

    Both losses are negative log-probabilities, where the probabilities come
    from `losses.cross_batch_softmax` applied with a mask that links every
    example to all examples (optionally across replicas) sharing its
    `block_id`. Examples whose `block_id` is 0 are excluded from both means.

    Args:
      yesno_logits: <float32>[batch_size, 3] Logits for the yes/no answer
        classes.
      yesno_labels: [batch_size] Class ids passed to `tf.one_hot` with
        depth 3. NOTE(review): presumably int32 -- confirm with callers.
      supporting_fact_logits: <float32>[batch_size] Logits for the
        supporting-fact classification.
      supporting_fact_labels: [batch_size] Supporting-fact labels. They are
        cast to float32 and also multiplied into the yes/no one-hot labels,
        so a zero label zeroes out that example's yes/no target.
      block_ids: <int32>[batch_size] Block IDs of every sample in the batch.
        A value of 0 marks an example that does not contribute to the loss.
      num_replicas: Number of replicas to gather `block_ids` from. If None
        (default) only the local batch is used.
      eps: <float> Small constant added inside the logarithms and to the
        normalizer for numerical stability. With the default of 0, any
        position whose target is 0 evaluates `log(0)`; callers presumably
        pass a small positive value -- TODO confirm.

    Returns:
      A tuple `(yes_no_span_loss, supporting_facts_loss)` of scalar tensors.
    """
    supporting_fact_logits = tf.expand_dims(supporting_fact_logits, 1)
    supporting_fact_labels = tf.expand_dims(supporting_fact_labels, 1)
    # [batch_size, 1]; 1.0 for examples with a non-zero block id.
    example_mask = tf.cast(
        tf.expand_dims(tf.not_equal(block_ids, 0), 1), tf.float32)

    # (1) Aggregate block_ids across the global batch and compute the
    # cross-block equality mask.
    all_block_ids = block_ids
    if num_replicas:
        all_block_ids = tpu_utils.cross_replica_concat(
            tensor=all_block_ids,
            num_replicas=num_replicas,
            name='block_ids_concat')

    # [batch_size, global_batch_size]
    cross_blocks_eq_mask = tf.cast(
        tf.equal(
            tf.expand_dims(block_ids, 1), tf.expand_dims(all_block_ids, 0)),
        tf.float32)

    # (2) Apply softmax over all positions in the (global) batch across the
    # blocks with the same `block_id`.

    # [batch_size, 3, 1] going in; the trailing axis is squeezed afterwards.
    yes_no_span_probs = losses.cross_batch_softmax(
        tf.expand_dims(yesno_logits, 2), cross_blocks_eq_mask, num_replicas)
    yes_no_span_probs = tf.squeeze(yes_no_span_probs, 2)

    # [batch_size, 1]
    supporting_facts_probs = losses.cross_batch_softmax(
        tf.expand_dims(supporting_fact_logits, 2), cross_blocks_eq_mask,
        num_replicas)
    supporting_facts_probs = tf.squeeze(supporting_facts_probs, 2)

    # (3) Prepare float targets: the supporting-fact labels themselves and
    # the yes/no one-hot labels gated by the supporting-fact labels.
    supporting_fact_labels = tf.cast(supporting_fact_labels, tf.float32)

    # [batch_size, 3]
    yes_no_span_one_hot = tf.one_hot(yesno_labels, depth=3, dtype=tf.float32)
    yes_no_span_one_hot = yes_no_span_one_hot * supporting_fact_labels

    # (4) Average the negative log-probability of the targets over the
    # examples selected by `example_mask`.
    def mean_loss(all_losses):
        return tf.reduce_sum(all_losses * example_mask) / (
            tf.reduce_sum(example_mask) + eps)

    supporting_facts_loss = -mean_loss(
        tf.log(supporting_facts_probs * supporting_fact_labels + eps))

    yes_no_span_loss = -mean_loss(
        tf.log(yes_no_span_probs * yes_no_span_one_hot + eps))

    return yes_no_span_loss, supporting_facts_loss
def testLoss(self):
    """Checks the FasterRCNN losses on a perfect and on a wrong prediction.

    Two prediction dicts are built against the same targets: one whose
    scores and box deltas match the targets, and one with flipped RPN class
    scores and random RCNN outputs. Every loss of the matching prediction
    must be exactly 0, and every loss of the other one must reach the lower
    bound listed in `loss_random_compare`.
    """
    # Fixed seeds for stable results.
    rand_seed = 13
    target_seed = 43
    image_size = (60, 80)
    num_anchors = 1000

    # Turn off every regularization term so only the data losses remain.
    config = EasyDict(self.config)
    config.model.rpn.l2_regularization_scale = 0.0
    config.model.rcnn.l2_regularization_scale = 0.0
    config.model.base_network.arg_scope.weight_decay = 0.0

    def uniform(shape, low, high, seed):
        """Seeded float32 uniform sample in [low, high)."""
        return tf.random_uniform(
            shape, minval=low, maxval=high, dtype=tf.float32, seed=seed,
            name=None)

    # ---- RPN ----
    # Class targets drawn from {-1: ignore, 0: background, 1: object}.
    rpn_cls_target = tf.floor(uniform([num_anchors], -1, 2, target_seed))

    # Wrong scores: score 10 on the flipped class ((target + 1) mod 2).
    flipped_class = tf.cast(
        tf.mod(tf.identity(rpn_cls_target) + 1, 2), tf.int32)
    rpn_cls_score = tf.cast(
        tf.one_hot(flipped_class, depth=2, on_value=10), tf.float32)

    # Perfect scores: score 100 on the target class.
    target_class = tf.cast(tf.identity(rpn_cls_target), tf.int32)
    rpn_cls_perf_score = tf.cast(
        tf.one_hot(target_class, depth=2, on_value=100), tf.float32)

    # Target box deltas and (differently seeded) predicted box deltas.
    rpn_bbox_target = tf.floor(uniform([num_anchors, 4], -1, 1, target_seed))
    rpn_bbox_predictions = tf.floor(
        uniform([num_anchors, 4], -1, 1, rand_seed))

    # ---- RCNN ----
    num_classes = config.model.network.num_classes

    # A single proposal whose correct class is 1, with random box offsets.
    random_target = {
        'bbox_offsets': uniform([1, 4], -1, 1, target_seed),
        'cls': [1]
    }
    # The perfect prediction is scored against the same targets.
    perfect_target = dict(random_target)

    # Random scores for the num_classes classes plus background.
    rcnn_cls_score = uniform([1, num_classes + 1], -100, 100, rand_seed)
    # Perfect scores: 100 on class 1 and 0 everywhere else.
    rcnn_cls_perf_score = tf.cast(
        tf.one_hot([1], depth=num_classes + 1, on_value=100), tf.float32)

    # Random box deltas for every class.
    rcnn_bbox_offsets = uniform([1, num_classes * 4], -1, 1, rand_seed)

    # Perfect box deltas: start from random values and overwrite the four
    # entries at positions [4, 8) with the target offsets.
    target_bbox_offsets = random_target['bbox_offsets']
    initial_val = 1 * 4  # cls value * 4
    perf_offsets_var = tf.Variable(
        tf.reshape(uniform([1, num_classes * 4], -1, 1, target_seed), [-1]))
    rcnn_bbox_perf_offsets = tf.reshape(
        tf.scatter_update(
            perf_offsets_var,
            tf.range(initial_val, initial_val + 4),
            tf.reshape(target_bbox_offsets, [-1])),
        [1, -1])

    def make_prediction_dict(rpn_scores, rpn_bbox_pred, rcnn_target,
                             rcnn_scores, rcnn_bbox):
        """Assembles the nested prediction structure passed to the loss."""
        return {
            'rpn_prediction': {
                'rpn_cls_score': rpn_scores,
                'rpn_cls_target': rpn_cls_target,
                'rpn_bbox_target': rpn_bbox_target,
                'rpn_bbox_pred': rpn_bbox_pred,
            },
            'classification_prediction': {
                'rcnn': {
                    'cls_score': rcnn_scores,
                    'bbox_offsets': rcnn_bbox
                },
                'target': rcnn_target,
                '_debug': {
                    'losses': {}
                }
            }
        }

    prediction_dict_random = make_prediction_dict(
        rpn_cls_score, rpn_bbox_predictions, random_target,
        rcnn_cls_score, rcnn_bbox_offsets)
    # The perfect RPN box prediction is the target itself.
    prediction_dict_perf = make_prediction_dict(
        rpn_cls_perf_score, rpn_bbox_target, perfect_target,
        rcnn_cls_perf_score, rcnn_bbox_perf_offsets)

    loss_perfect = self._get_losses(config, prediction_dict_perf, image_size)
    loss_random = self._get_losses(config, prediction_dict_random,
                                   image_size)

    # Lower bounds expected from the wrong prediction, per loss name.
    loss_random_compare = {
        'rcnn_cls_loss': 5,
        'rcnn_reg_loss': 3,
        'rpn_cls_loss': 5,
        'rpn_reg_loss': 3,
        'no_reg_loss': 16,
        'regularization_loss': 0,
        'total_loss': 22,
    }
    for loss_name in loss_random:
        self.assertGreaterEqual(loss_random[loss_name],
                                loss_random_compare[loss_name], loss_name)
        self.assertEqual(loss_perfect[loss_name], 0, loss_name)
def resnet_model_fn(features, labels, mode, params):
    """The model_fn for ResNet to be used with TPUEstimator.

    Args:
      features: `Tensor` of batched images. If transpose_input is enabled, it
        is transposed to device layout and reshaped to 1D tensor. A dict is
        also accepted, in which case its "feature" entry is used.
      labels: `Tensor` of labels for the data samples
      mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`
      params: `dict` of parameters passed to the model from the TPUEstimator,
        `params['batch_size']` is always provided and should be used as the
        effective batch size. The remaining keys read here are
        "data_format", "transpose_input", "dropblock_groups",
        "dropblock_keep_prob", "dropblock_size", "train_steps",
        "resnet_depth", "num_label_classes", "precision", "label_smoothing",
        "enable_lars", "weight_decay", "num_train_images",
        "train_batch_size", "momentum", "use_tpu", "skip_host_call" and
        "iterations_per_loop".

    Returns:
      A `TPUEstimatorSpec` for the model (a plain `EstimatorSpec` in PREDICT
      mode).

    Raises:
      ValueError: If `params["dropblock_groups"]` contains a group outside
        1..4, or if `params["precision"]` is neither "bfloat16" nor
        "float32".
    """
    if isinstance(features, dict):
        features = features["feature"]

    # In most cases, the default data format NCHW instead of NHWC should be
    # used for a significant performance boost on GPU/TPU. NHWC should be used
    # only if the network needs to be run on CPU since the pooling operations
    # are only supported on NHWC.
    if params["data_format"] == "channels_first":
        assert not params["transpose_input"]  # channels_first only for GPU
        features = tf.transpose(features, [0, 3, 1, 2])

    if params["transpose_input"] and mode != tf.estimator.ModeKeys.PREDICT:
        # The flat input holds image_size * image_size * 3 * batch values, so
        # the side length is recovered from the element count and the batch
        # size (taken from `labels`).
        # NOTE(review): `image_size` is the result of `tf.sqrt` on a true
        # division, i.e. a floating point tensor, while `tf.reshape` takes an
        # integer shape -- confirm this path works on the TF version in use.
        image_size = tf.sqrt(
            tf.shape(features)[0] / (3 * tf.shape(labels)[0]))
        features = tf.reshape(features, [image_size, image_size, 3, -1])
        features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC

    # Normalize the image to zero mean and unit variance.
    features -= tf.constant(MEAN_RGB, shape=[1, 1, 3], dtype=features.dtype)
    features /= tf.constant(STDDEV_RGB, shape=[1, 1, 3], dtype=features.dtype)

    # DropBlock keep_prob for the 4 block groups of ResNet architecture.
    # None means applying no DropBlock at the corresponding block group.
    dropblock_keep_probs = [None] * 4
    if params["dropblock_groups"]:
        # Scheduled keep_prob for DropBlock: decreases linearly from 1 at
        # step 0 to params["dropblock_keep_prob"] at step `train_steps`.
        train_steps = tf.cast(params["train_steps"], tf.float32)
        current_step = tf.cast(tf.train.get_global_step(), tf.float32)
        current_ratio = current_step / train_steps
        dropblock_keep_prob = 1 - current_ratio * (
            1 - params["dropblock_keep_prob"])

        # Computes DropBlock keep_prob for different block groups of ResNet.
        dropblock_groups = [
            int(x) for x in params["dropblock_groups"].split(",")
        ]
        for block_group in dropblock_groups:
            if block_group < 1 or block_group > 4:
                raise ValueError(
                    "dropblock_groups should be a comma separated list of "
                    "integers between 1 and 4 (dropblock_groups: {}).".format(
                        params["dropblock_groups"]))
            # The drop rate shrinks by a factor of 4 for every group before
            # the last one.
            dropblock_keep_probs[block_group - 1] = 1 - (
                (1 - dropblock_keep_prob) / 4.0**(4 - block_group))

    # This nested function allows us to avoid duplicating the logic which
    # builds the network, for different values of --precision.
    def build_network():
        network = resnet_model.resnet_v1(
            resnet_depth=params["resnet_depth"],
            num_classes=params["num_label_classes"],
            dropblock_size=params["dropblock_size"],
            dropblock_keep_probs=dropblock_keep_probs,
            data_format=params["data_format"],
        )
        return network(
            inputs=features,
            is_training=(mode == tf.estimator.ModeKeys.TRAIN))

    if params["precision"] == "bfloat16":
        with tf.tpu.bfloat16_scope():
            logits = build_network()
        logits = tf.cast(logits, tf.float32)
    elif params["precision"] == "float32":
        logits = build_network()
    else:
        # Without this branch `logits` would stay unbound and the failure
        # would surface later as an unrelated NameError.
        raise ValueError(
            'Unsupported precision: {} (expected "bfloat16" or '
            '"float32").'.format(params["precision"]))

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            "classes": tf.argmax(logits, axis=1),
            "probabilities": tf.nn.softmax(logits, name="softmax_tensor"),
        }
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                "classify": tf.estimator.export.PredictOutput(predictions)
            },
        )

    # If necessary, in the model_fn, use params['batch_size'] instead the batch
    # size flags (--train_batch_size or --eval_batch_size).
    batch_size = params["batch_size"]  # pylint: disable=unused-variable

    # Calculate loss, which includes softmax cross entropy and L2
    # regularization.
    one_hot_labels = tf.one_hot(labels, params["num_label_classes"])
    cross_entropy = tf.losses.softmax_cross_entropy(
        logits=logits,
        onehot_labels=one_hot_labels,
        label_smoothing=params["label_smoothing"],
    )

    # Add weight decay to the loss for non-batch-normalization variables.
    # With LARS enabled no weight-decay term is added to the loss here.
    if params["enable_lars"]:
        loss = cross_entropy
    else:
        loss = cross_entropy + params["weight_decay"] * tf.add_n([
            tf.nn.l2_loss(v)
            for v in tf.trainable_variables()
            if "batch_normalization" not in v.name
        ])

    host_call = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        # Compute the current epoch and associated learning rate from
        # global_step.
        global_step = tf.train.get_global_step()
        steps_per_epoch = (
            params["num_train_images"] / params["train_batch_size"])
        current_epoch = tf.cast(global_step, tf.float32) / steps_per_epoch

        # LARS is a large batch optimizer. LARS enables higher accuracy at
        # batch 16K and larger batch sizes.
        if params["enable_lars"]:
            # The LARS optimizer is built from `current_epoch` and `params`;
            # 0.0 is only the value reported in the summary below.
            learning_rate = 0.0
            optimizer = lars_util.init_lars_optimizer(current_epoch, params)
        else:
            learning_rate = learning_rate_schedule(params, current_epoch)
            optimizer = tf.train.MomentumOptimizer(
                learning_rate=learning_rate,
                momentum=params["momentum"],
                use_nesterov=True,
            )
        if params["use_tpu"]:
            # When using TPU, wrap the optimizer with CrossShardOptimizer
            # which handles synchronization details between different TPU
            # cores. To the user, this should look like regular synchronous
            # training.
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)

        # Batch normalization requires UPDATE_OPS to be added as a dependency
        # to the train operation.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step)

        if not params["skip_host_call"]:

            def host_call_fn(gs, loss, lr, ce):
                """Training host call. Creates scalar summaries for training metrics.

                This function is executed on the CPU and should not directly
                reference any Tensors in the rest of the `model_fn`. To pass
                Tensors from the model to the `metric_fn`, provide as part of
                the `host_call`. See
                https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
                for more information.

                Arguments should match the list of `Tensor` objects passed as
                the second element in the tuple passed to `host_call`.

                Args:
                  gs: `Tensor with shape `[batch]` for the global_step
                  loss: `Tensor` with shape `[batch]` for the training loss.
                  lr: `Tensor` with shape `[batch]` for the learning_rate.
                  ce: `Tensor` with shape `[batch]` for the current_epoch.

                Returns:
                  List of summary ops to run on the CPU host.
                """
                gs = gs[0]
                # Host call fns are executed params['iterations_per_loop']
                # times after one TPU loop is finished, setting max_queue
                # value to the same as number of iterations will make the
                # summary writer only flush the data to storage once per loop.
                with tf2.summary.create_file_writer(
                        FLAGS.model_dir,
                        max_queue=params["iterations_per_loop"]).as_default():
                    with tf2.summary.record_if(True):
                        tf2.summary.scalar("loss", loss[0], step=gs)
                        tf2.summary.scalar("learning_rate", lr[0], step=gs)
                        tf2.summary.scalar("current_epoch", ce[0], step=gs)

                        return tf.summary.all_v2_summary_ops()

            # To log the loss, current learning rate, and epoch for
            # Tensorboard, the summary op needs to be run on the host CPU via
            # host_call. host_call expects [batch_size, ...] Tensors, thus
            # reshape to introduce a batch dimension. These Tensors are
            # implicitly concatenated to [params['batch_size']].
            gs_t = tf.reshape(global_step, [1])
            loss_t = tf.reshape(loss, [1])
            lr_t = tf.reshape(learning_rate, [1])
            ce_t = tf.reshape(current_epoch, [1])

            host_call = (host_call_fn, [gs_t, loss_t, lr_t, ce_t])

    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(labels, logits):
            """Evaluation metric function. Evaluates accuracy.

            This function is executed on the CPU and should not directly
            reference any Tensors in the rest of the `model_fn`. To pass
            Tensors from the model to the `metric_fn`, provide as part of the
            `eval_metrics`. See
            https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
            for more information.

            Arguments should match the list of `Tensor` objects passed as the
            second element in the tuple passed to `eval_metrics`.

            Args:
              labels: `Tensor` with shape `[batch]`.
              logits: `Tensor` with shape `[batch, num_classes]`.

            Returns:
              A dict of the metrics to return from evaluation.
            """
            predictions = tf.argmax(logits, axis=1)
            top_1_accuracy = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            top_5_accuracy = tf.metrics.mean(in_top_5)

            return {
                "top_1_accuracy": top_1_accuracy,
                "top_5_accuracy": top_5_accuracy,
            }

        eval_metrics = (metric_fn, [labels, logits])

    return tf.estimator.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        host_call=host_call,
        eval_metrics=eval_metrics,
    )
def train(model_path, learning_rate, epoch, noisy=False):
    """Trains the student network (`lenet()`) against the teacher (`nin()`).

    The loss is chosen as follows:
      * `noisy` set: L2 loss between student outputs and teacher outputs
        multiplied by a randomly masked Gaussian factor.
      * module-level `KD` set: weighted sum (weight `lamda`) of a sigmoid
        cross-entropy against the one-hot labels and a squared difference
        between student and teacher outputs, both scaled by `1 / tau`.
      * otherwise: plain L2 loss between teacher and student outputs.

    The first `int(0.8 * epoch)` epochs use `learning_rate`; the remaining
    epochs use `learning_rate / 10`. (With `epoch == 1` that count is 0, so
    the single epoch runs at the reduced rate.) A checkpoint is written to
    `model_path` after every 10th epoch.

    Args:
      model_path: Directory where checkpoints are saved; created on demand.
      learning_rate: Initial Adam learning rate.
      epoch: Total number of epochs to train.
      noisy: Whether to perturb the teacher outputs with multiplicative
        noise.
    """
    total_epoch = epoch
    teacher = nin()
    student = lenet()
    if noisy:
        drop_scale = 1 / Nratio
        noisy_mask = tf.nn.dropout(
            tf.constant(np.float32(np.ones((batch_size, 1))) / drop_scale),
            keep_prob=Nratio)  # (batch_size, 1)
        gaussian = tf.random_normal(
            shape=[batch_size, 1], mean=0.0, stddev=Nsigma)
        # Kept under a separate name so the `noisy` flag is not shadowed.
        noise = tf.mul(noisy_mask, gaussian)
        teacher = tf.mul(
            teacher, tf.tile(noise, tf.constant([1, 10])))  # (batch_size, 10)
        print(bcolors.G + "prepare for training, noisy mode" + bcolors.END)
        tf_loss = tf.nn.l2_loss(teacher - student) / batch_size
    elif KD:  # correct Hinton method at 2017.1.3
        print(bcolors.G + "prepare for training, knowledge distilling mode" +
              bcolors.END)
        one_hot = tf.one_hot(y, n_classes, 1.0, 0.0)
        teacher_tau = tf.scalar_mul(1.0 / tau, teacher)
        student_tau = tf.scalar_mul(1.0 / tau, student)
        objective1 = tf.nn.sigmoid_cross_entropy_with_logits(
            student_tau, one_hot)
        objective2 = tf.scalar_mul(0.5, tf.square(student_tau - teacher_tau))
        tf_loss = (lamda * tf.reduce_sum(objective1) +
                   (1 - lamda) * tf.reduce_sum(objective2)) / batch_size
    else:
        print(bcolors.G + "prepare for training, NIPS2014 mode" + bcolors.END)
        tf_loss = tf.nn.l2_loss(teacher - student) / batch_size

    # One train op per learning-rate phase.
    optimizer1 = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(tf_loss)
    optimizer2 = tf.train.AdamOptimizer(
        learning_rate=learning_rate / 10).minimize(tf_loss)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
    sess = tf.InteractiveSession(config=tf.ConfigProto(
        gpu_options=gpu_options, allow_soft_placement=True))
    tf.initialize_all_variables().run()
    with tf.device('/cpu:0'):
        saver = tf.train.Saver(max_to_keep=100)

    data, label = read_cifar10('train')
    index = np.arange(len(data))  # shuffled in place at every epoch
    mean = cal_mean()
    begin = time.time()
    iterations = len(data) // batch_size
    # Number of epochs trained at the full learning rate.
    decay_step = int(total_epoch * 0.8)
    cnt = 0  # epochs completed so far
    dropout_rate = dropout
    print(bcolors.G + "number of iterations (per epoch) =" +
          str(iterations) + bcolors.END)
    for i in range(total_epoch):
        np.random.shuffle(index)
        cost_sum = 0

        # Pick the phase once per epoch. A plain comparison is used instead
        # of `cnt / decay_step == 0`, which under Python 3 true division only
        # matches epoch 0 and divides by zero when `decay_step` is 0.
        if cnt < decay_step:
            lr = learning_rate
            train_step = optimizer1
        else:
            lr = learning_rate / 10
            train_step = optimizer2

        for j in range(iterations):
            batch_index = index[j * batch_size:(j + 1) * batch_size]
            batch_x = np.float32(data[batch_index]) - mean
            batch_y = np.squeeze(np.float32(label[batch_index]))
            _, cost = sess.run(
                [train_step, tf_loss],
                feed_dict={
                    x: batch_x,
                    y: batch_y,
                    keep_prob: 1 - dropout_rate
                })
            cost_sum += cost

        cnt += 1
        avg_time = time.time() - begin
        print(
            "epoch %d - avg. %f seconds in each epoch, lr = %.0e, cost = %f , avg-cost-per-logits = %f"
            % (i, avg_time / cnt, lr, cost_sum,
               cost_sum / iterations / n_classes))
        if (i + 1) % 10 == 0:
            print("Epoch ", i + 1, " is done. Saving the model ...")
            with tf.device('/cpu:0'):
                if not os.path.exists(model_path):
                    os.makedirs(model_path)
                saver.save(
                    sess, os.path.join(model_path, 'model'), global_step=i)
        sys.stdout.flush()
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator.

    Builds the student BERT model and, when `bert_teacher_config` is set, a
    second BERT model under the "teacher" variable scope. It then builds the
    masked-LM and next-sentence losses and the optional layer-wise
    distillation losses, and packs the result into a TPUEstimatorSpec.

    Configuration (`bert_config`, `bert_teacher_config`, `layer_wise_warmup`,
    the `*_distill_factor` values, `distill_temperature`,
    `distill_ground_truth_ratio`, `summary_dir`, `init_checkpoint`,
    `init_from_teacher`, `use_tpu`, optimizer settings, ...) is read from the
    enclosing scope rather than from `params`.

    Args:
      features: Dict of tensors with keys "input_ids", "input_mask",
        "segment_ids", "masked_lm_positions", "masked_lm_ids",
        "masked_lm_weights" and "next_sentence_labels".
      labels: Unused.
      mode: A `tf.estimator.ModeKeys` value; only TRAIN and EVAL are handled.
      params: Unused.

    Returns:
      A `tf.estimator.tpu.TPUEstimatorSpec` for the requested mode.

    Raises:
      ValueError: If `mode` is neither TRAIN nor EVAL.
    """
    tf.logging.info("*** Features ***")
    for name in sorted(features.keys()):
        tf.logging.info(" name = %s, shape = %s", name, features[name].shape)

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    masked_lm_positions = features["masked_lm_positions"]
    masked_lm_ids = features["masked_lm_ids"]
    masked_lm_weights = features["masked_lm_weights"]
    next_sentence_labels = features["next_sentence_labels"]

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    if bert_teacher_config is None:
        # No teacher: train against the ground-truth one-hot labels only.
        model = modeling.BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
            use_einsum=use_einsum)

        label_ids = tf.reshape(masked_lm_ids, [-1])
        true_labels = tf.one_hot(
            label_ids,
            depth=bert_config.vocab_size,
            dtype=model.get_sequence_output().dtype)
        one_hot_labels = true_labels
    else:
        # Distillation: both student and teacher are built with
        # is_training=False.
        model = modeling.BertModel(
            config=bert_config,
            is_training=False,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
            use_einsum=use_einsum)

        with tf.variable_scope("teacher"):
            teacher_model = modeling.BertModel(
                config=bert_teacher_config,
                is_training=False,
                input_ids=input_ids,
                input_mask=input_mask,
                token_type_ids=segment_ids,
                use_one_hot_embeddings=use_one_hot_embeddings,
                use_einsum=use_einsum)

        label_ids = tf.reshape(masked_lm_ids, [-1])
        true_labels = tf.one_hot(
            label_ids,
            depth=bert_config.vocab_size,
            dtype=model.get_sequence_output().dtype)

        teacher_logits = get_logits(
            bert_teacher_config,
            distill_temperature * teacher_model.get_sequence_output(),
            teacher_model.get_embedding_table(),
            masked_lm_positions)
        teacher_labels = tf.nn.softmax(teacher_logits, axis=-1)

        # Blend the teacher's soft labels with the ground truth.
        if distill_ground_truth_ratio == 1.0:
            one_hot_labels = true_labels
        else:
            one_hot_labels = (
                teacher_labels * (1 - distill_ground_truth_ratio)
                + true_labels * distill_ground_truth_ratio)

        teacher_attentions = teacher_model.get_all_attention_maps()
        student_attentions = model.get_all_attention_maps()
        teacher_hiddens = teacher_model.get_all_encoder_layers()
        student_hiddens = model.get_all_encoder_layers()

    (masked_lm_loss, _, masked_lm_example_loss, masked_lm_log_probs,
     _) = get_masked_lm_output(
         bert_config, model.get_sequence_output(),
         model.get_embedding_table(), masked_lm_positions,
         tf.stop_gradient(one_hot_labels), true_labels, masked_lm_weights)

    (next_sentence_loss, next_sentence_example_loss,
     next_sentence_log_probs) = get_next_sentence_output(
         bert_config, model.get_pooled_output(), next_sentence_labels)

    # extra_loss1: hidden-state (feature map) transfer.
    # extra_loss2: attention transfer.
    # extra_loss3 / extra_loss4: per-token mean / variance transfer.
    extra_loss1 = 0.0
    extra_loss2 = 0.0
    extra_loss3 = 0.0
    extra_loss4 = 0.0

    scalars_to_summarize = {}

    def get_layerwise_gate(layer_id):
        """Returns the binary layer-wise gate for `layer_id`."""
        steps_per_phase = num_train_steps // bert_config.num_hidden_layers
        layer_wise_gate = distill_util.layer_wise_learning_rate(
            layer_id=layer_id, steps_per_phase=steps_per_phase, binary=True)
        return layer_wise_gate

    if layer_wise_warmup and hidden_distill_factor != 0.0:
        layer_id = 0
        for teacher_hidden, student_hidden in (
                zip(teacher_hiddens[1:], student_hiddens[1:])):
            with tf.variable_scope("hidden_distill_%d" % layer_id):
                mse_loss = tf.losses.mean_squared_error(
                    tf.stop_gradient(
                        contrib_layers.layer_norm(
                            inputs=teacher_hidden,
                            begin_norm_axis=-1,
                            begin_params_axis=-1,
                            trainable=False)),
                    contrib_layers.layer_norm(
                        inputs=student_hidden,
                        begin_norm_axis=-1,
                        begin_params_axis=-1,
                        trainable=False))
                layer_wise_gate = get_layerwise_gate(layer_id)
                extra_loss1 += layer_wise_gate * mse_loss
            layer_id += 1
        extra_loss1 = extra_loss1 * hidden_distill_factor / layer_id

    if layer_wise_warmup and (
            beta_distill_factor != 0 and gamma_distill_factor != 0.0):
        layer_id = 0
        for teacher_hidden, student_hidden in (
                zip(teacher_hiddens[1:], student_hiddens[1:])):
            with tf.variable_scope("hidden_distill_%d" % layer_id):
                # Statistics are matched layer by layer, so reduce over this
                # layer's `teacher_hidden` rather than the whole
                # `teacher_hiddens` list.
                teacher_mean = tf.reduce_mean(
                    teacher_hidden, axis=[-1], keepdims=True)
                student_mean = tf.reduce_mean(
                    student_hidden, axis=[-1], keepdims=True)
                teacher_variance = tf.reduce_mean(
                    tf.squared_difference(teacher_hidden, teacher_mean),
                    axis=[-1], keepdims=True)
                student_variance = tf.reduce_mean(
                    tf.squared_difference(student_hidden, student_mean),
                    axis=[-1], keepdims=True)
                beta_distill_loss = tf.reduce_mean(
                    tf.squared_difference(
                        tf.stop_gradient(teacher_mean), student_mean))
                gamma_distill_loss = tf.reduce_mean(
                    tf.abs(tf.stop_gradient(teacher_variance)
                           - student_variance))
                layer_wise_gate = get_layerwise_gate(layer_id)
                extra_loss3 += layer_wise_gate * beta_distill_loss
                extra_loss4 += layer_wise_gate * gamma_distill_loss
            layer_id += 1
        extra_loss3 = extra_loss3 * beta_distill_factor / layer_id
        extra_loss4 = extra_loss4 * gamma_distill_factor / layer_id

    if layer_wise_warmup and attention_distill_factor != 0.0:
        layer_id = 0
        for teacher_attention, student_attention in (
                zip(teacher_attentions, student_attentions)):
            with tf.variable_scope("attention_distill_%d" % layer_id):
                teacher_attention_prob = tf.nn.softmax(
                    teacher_attention, axis=-1)
                student_attention_log_prob = tf.nn.log_softmax(
                    student_attention, axis=-1)
                # Cross-entropy of the student against the teacher's
                # attention distribution (the teacher side is held constant).
                kl_divergence = -(
                    tf.stop_gradient(teacher_attention_prob)
                    * student_attention_log_prob)
                kl_divergence = tf.reduce_mean(
                    tf.reduce_sum(kl_divergence, axis=-1))
                layer_wise_gate = get_layerwise_gate(layer_id)
                extra_loss2 += layer_wise_gate * kl_divergence
            layer_id += 1
        extra_loss2 = extra_loss2 * attention_distill_factor / layer_id

    # During layer-wise warmup only the transfer losses are optimized.
    if layer_wise_warmup:
        total_loss = extra_loss1 + extra_loss2 + extra_loss3 + extra_loss4
    else:
        total_loss = masked_lm_loss + next_sentence_loss

    if summary_dir is not None:
        if layer_wise_warmup:
            scalars_to_summarize["feature_map_transfer_loss"] = extra_loss1
            scalars_to_summarize["attention_transfer_loss"] = extra_loss2
            scalars_to_summarize["mean_transfer_loss"] = extra_loss3
            scalars_to_summarize["variance_transfer_loss"] = extra_loss4
        else:
            scalars_to_summarize["masked_lm_loss"] = masked_lm_loss
            scalars_to_summarize["next_sentence_loss"] = next_sentence_loss

            masked_lm_predictions = tf.argmax(
                masked_lm_log_probs, axis=-1, output_type=tf.int32)
            masked_lm_accuracy = tf.cast(tf.math.equal(
                tf.reshape(masked_lm_ids, [-1]),
                tf.reshape(masked_lm_predictions, [-1])), tf.float32)
            numerator = tf.reduce_sum(
                tf.reshape(masked_lm_weights, [-1]) * masked_lm_accuracy)
            # Small epsilon keeps the ratio finite when all weights are zero.
            denominator = tf.reduce_sum(masked_lm_weights) + 1e-5
            masked_lm_accuracy = numerator / denominator
            scalars_to_summarize["masked_lm_accuracy"] = masked_lm_accuracy

            next_sentence_predictions = tf.argmax(
                next_sentence_log_probs, axis=-1, output_type=tf.int32)
            next_sentence_accuracy = tf.reduce_mean(
                tf.cast(tf.math.equal(
                    tf.reshape(next_sentence_labels, [-1]),
                    tf.reshape(next_sentence_predictions, [-1])),
                        tf.float32))
            scalars_to_summarize["next_sentence_accuracy"] = (
                next_sentence_accuracy)

        scalars_to_summarize["global_step"] = (
            tf.train.get_or_create_global_step())
        scalars_to_summarize["loss"] = total_loss

    host_call = None
    if summary_dir is not None:
        if use_tpu:
            # The scalars are reshaped to rank 1 before being handed to the
            # host call; host_call_fn takes element [0] of each back out.
            for name in scalars_to_summarize:
                scalars_to_summarize[name] = tf.reshape(
                    scalars_to_summarize[name], [1])

            def host_call_fn(*args):
                """Host call function to compute training summaries."""
                scalars = _list_to_dicts(
                    args, scalars_to_summarize.keys())[0]
                for name in scalars:
                    scalars[name] = scalars[name][0]

                with contrib_summary.create_file_writer(
                        summary_dir, max_queue=1000).as_default():
                    with contrib_summary.always_record_summaries():
                        for name, value in scalars.items():
                            if name not in ["global_step"]:
                                contrib_summary.scalar(
                                    name, value, step=scalars["global_step"])

                return contrib_summary.all_summary_ops()

            host_call = (host_call_fn,
                         _dicts_to_list([scalars_to_summarize],
                                        scalars_to_summarize.keys()))
        else:
            for name in scalars_to_summarize:
                tf.summary.scalar(name, scalars_to_summarize[name])

    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    teacher_initialized_variable_names = {}
    scaffold_fn = None
    if init_checkpoint:
        if not init_from_teacher:
            # Initializes from the checkpoint for all variables.
            (assignment_map, initialized_variable_names
            ) = modeling.get_assignment_map_from_checkpoint(
                tvars, init_checkpoint)
            if use_tpu:

                def tpu_scaffold():
                    tf.train.init_from_checkpoint(
                        init_checkpoint, assignment_map)
                    return tf.train.Scaffold()

                scaffold_fn = tpu_scaffold
            else:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        elif bert_teacher_config is not None:
            # Initializes from the pre-trained checkpoint only for teacher
            # model and embeddings for distillation.
            (assignment_map, initialized_variable_names
            ) = modeling.get_assignment_map_from_checkpoint(
                tvars, init_checkpoint, init_embedding=True)
            (teacher_assignment_map, teacher_initialized_variable_names
            ) = modeling.get_assignment_map_from_checkpoint(
                tvars, init_checkpoint, init_from_teacher=True)
            if use_tpu:

                def teacher_tpu_scaffold():
                    tf.train.init_from_checkpoint(
                        init_checkpoint, assignment_map)
                    tf.train.init_from_checkpoint(
                        init_checkpoint, teacher_assignment_map)
                    return tf.train.Scaffold()

                scaffold_fn = teacher_tpu_scaffold
            else:
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
                tf.train.init_from_checkpoint(
                    init_checkpoint, teacher_assignment_map)

    tf.logging.info("**** Trainable Variables ****")
    total_size = 0
    for var in tvars:
        init_string = ""
        if var.name in initialized_variable_names:
            init_string = ", *INIT_FROM_CKPT*"
        if var.name in teacher_initialized_variable_names:
            init_string = ", *INIT_FROM_TEACHER_CKPT*"
        tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape,
                        init_string)
        if not var.name.startswith("teacher"):
            # Initial value 1 so a scalar variable (empty shape list) counts
            # as one parameter instead of raising on an empty sequence.
            total_size += functools.reduce(
                lambda x, y: x * y, var.get_shape().as_list(), 1)
    tf.logging.info(" total variable parameters: %d", total_size)

    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        if layer_wise_warmup:
            train_op = optimization.create_optimizer(
                total_loss, learning_rate, num_train_steps,
                num_warmup_steps, use_tpu, optimizer,
                end_lr_rate=1.0, use_layer_wise_warmup=True,
                total_warmup_phases=bert_config.num_hidden_layers)
        else:
            train_op = optimization.create_optimizer(
                total_loss, learning_rate, num_train_steps,
                num_warmup_steps, use_tpu, optimizer)

        output_spec = tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            scaffold_fn=scaffold_fn,
            host_call=host_call)
    elif mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(masked_lm_example_loss, masked_lm_log_probs,
                      masked_lm_ids, masked_lm_weights,
                      next_sentence_example_loss, next_sentence_log_probs,
                      next_sentence_labels):
            """Computes the loss and accuracy of the model."""
            masked_lm_log_probs = tf.reshape(
                masked_lm_log_probs, [-1, masked_lm_log_probs.shape[-1]])
            masked_lm_predictions = tf.argmax(
                masked_lm_log_probs, axis=-1, output_type=tf.int32)
            masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
            masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
            masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
            masked_lm_accuracy = tf.metrics.accuracy(
                labels=masked_lm_ids,
                predictions=masked_lm_predictions,
                weights=masked_lm_weights)
            masked_lm_mean_loss = tf.metrics.mean(
                values=masked_lm_example_loss, weights=masked_lm_weights)

            next_sentence_log_probs = tf.reshape(
                next_sentence_log_probs,
                [-1, next_sentence_log_probs.shape[-1]])
            next_sentence_predictions = tf.argmax(
                next_sentence_log_probs, axis=-1, output_type=tf.int32)
            next_sentence_labels = tf.reshape(next_sentence_labels, [-1])
            next_sentence_accuracy = tf.metrics.accuracy(
                labels=next_sentence_labels,
                predictions=next_sentence_predictions)
            next_sentence_mean_loss = tf.metrics.mean(
                values=next_sentence_example_loss)

            return {
                "masked_lm_accuracy": masked_lm_accuracy,
                "masked_lm_loss": masked_lm_mean_loss,
                "next_sentence_accuracy": next_sentence_accuracy,
                "next_sentence_loss": next_sentence_mean_loss,
            }

        eval_metrics = (metric_fn, [
            masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
            masked_lm_weights, next_sentence_example_loss,
            next_sentence_log_probs, next_sentence_labels
        ])
        output_spec = tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            eval_metrics=eval_metrics,
            scaffold_fn=scaffold_fn)
    else:
        raise ValueError(
            "Only TRAIN and EVAL modes are supported: %s" % (mode))

    return output_spec
def cnn_model_fn(features, labels, mode):
    """Estimator model function for a three-convolution image classifier.

    Args:
        features: Dict whose "x" entry is reshaped to
            [-1, image_x, image_y, 1].
        labels: Class indices; cast to int32 and one-hot encoded for the loss.
        mode: A `tf.estimator.ModeKeys` value.

    Returns:
        A `tf.estimator.EstimatorSpec` carrying predictions (PREDICT), loss
        and train op (TRAIN), or loss and accuracy metric (EVAL).
    """
    in_training = mode == tf.estimator.ModeKeys.TRAIN

    net = tf.reshape(features["x"], [-1, image_x, image_y, 1], name="input")

    # (conv name, filters, kernel size, pool name, pool window/stride).
    # The last stage has no pooling layer.
    stages = (
        ("conv1", 16, [2, 2], "pool1", 2),
        ("conv2", 32, [5, 5], "pool2", 5),
        ("conv3", 64, [5, 5], None, None),
    )
    for conv_name, n_filters, kernel, pool_name, window in stages:
        net = tf.layers.conv2d(
            inputs=net,
            filters=n_filters,
            kernel_size=kernel,
            padding="same",
            activation=tf.nn.relu,
            name=conv_name)
        print(conv_name, net.shape)
        if pool_name is not None:
            net = tf.layers.max_pooling2d(
                inputs=net,
                pool_size=[window, window],
                strides=window,
                name=pool_name)
            print(pool_name, net.shape)

    # Dense head. The flatten size is fixed at 5*5*64 elements per example.
    # NOTE(review): presumably a 5x5 map with 64 channels -- confirm against
    # image_x / image_y.
    flattened = tf.reshape(net, [-1, 5 * 5 * 64], name="flat")
    print(flattened.shape)
    hidden = tf.layers.dense(
        inputs=flattened, units=128, activation=tf.nn.relu, name="dense")
    print(hidden.shape)
    regularized = tf.layers.dropout(
        inputs=hidden, rate=0.2, training=in_training, name="dropout")

    # Logits layer.
    class_count = get_num_of_classes()
    logits = tf.layers.dense(
        inputs=regularized, units=class_count, name="logits")

    # Not used below; they add the named ops "output_class" and
    # "softmax_tensor" to the graph.
    # NOTE(review): presumably so the tensors can be fetched by name --
    # confirm.
    output_class = tf.argmax(input=logits, axis=1, name="output_class")
    output_probab = tf.nn.softmax(logits, name="softmax_tensor")

    predictions = {
        "classes": tf.argmax(input=logits, axis=1),
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor"),
    }
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Loss is shared by TRAIN and EVAL.
    label_indices = tf.cast(labels, tf.int32)
    onehot_labels = tf.one_hot(indices=label_indices, depth=class_count)
    loss = tf.losses.softmax_cross_entropy(
        onehot_labels=onehot_labels, logits=logits)

    if in_training:
        sgd = tf.train.GradientDescentOptimizer(learning_rate=1e-2)
        train_op = sgd.minimize(
            loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, train_op=train_op)

    # EVAL mode: report accuracy of the arg-max class.
    accuracy = tf.metrics.accuracy(
        labels=labels, predictions=predictions["classes"])
    return tf.estimator.EstimatorSpec(
        mode=mode, loss=loss, eval_metric_ops={"accuracy": accuracy})
def _greedy_decode(input_embeddings,
                   output_vocab_size,
                   target_end_id,
                   target_start_id,
                   output_vocab_embeddings_table,
                   source_len,
                   model_config,
                   mode,
                   input_copy_mask=None,
                   clean_output_mask=None):
    """Greedy (arg-max) decoding driven by a `tf.while_loop`.

    At every step the highest-scoring symbol is written into a fixed-length
    id buffer and its log-probability into a parallel buffer. Decoding stops
    once `target_end_id` appears in the id buffer or
    `model_config.data_options.max_decode_length` steps have run.

    Args:
        input_embeddings: Encoder-side embeddings, linearly projected to
            `model_config.model_parameters.encoder_dims`.
        output_vocab_size: Passed to `decode_utils.get_decode_steps` and
            `_get_action_logits`.
        target_end_id: Symbol id that terminates decoding.
        target_start_id: Symbol id stored at position 0 of the id buffer.
        output_vocab_embeddings_table: Passed to the target-embedding and
            logits helpers.
        source_len: Passed to the transformer decoder.
        model_config: Model configuration object.
        mode: Passed to the transformer decoder.
        input_copy_mask: Optional; forwarded to `_get_action_logits`.
        clean_output_mask: Optional; forwarded to `_get_action_logits`.

    Returns:
        The dict produced by `decode_utils.get_predictions`, extended with
        `constants.SCORES_KEY` holding the summed log-probability of the
        decoded sequence.
    """
    encoder_output = common_layers.linear_transform(
        input_embeddings,
        output_size=model_config.model_parameters.encoder_dims,
        scope="bert_to_transformer")

    decode_length = model_config.data_options.max_decode_length
    # One extra slot for the <START> symbol at index 0.
    buffer_len = decode_length + 1

    def symbols_to_logits_fn(logit_indices, current_index):
        """Maps the ids decoded so far to logits for the current step."""
        batched_ids = tf.expand_dims(logit_indices, 0)
        decode_steps = decode_utils.get_decode_steps(
            batched_ids, output_vocab_size, model_config)
        target_embeddings = _get_target_embeddings(
            input_embeddings, output_vocab_embeddings_table, decode_steps,
            model_config)
        decoder_output = _build_transformer_decoder(
            encoder_output,
            source_len,
            target_embeddings,
            mode,
            model_config,
            single_step_index=current_index)
        step_logits = _get_action_logits(
            encoder_output,
            decoder_output,
            output_vocab_embeddings_table,
            output_vocab_size,
            model_config,
            input_copy_mask=input_copy_mask,
            clean_output_mask=clean_output_mask)
        # Batch and length dimensions should both be 1; drop them so the
        # result has shape (output_vocab_size).
        return tf.squeeze(step_logits, axis=[0, 1])

    def loop_cond(i, decoded_ids, unused_logprobs):
        """Keeps looping while no end symbol was emitted and i is in range."""
        no_end_symbol = tf.reduce_all(tf.not_equal(decoded_ids, target_end_id))
        within_length = tf.less(i, decode_length)
        return tf.logical_and(no_end_symbol, within_length)

    def inner_loop(i, decoded_ids, logprobs):
        """Runs one greedy decoding step."""
        logits = symbols_to_logits_fn(decoded_ids, i)
        next_id = tf.argmax(logits, axis=0)
        softmax = tf.nn.softmax(logits)
        extended_vocab_size = tf.shape(softmax)[-1]
        # Select the probability of the chosen symbol via a one-hot mask.
        chosen_mask = tf.one_hot(next_id, extended_vocab_size)
        logprob = tf.log(tf.reduce_sum(softmax * chosen_mask))
        # Entries at index > i + 1 are still zero, so adding a one-hot
        # vector writes the new value into slot i + 1.
        logprobs += tf.one_hot(
            i + 1, buffer_len, on_value=logprob, dtype=tf.float32)
        decoded_ids += tf.one_hot(
            i + 1, buffer_len, on_value=next_id, dtype=tf.int64)
        return i + 1, decoded_ids, logprobs

    initial_ids = tf.zeros(dtype=tf.int64, shape=[buffer_len])
    initial_ids += tf.one_hot(
        0, buffer_len, on_value=tf.cast(target_start_id, tf.int64))
    initial_logprob = tf.zeros(dtype=tf.float32, shape=[buffer_len])
    initial_i = tf.constant(0)

    _, decoded_ids, logprobs = tf.while_loop(
        loop_cond, inner_loop, [initial_i, initial_ids, initial_logprob])

    # Drop the <START> slot.
    decoded_ids = decoded_ids[1:]
    logprobs = logprobs[1:]
    # The sequence score is the sum of the per-step log-probabilities.
    logprobs = tf.reduce_sum(logprobs, axis=0)
    # Add a leading dimension of size 1 (beam width of 1).
    decoded_ids = tf.expand_dims(decoded_ids, 0)
    logprobs = tf.expand_dims(logprobs, 0)

    # Assemble the output dict returned to the caller.
    output_decode_steps = decode_utils.get_decode_steps(
        decoded_ids, output_vocab_size, model_config)
    predictions = decode_utils.get_predictions(output_decode_steps)
    predictions[constants.SCORES_KEY] = logprobs
    return predictions
def _get_masked_lm_output(self, inputs: pretrain_data.Inputs, model):
    """Masked language modeling softmax layer.

    Computes generator logits for the masked positions. For the uniform,
    identity and heuristic generators the base logits are all zeros;
    otherwise they come from a dense + layer-norm head tied to the model's
    embedding table. The identity and heuristic generators then mix the base
    distribution with a sharply peaked one, using a weight that grows
    linearly with `self.global_step`.

    Args:
      inputs: Pre-training inputs; `masked_lm_ids`, `masked_lm_positions` and
        `masked_lm_weights` are read, plus `masked_synonym_ids` for the
        heuristic generator.
      model: Model exposing `get_sequence_output()` and
        `get_embedding_table()`.

    Returns:
      An `MLMOutput` namedtuple with fields `logits`, `probs`, `loss`,
      `per_example_loss` and `preds`.

    Raises:
      ValueError: If the heuristic generator is enabled and
        `synonym_scheduler_type` is not 'linear'.
    """
    masked_lm_weights = inputs.masked_lm_weights
    vocab_size = self._bert_config.vocab_size
    with tf.variable_scope("generator_predictions"):
        if (self._config.uniform_generator
                or self._config.identity_generator
                or self._config.heuristic_generator):
            # All-zero logits broadcast to masked_lm_ids' shape + [vocab].
            logits = tf.zeros(vocab_size)
            logits_tiled = tf.zeros(
                modeling.get_shape_list(inputs.masked_lm_ids) + [vocab_size])
            logits_tiled += tf.reshape(logits, [1, 1, vocab_size])
            logits = logits_tiled
        else:
            relevant_hidden = pretrain_helpers.gather_positions(
                model.get_sequence_output(), inputs.masked_lm_positions)
            hidden = tf.layers.dense(
                relevant_hidden,
                units=modeling.get_shape_list(
                    model.get_embedding_table())[-1],
                activation=modeling.get_activation(
                    self._bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    self._bert_config.initializer_range))
            hidden = modeling.layer_norm(hidden)
            output_bias = tf.get_variable(
                "output_bias",
                shape=[vocab_size],
                initializer=tf.zeros_initializer())
            # Output projection shares weights with the embedding table.
            logits = tf.matmul(hidden, model.get_embedding_table(),
                               transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)

        oh_labels = tf.one_hot(
            inputs.masked_lm_ids, depth=vocab_size, dtype=tf.float32)

        probs = tf.nn.softmax(logits)

        if self._config.identity_generator:
            # Distribution peaked on the original token (logit 25 vs 0).
            identity_logits = tf.zeros(vocab_size)
            identity_logits_tiled = tf.zeros(
                modeling.get_shape_list(inputs.masked_lm_ids) + [vocab_size])
            masked_identity_weights = tf.one_hot(
                inputs.masked_lm_ids, depth=vocab_size, dtype=tf.float32)
            identity_logits_tiled += 25.0 * masked_identity_weights
            identity_logits_tiled += tf.reshape(
                identity_logits, [1, 1, vocab_size])
            identity_logits = identity_logits_tiled
            identity_probs = tf.nn.softmax(identity_logits)

            identity_weight = (
                self.global_step
                / tf.cast(self._config.num_train_steps, tf.float32)
            ) * self._config.max_identity_weight
            probs = (probs * (1 - identity_weight)
                     + identity_probs * identity_weight)
            logits = tf.math.log(probs)  # softmax(log(probs)) = probs
        elif self._config.heuristic_generator:
            # Distribution peaked on the listed synonym ids.
            synonym_logits = tf.zeros(vocab_size)
            synonym_logits_tiled = tf.zeros(
                modeling.get_shape_list(inputs.masked_lm_ids) + [vocab_size])
            masked_synonym_weights = tf.reduce_sum(
                tf.one_hot(inputs.masked_synonym_ids, depth=vocab_size,
                           dtype=tf.float32), -2)
            # Zero out vocabulary id 0.
            # NOTE(review): presumably the padding id used to fill
            # masked_synonym_ids -- confirm.
            padded_synonym_mask = tf.concat(
                [tf.zeros([1]), tf.ones([vocab_size - 1])], 0)
            masked_synonym_weights *= tf.expand_dims(
                tf.expand_dims(padded_synonym_mask, 0), 0)
            synonym_logits_tiled += 25.0 * masked_synonym_weights
            synonym_logits_tiled += tf.reshape(
                synonym_logits, [1, 1, vocab_size])
            synonym_logits = synonym_logits_tiled
            synonym_probs = tf.nn.softmax(synonym_logits)

            if self._config.synonym_scheduler_type == 'linear':
                synonym_weight = (
                    self.global_step
                    / tf.cast(self._config.num_train_steps, tf.float32)
                ) * self._config.max_synonym_weight
            else:
                # Fail with a clear message instead of reaching the mixing
                # step with `synonym_weight` unbound.
                raise ValueError(
                    "Unsupported synonym_scheduler_type: %r"
                    % (self._config.synonym_scheduler_type,))
            probs = (probs * (1 - synonym_weight)
                     + synonym_probs * synonym_weight)
            logits = tf.math.log(probs)  # softmax(log(probs)) = probs

        log_probs = tf.nn.log_softmax(logits)
        # Negative log-likelihood of the true token at each masked position.
        label_log_probs = -tf.reduce_sum(log_probs * oh_labels, axis=-1)

        numerator = tf.reduce_sum(masked_lm_weights * label_log_probs)
        # Epsilon avoids division by zero when every weight is zero.
        denominator = tf.reduce_sum(masked_lm_weights) + 1e-6
        loss = numerator / denominator
        preds = tf.argmax(log_probs, axis=-1, output_type=tf.int32)

        MLMOutput = collections.namedtuple(
            "MLMOutput",
            ["logits", "probs", "loss", "per_example_loss", "preds"])
        return MLMOutput(
            logits=logits, probs=probs, per_example_loss=label_log_probs,
            loss=loss, preds=preds)
def build_model(self, cate_list):
    """Build the model graph: embeddings, attention network, loss, summaries.

    Args:
        cate_list: Sequence mapping each item id to its category id;
            converted to an int32 tensor.

    Sets `self.att`, `self.stt`, `self.logits`, `self.eval_logits`,
    `self.global_step`, `self.global_epoch_step`,
    `self.global_epoch_step_op`, `self.loss` and `self.train_summary`.
    """
    cfg = self.config

    # ---- Variables ----
    # Item embedding matrix [|I|, di]; a 2-D lookup table.
    item_emb_w = tf.get_variable(
        "item_emb_w", [cfg['item_count'], cfg['itemid_embedding_size']])
    # Per-item bias vector of the similarity score [|I|].
    item_b = tf.get_variable(
        "item_b", [cfg['item_count'], ],
        initializer=tf.constant_initializer(0.0))
    # Category embedding matrix [|A|, da]; a 2-D lookup table.
    cate_emb_w = tf.get_variable(
        "cate_emb_w", [cfg['cate_count'], cfg['cateid_embedding_size']])
    # Item id -> category id map [|I|].
    cate_list = tf.convert_to_tensor(cate_list, dtype=tf.int32)

    # Item, category and time embeddings are combined and projected with a
    # dense layer (paper: u_ij on p.3, left column, corresponds to h_emb).

    # Embedding of the item to predict [B, di+da].
    target_item_emb = tf.nn.embedding_lookup(item_emb_w, self.i)
    target_cate_ids = tf.gather(cate_list, self.i)
    target_cate_emb = tf.nn.embedding_lookup(cate_emb_w, target_cate_ids)
    i_emb = tf.concat([target_item_emb, target_cate_emb], 1)
    # Bias of the item to predict [B].
    i_b = tf.gather(item_b, self.i)

    # Embedding of every history entry, fetched from the lookup tables and
    # concatenated with self.im and self.r along the last axis.
    # (Original note gave the shape as [B, T, di+da].)
    hist_item_emb = tf.nn.embedding_lookup(item_emb_w, self.hist_i)
    hist_cate_ids = tf.gather(cate_list, self.hist_i)
    hist_cate_emb = tf.nn.embedding_lookup(cate_emb_w, hist_cate_ids)
    h_emb = tf.concat([hist_item_emb, hist_cate_emb, self.im, self.r], 2)

    hidden_units = cfg['hidden_units']
    concat_time = cfg['concat_time_emb'] == True
    if concat_time:
        # Concatenate a 12-way one-hot time embedding [B, T, di+da+dt],
        # then project to hidden_units.
        t_emb = tf.one_hot(self.hist_t, 12, dtype=tf.float32)
        h_emb = tf.concat([h_emb, t_emb], -1)
        h_emb = tf.layers.dense(h_emb, hidden_units)
    else:
        # Use the time embedding as an additive positional encoding
        # [B, T, di+da].
        t_emb = tf.layers.dense(
            tf.expand_dims(self.hist_t, -1),
            hidden_units,
            activation=tf.nn.tanh)
        h_emb += t_emb

    # Number of stacked attention blocks.
    num_blocks = cfg['num_blocks']
    num_heads = cfg['num_heads']
    dropout_rate = cfg['dropout']
    # Size after the Q/K/V dense projection: C = di+da+dt or di+da.
    num_units = h_emb.get_shape().as_list()[-1]

    # Transformer (paper: p.4, left column, eq. (3)); u_emb is [B, C].
    u_emb, self.att, self.stt = attention_net(
        h_emb,             # u_ij
        self.sl,           # length of each user's history
        i_emb,             # decoder input
        num_units,
        num_heads,
        num_blocks,
        dropout_rate,
        self.is_training,
        False)

    # Prediction (paper: p.4, right column, eqs. (7) & (8)):
    # f(h_t, e_t^u) = bias + reduce_sum([B, C]) -> [B].
    interaction = tf.reduce_sum(tf.multiply(u_emb, i_emb), 1)
    self.logits = i_b + interaction

    # ============== Eval ===============
    self.eval_logits = self.logits

    # Step counters.
    self.global_step = tf.Variable(0, trainable=False, name='global_step')
    self.global_epoch_step = tf.Variable(
        0, trainable=False, name='global_epoch_step')
    self.global_epoch_step_op = tf.assign(
        self.global_epoch_step, self.global_epoch_step + 1)

    # ---- Loss ----
    # L2 regularization over the user and target-item representations.
    l2_norm = tf.add_n([
        tf.nn.l2_loss(u_emb),
        tf.nn.l2_loss(i_emb),
    ])

    # Loss: mean sigmoid cross-entropy plus the weighted L2 term.
    cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(
        logits=self.logits, labels=self.y)
    self.loss = (tf.reduce_mean(cross_entropy)
                 + cfg['regulation_rate'] * l2_norm)

    summaries = [
        tf.summary.histogram('embedding/1_item_emb', item_emb_w),
        tf.summary.histogram('embedding/2_cate_emb', cate_emb_w),
        tf.summary.histogram('embedding/3_time_raw', self.hist_t),
        tf.summary.histogram('embedding/3_time_dense', t_emb),
        tf.summary.histogram('embedding/4_final', h_emb),
        tf.summary.histogram('attention_output', u_emb),
        tf.summary.scalar('L2_norm Loss', l2_norm),
        tf.summary.scalar('Training Loss', self.loss),
    ]
    self.train_summary = tf.summary.merge(summaries)
def dot_product_mpnn_attention(q,
                               k,
                               v,
                               adjacency_matrix,
                               num_edge_types,
                               num_transforms=None,
                               use_weighted_sum=False,
                               name=None):
    """Dot product attention with edge vectors.

    Let B be the number of batches.
    Let N be the number of nodes in the graph.
    Let K be the size of the attention keys/queries.
    Let V be the size of the attention values.
    Let T be the total number of transforms (num_transforms).

    Args:
      q: The query Tensor of shape [B, N, K].
      k: The key Tensor of shape [B, T, N, K].
      v: The value Tensor of shape [B, T, N, V].
      adjacency_matrix: A Tensor of shape [B, N, N, T]. An entry at indices
        b, i, j, k is the indicator of the edge from node j to node i in
        batch b. A standard adjacency matrix will only have one edge type
        while a multigraph will have multiple edge types.
      num_edge_types: An integer specifying number of edge types.
      num_transforms: An integer indicating number of transforms (T). If
        None, then num_transforms will be equal to num_edge_types.
      use_weighted_sum: If False, will only use a single transform per edge
        type. Otherwise, use a learned weighted sum of transforms per edge
        type.
      name: A string.

    Returns:
      A Tensor of shape [B, N, V] storing the result of computing attention
      weights using the queries and keys and combining the values according
      to those weights.

    Raises:
      ValueError: if num_transforms doesn't equal num_edge_types and not
        using weighted sum.
    """
    with tf.variable_scope(
            name,
            default_name="dot_product_mpnn_attention",
            values=[q, k, v, adjacency_matrix, num_edge_types]):
        # If not explicitly set, use num_transforms set to num_edge_types.
        num_transforms = (
            num_edge_types if num_transforms is None else num_transforms)

        if not use_weighted_sum and num_transforms != num_edge_types:
            raise ValueError(
                "num_transforms must equal num_edge_types unless "
                "use_weighted_sum is True")

        # Computes the raw dot-product attention values between each query
        # and the corresponding keys it needs to consider.
        #
        # This operation takes the dot product of (the query for
        # each node) and (the key for each node for each possible edge type),
        # creating an N x N matrix for each edge type. The entry at index
        # (i, j) is the dot-product for the edge from node i to node j of the
        # appropriate type. These dot products will eventually become
        # attention weights specifying how much node i weights an edge of
        # that type coming from node j.
        all_edge_logits = tf.matmul(
            tf.tile(tf.expand_dims(q, axis=1), [1, num_edge_types, 1, 1]),
            k,
            transpose_b=True)

        # The adjacency matrix assumes there is only one directed edge
        # (i <- j) for each pair of nodes. If such an edge exists, it
        # contains the integer type of that edge at position (i, j) of the
        # adjacency matrix.
        #
        # Construct edge_vectors of shape [B, N, N, T].
        if use_weighted_sum:
            # Use dense representation for edge vectors.
            edge_vectors = make_edge_vectors(
                adjacency_matrix, num_edge_types, num_transforms)
        else:
            # Generate one-hot vectors based on edge types.
            # If there is an edge from node j to node i of type t, then index
            # t of the last dimension is 1 for entry (i, j) of the second and
            # third dimensions.
            edge_vectors = tf.one_hot(adjacency_matrix, num_transforms)

        # Rearranging the dimensions to match the shape of all_edge_logits.
        edge_vectors = tf.transpose(edge_vectors, [0, 3, 1, 2])

        # Element-wise multiplies all_edge_logits and edge_vectors.
        #
        # In other words: all_edge_logits contains N x N matrices of
        # query-key products. This element-wise multiplication zeroes out
        # entries that do not correspond to actual edges in the graph of the
        # appropriate edge type. all_edge_logits retains shape [B, T, N, N].
        all_edge_logits *= edge_vectors

        # Since there can only be one edge from node A to node B, we can
        # collapse the T different adjacency matrices containing key-query
        # pairs into one adjacency matrix. logits is [B, N, N].
        # TODO(dbieber): Use a reshape instead of reduce sum to attend over
        # all edges instead of over all neighboring nodes to handle the
        # multigraph case.
        logits = tf.reduce_sum(all_edge_logits, axis=1)

        # For pairs of nodes with no edge between them (adjacency entries
        # sum to 0 over the last axis), add a large negative bias so those
        # locations receive a near-zero weight after the softmax.
        bias = tf.to_float(
            tf.equal(tf.reduce_sum(adjacency_matrix, axis=-1), 0)) * -1e9
        logits += bias

        # Turn the raw key-query products into a probability distribution
        # (or, in terms of attention, weights). The softmax is computed
        # across the last dimension of logits.
        compatibility = tf.nn.softmax(logits)  # Shape [B, N, N].

        # Computes a summary showing the attention matrix as an image. Does
        # not do any work toward actually performing attention.
        common_attention.attention_image_summary(
            tf.expand_dims(compatibility, axis=1), None)

        # Repeats the attention matrix T times for each batch, producing
        # a tensor with shape [B, T, N, N] where the [N, N] component is T
        # repeats of the values found in compatibility.
        edge_compatibility = tf.tile(
            tf.expand_dims(compatibility, axis=1),
            [1, num_edge_types, 1, 1])

        # Zeroes out the entries in edge_compatibility that do not
        # correspond to actual edges.
        edge_compatibility *= edge_vectors  # Shape [B, T, N, N].

        output = compute_values(edge_compatibility, v)
        return output
def batch_loss(model, batch, num_classes=10):
    """Return the mean softmax cross-entropy of a linear model on one batch.

    Args:
        model: Object exposing `weights` and `bias`; logits are computed as
            `batch.x @ model.weights + model.bias`.
        batch: Object exposing inputs `x` and integer class labels `y`.
        num_classes: Depth of the one-hot label encoding (defaults to 10).

    Returns:
        A scalar tensor: the cross-entropy averaged over the batch.
    """
    logits = tf.matmul(batch.x, model.weights) + model.bias
    # log_softmax equals log(softmax(logits)) mathematically but avoids
    # log(0) = -inf (and the resulting 0 * -inf = NaN) when a probability
    # underflows.
    log_probs = tf.nn.log_softmax(logits)
    one_hot_labels = tf.one_hot(batch.y, num_classes)
    per_example = tf.reduce_sum(one_hot_labels * log_probs, axis=[1])
    return -tf.reduce_mean(per_example)
def gating_internel(self, inputs, total_token_num):
    """Compute top-2 expert gating for a batch of token groups.

    Einsum letters used below: G = group, S = position inside the group,
    M = feature dim of `inputs`, E = expert, C = capacity slot.

    Side effect: `self.expert_capacity_dim` is rounded up in place to the next
    multiple of 4 (presumably a hardware alignment requirement -- TODO confirm).

    Args:
      inputs: Tensor contracted with `self.gating_weight` through
        'GSM,ME->GSE' to obtain the gating logits.
      total_token_num: Not read by this implementation.

    Returns:
      A tuple `(aux_loss, combine_tensor, dispatch_mask)` where
        aux_loss: scalar auxiliary loss,
          mean(density_1_proxy * density_1) * num_experts ** 2.
        combine_tensor: GSEC tensor holding, for every token, its normalized
          gate value at the (expert, capacity slot) it was assigned to.
        dispatch_mask: `combine_tensor` converted to a 0/1 mask of dtype
          `tffloat`.

    Raises:
      ValueError: if `self.second_expert_policy` is neither 'all' nor
        'random'.
    """
    logits = tf.einsum('GSM,ME->GSE', inputs, self.gating_weight)  # GSE
    raw_gates = tf.nn.softmax(logits)  # softmax along E, GSE
    tf.logging.info("raw_gates:{}".format(raw_gates))

    # Round the capacity up to a multiple of 4.
    while self.expert_capacity_dim % 4:
        self.expert_capacity_dim += 1
        tf.logging.info(
            'Setting expert_capacity_dim=%r ('
            'num_experts=%r name_scope=%r)',
            self.expert_capacity_dim, self.num_experts,
            tf.get_default_graph().get_name_scope())

    # Best expert per token: index, one-hot mask and gate value.
    top_gate_index_1 = tf.math.argmax(
        raw_gates, axis=-1, output_type=tf.int32)  # GS
    mask_1 = tf.one_hot(
        top_gate_index_1, self.num_experts, dtype=tffloat)  # GSE
    density_1_proxy = raw_gates
    # All ones, so every token carries the same weight in the densities below.
    importance = tf.ones_like(mask_1[:, :, 0])  # GS
    gate_1 = tf.einsum('GSE,GSE->GS', raw_gates, mask_1)  # GS

    # Second-best expert: zero out the winner and take the argmax again.
    gates_without_top_1 = raw_gates * (1.0 - mask_1)
    top_gate_index_2 = tf.math.argmax(
        gates_without_top_1, axis=-1, output_type=tf.int32)  # GS
    mask_2 = tf.one_hot(
        top_gate_index_2, self.num_experts, dtype=tffloat)  # GSE
    gate_2 = tf.einsum('GSE,GSE->GS', gates_without_top_1, mask_2)  # GS

    # Exclusive cumulative sum along S: for each expert, the number of earlier
    # tokens of the group that already chose it, i.e. the token's slot index.
    position_in_expert_1 = tf.cumsum(mask_1, exclusive=True, axis=1)  # GSE
    capacity = tf.cast(self.expert_capacity_dim,
                       dtype=position_in_expert_1.dtype)

    # density_1[:, e] is the fraction of tokens whose top-1 expert is e,
    # before the capacity limit is applied (GE, S reduced out).
    density_denom = tf.reduce_mean(importance, axis=(1))[:, tf.newaxis] + 1e-6
    density_1 = tf.reduce_mean(mask_1, axis=(1)) / density_denom
    # density_1_proxy[:, e] is the mean raw gate of expert e over all tokens,
    # including tokens that were not assigned to e.
    density_1_proxy = tf.reduce_mean(density_1_proxy, axis=1) / density_denom

    with tf.name_scope('aux_loss'):
        # The MoE paper (https://arxiv.org/pdf/1701.06538.pdf) uses an aux loss
        # of reduce_mean(density_1_proxy * density_1_proxy). Here one factor is
        # replaced by the discrete density_1, following mesh_tensorflow.
        aux_loss = tf.reduce_mean(density_1_proxy * density_1)  # element-wise
        aux_loss *= self.num_experts * self.num_experts  # const coefficient

    # Drop top-1 assignments that do not fit into the expert's capacity.
    mask_1 *= tf.cast(tf.less(position_in_expert_1, capacity),
                      dtype=mask_1.dtype)
    position_in_expert_1 = tf.einsum('GSE,GSE->GS', position_in_expert_1,
                                     mask_1)
    # Number of tokens of each group routed to each expert as first choice.
    mask_1_count = tf.einsum('GSE->GE', mask_1)
    # GS: 1 where the token kept its first expert, 0 where it did not fit.
    mask_1_flat = tf.einsum('GSE->GS', mask_1)

    if self.second_expert_policy == 'all':
        pass
    elif self.second_expert_policy == 'random':
        # gate_2 lies in [0, 1]. If it exceeds second_expert_threshold the
        # token is always dispatched to its second expert; otherwise it is
        # dispatched with probability gate_2 / threshold.
        #
        # tf.shape() is used rather than the static gate_2.shape so that the
        # sampling also works when a dimension is unknown at graph
        # construction time.
        sampled_2 = tf.less(
            tf.random.uniform(tf.shape(gate_2), dtype=gate_2.dtype),
            (gate_2 / max(self.second_expert_threshold, 1e-9)))
        gate_2 *= tf.cast(sampled_2, gate_2.dtype)
        mask_2 *= tf.cast(tf.expand_dims(sampled_2, -1), mask_2.dtype)
    else:
        raise ValueError(self.second_expert_policy)

    # Second-choice slots start after the slots taken by first choices.
    position_in_expert_2 = tf.cumsum(
        mask_2, exclusive=True, axis=1) + tf.expand_dims(mask_1_count, 1)
    mask_2 *= tf.cast(tf.less(position_in_expert_2, capacity), mask_2.dtype)
    position_in_expert_2 = tf.einsum('GSE,GSE->GS', position_in_expert_2,
                                     mask_2)
    mask_2_flat = tf.reduce_sum(mask_2, axis=-1)

    gate_1 *= mask_1_flat
    gate_2 *= mask_2_flat

    # Normalize the two surviving gates so they sum to one; a zero
    # denominator (both assignments dropped) is replaced by one.
    denom = gate_1 + gate_2
    denom = tf.where(denom > 0, denom, tf.ones_like(denom))
    gate_1 /= denom
    gate_2 /= denom

    # First part of the combine tensor: top-1 gate at (expert, slot).
    b = tf.one_hot(tf.cast(position_in_expert_1, dtype=tf.int32),
                   self.expert_capacity_dim,
                   dtype=tffloat,
                   name='one_hot_b_0')  # GSC
    a = tf.expand_dims(gate_1 * mask_1_flat, -1) * tf.one_hot(
        top_gate_index_1, self.num_experts, dtype=tffloat)  # GSE
    first_part_of_combine_tensor = tf.einsum(
        'GSE,GSC->GSEC', a, b, name='first_part_of_combine_tensor')  # GSEC

    # Second part of the combine tensor: top-2 gate at (expert, slot).
    b = tf.one_hot(tf.cast(position_in_expert_2, dtype=tf.int32),
                   self.expert_capacity_dim,
                   dtype=tffloat,
                   name='one_hot_b_1')  # GSC
    a = tf.expand_dims(gate_2 * mask_2_flat, -1) * tf.one_hot(
        top_gate_index_2, self.num_experts, dtype=tffloat)  # GSE
    second_part_of_combine_tensor = tf.einsum(
        'GSE,GSC->GSEC', a, b, name='second_part_of_combine_tensor')  # GSEC

    combine_tensor = tf.math.add(first_part_of_combine_tensor,
                                 second_part_of_combine_tensor,
                                 name='combine_tensor')  # GSEC
    # Non-zero combine weights mark the (expert, slot) a token is sent to.
    dispatch_mask = tf.cast(tf.cast(combine_tensor, tf.bool),
                            tffloat,
                            name='dispatch_mask')  # GSEC

    return aux_loss, combine_tensor, dispatch_mask
def skew_elements_right(tensor: tf.Tensor,
                        axis: int,
                        pad_value=0,
                        name: Optional[Text] = None) -> tf.Tensor:
    """Skews successive elements right along the given `axis`.

    This changes an input like
    [
      [1, 2, 3],
      [4, 5, 6],
      [7, 8, 9]
    ]
    into the following:
    [
      [1, 2, 3, 0, 0],
      [0, 4, 5, 6, 0],
      [0, 0, 7, 8, 9]
    ]

    Args:
      tensor: Tensor of shape [..., num_rows, axis_len, ...].
      axis: A valid axis in `tensor` to skew along. It must not be the first
        axis in `tensor`. Negative values count from the end.
      pad_value: The scalar pad value to use. Defaults to 0. Must be the same
        type as `tensor`.
      name: A name for the operation (optional).

    Returns:
      Tensor of shape [..., num_rows, axis_len + num_rows - 1, ...].

    Raises:
      ValueError: if the static rank of `tensor` is unknown, or if `axis`
        (after resolving a negative value) is the first axis or lies outside
        the rank of `tensor`.
    """
    with tf.name_scope(name or 'skew_elements_right'):
        tensor = tf.convert_to_tensor(tensor)

        # Validate before touching the shape, so that a bad `axis` surfaces as
        # the documented ValueError rather than an IndexError from indexing.
        rank = tensor.shape.rank
        if rank is None:
            raise ValueError('Static rank of `tensor` must be known.')
        if axis < 0:
            axis += rank
        if axis <= 0 or axis >= rank:
            raise ValueError('`axis` out of bounds for `tensor` rank.')

        shape_list = get_shape_list(tensor)
        num_rows = shape_list[axis - 1]
        axis_len = shape_list[axis]
        output_len = axis_len + num_rows - 1

        # one_hot of index -1 is an all-zero column, so this pads nothing in
        # front and `num_rows` elements after, on `axis` only.
        paddings = num_rows * tf.one_hot(
            [-1, axis], rank, axis=0, dtype=tf.int32)

        # [..., num_rows, axis_len + num_rows, ...]
        padded_tensor = tf.pad(tensor, paddings, constant_values=pad_value)

        # [..., num_rows * (axis_len + num_rows), ...]
        flat_tensor = flatten_dims(
            padded_tensor, first_dim=axis - 1, last_dim=axis)

        padded_tensor2 = pad_to_multiple(
            flat_tensor,
            factor=output_len,
            axis=axis - 1,
            constant_values=pad_value)

        # Re-reading the flat data with rows that are one element shorter than
        # the padded rows shifts each successive row one step to the right.
        # [..., num_rows + 1, output_len, ...]
        new_shape = tf.concat([
            tf.shape(tensor)[:(axis - 1)], [num_rows + 1, output_len],
            tf.shape(tensor)[(axis + 1):]
        ], 0)
        reshaped_tensor = tf.reshape(padded_tensor2, new_shape)

        # Drop the extra trailing row.
        # [..., num_rows, output_len, ...]
        output_shape = new_shape - tf.one_hot(
            axis - 1, depth=rank, dtype=tf.int32)
        return tf.slice(
            reshaped_tensor,
            begin=tf.zeros_like(output_shape),
            size=output_shape)
def gating_internel(self, inputs, total_token_num):
    """Compute switch (top-1) expert gating for a batch of token groups.

    Einsum letters used below: G = group, S = position inside the group,
    M = feature dim of `inputs`, E = expert, C = capacity slot.

    Side effect: when `self.expert_capacity_dim` is unset (falsy) it is
    computed from `total_token_num`, `self.num_experts` and the capacity
    factor of the current mode, and stored on `self`.

    Args:
      inputs: Tensor contracted with `self.gating_weight` through
        'GSM,ME->GSE' to obtain the gating logits.
      total_token_num: Token count used to derive the per-expert capacity;
        converted with `int()`.

    Returns:
      A tuple `(aux_loss, combine_tensor, dispatch_mask)` where
        aux_loss: scalar auxiliary loss,
          mean(density_1_proxy * density_1) * num_experts ** 2 * loss_coef.
        combine_tensor: GSEC tensor holding, for every token, its gate value
          at the (expert, capacity slot) it was assigned to.
        dispatch_mask: `combine_tensor` converted to a 0/1 mask of dtype
          `tffloat`.

    Raises:
      ValueError: if the gating policy of the current mode is neither
        "argmax" nor "input_dropout".
    """
    import math  # Local import: only needed for the capacity rounding.

    if self.is_training:
        policy = self.switch_policy_train
        capacity_factor = self.capacity_factor_train
    else:
        policy = self.switch_policy_eval
        capacity_factor = self.capacity_factor_eval

    if not self.expert_capacity_dim:
        # Tokens per expert scaled by the capacity factor, rounded up and
        # bounded below by the configured minimum.
        capacity = float(
            int(total_token_num) / int(self.num_experts)) * float(
                capacity_factor)
        self.expert_capacity_dim = max(int(math.ceil(capacity)),
                                       self.min_expert_capacity)
        tf.logging.info(
            'Setting expert_capacity_dim=%r ('
            'num_experts=%r name_scope=%r)',
            self.expert_capacity_dim, self.num_experts,
            tf.get_default_graph().get_name_scope())

    # Reject an unknown policy before any gating ops are added to the graph.
    if policy not in ("argmax", "input_dropout"):
        raise ValueError("Unknown Switch gating policy %s" % policy)

    if self.is_training and policy == "input_dropout":
        inputs = tf.nn.dropout(inputs, 1.0 - self.switch_dropout)

    logits = tf.einsum('GSM,ME->GSE', inputs, self.gating_weight)  # GSE
    raw_gates = tf.nn.softmax(logits)  # softmax along E, GSE

    _, expert_index = tf.math.top_k(raw_gates, k=1)
    expert_index = tf.squeeze(expert_index, [2])  # GS

    expert_mask = tf.one_hot(
        expert_index, self.num_experts, dtype=tffloat)  # GSE
    density_1_proxy = raw_gates  # GSE
    # All ones, so every token carries the same weight in the densities below.
    importance = tf.ones_like(expert_mask[:, :, 0])  # GS
    gate_1 = tf.einsum('GSE,GSE->GS', raw_gates, expert_mask)  # GS

    # Exclusive cumulative sum along S: for each expert, the number of earlier
    # tokens of the group that already chose it, i.e. the token's slot index.
    position_in_expert_1 = tf.cumsum(
        expert_mask, exclusive=True, axis=1)  # GSE
    capacity = tf.cast(self.expert_capacity_dim,
                       dtype=position_in_expert_1.dtype)

    # density_1[:, e] is the fraction of tokens whose top-1 expert is e,
    # before the capacity limit is applied (GE, S reduced out).
    density_denom = tf.reduce_mean(importance, axis=(1))[:, tf.newaxis] + 1e-6
    density_1 = tf.reduce_mean(expert_mask, axis=(1)) / density_denom
    # density_1_proxy[:, e] is the mean raw gate of expert e over all tokens,
    # including tokens that were not assigned to e.
    density_1_proxy = tf.reduce_mean(density_1_proxy, axis=1) / density_denom

    with tf.name_scope('aux_loss'):
        # The MoE paper (https://arxiv.org/pdf/1701.06538.pdf) uses an aux loss
        # of reduce_mean(density_1_proxy * density_1_proxy). Here one factor is
        # replaced by the discrete density_1, following mesh_tensorflow.
        aux_loss = tf.reduce_mean(density_1_proxy * density_1)  # element-wise
        aux_loss *= (self.num_experts * self.num_experts *
                     self.loss_coef)  # const coefficient

    # Drop assignments that do not fit into the expert's capacity.
    expert_mask *= tf.cast(tf.less(position_in_expert_1, capacity),
                           dtype=expert_mask.dtype)
    position_in_expert_1 = tf.einsum('GSE,GSE->GS', position_in_expert_1,
                                     expert_mask)

    # GS: 1 where the token kept its expert, 0 where it did not fit.
    mask_1_flat = tf.einsum('GSE->GS', expert_mask)
    gate_1 *= mask_1_flat

    # Place each surviving gate value at its (expert, slot) position.
    b = tf.one_hot(tf.cast(position_in_expert_1, dtype=tf.int32),
                   self.expert_capacity_dim,
                   dtype=tffloat,
                   name='one_hot_b_0')  # GSC
    a = tf.expand_dims(gate_1 * mask_1_flat, -1) * tf.one_hot(
        expert_index, self.num_experts, dtype=tffloat)  # GSE
    combine_tensor = tf.einsum(
        'GSE,GSC->GSEC', a, b, name='first_part_of_combine_tensor')  # GSEC

    # Non-zero combine weights mark the (expert, slot) a token is sent to.
    dispatch_mask = tf.cast(tf.cast(combine_tensor, tf.bool),
                            tffloat,
                            name='dispatch_mask')  # GSEC

    return aux_loss, combine_tensor, dispatch_mask
def compute_label_loss(logits, labels, num_classes=5):
    """Return the mean softmax cross-entropy between `logits` and `labels`.

    Args:
      logits: Tensor of unnormalized class scores; the softmax is taken over
        the last axis.
      labels: Integer class indices, one-hot encoded internally.
      num_classes: Depth of the one-hot encoding. Defaults to 5, the value
        that used to be hard-coded; it has to match the last dimension of
        `logits`.

    Returns:
      A scalar tensor: the negative log-probability of the labelled class,
      averaged over all remaining axes.
    """
    one_hot_labels = tf.one_hot(labels, depth=num_classes, dtype=tf.float32)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    per_example = tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    return -tf.reduce_mean(per_example)
def test_train(args):
    """Builds the training graph and runs it until a stop hook fires.

    A rate-distortion trade-off level is drawn uniformly from [0, 64) on every
    step; its one-hot encoding conditions all four transforms and the matching
    lambda weights the distortion term of the loss.

    Args:
      args: Namespace providing `verbose`, `train_glob`, `preprocess_threads`,
        `patchsize`, `batchsize`, `num_filters`, `last_step` and
        `checkpoint_dir`.

    Raises:
      RuntimeError: if `args.train_glob` matches no files.
    """
    if args.verbose:
        tf.logging.set_verbosity(tf.logging.INFO)

    # Input pipeline: shuffled, endlessly repeated random crops, on the CPU.
    with tf.device("/cpu:0"):
        image_paths = glob.glob(args.train_glob)
        if not image_paths:
            raise RuntimeError(
                "No training images found with glob '{}'.".format(
                    args.train_glob))
        crop_shape = (args.patchsize, args.patchsize, 3)
        train_dataset = (
            tf.data.Dataset.from_tensor_slices(image_paths)
            .shuffle(buffer_size=len(image_paths))
            .repeat()
            .map(read_png, num_parallel_calls=args.preprocess_threads)
            .map(lambda image: tf.random_crop(image, crop_shape))
            .batch(args.batchsize)
            .prefetch(32))

    pixels_per_batch = args.batchsize * args.patchsize**2

    # One batch of training patches.
    x = train_dataset.make_one_shot_iterator().get_next()

    # Random trade-off level, its one-hot code and the derived lambda.
    level = tf.random_uniform([], minval=0, maxval=64, dtype=tf.int32)
    level_onehot = tf.one_hot(tf.reshape(level, [1]), depth=64)
    lmbda = 0.1 * tf.pow(2.0, tf.cast(level, tf.float32) / 8.0 - 7.0)

    # Model components, all conditioned on the one-hot level.
    analysis = AnalysisTransform(args.num_filters, level_onehot)
    synthesis = SynthesisTransform(args.num_filters, level_onehot)
    hyper_analysis = HyperAnalysisTransform(args.num_filters, level_onehot)
    hyper_synthesis = HyperSynthesisTransform(args.num_filters, level_onehot)
    entropy_bottleneck = tfc.EntropyBottleneck()

    # Autoencoder with hyperprior.
    y = analysis(x)
    z = hyper_analysis(abs(y))
    z_tilde, z_likelihoods = entropy_bottleneck(z, training=True)
    sigma = hyper_synthesis(z_tilde)
    scale_table = np.exp(
        np.linspace(np.log(SCALES_MIN), np.log(SCALES_MAX), SCALES_LEVELS))
    conditional_bottleneck = tfc.GaussianConditional(sigma, scale_table)
    y_tilde, y_likelihoods = conditional_bottleneck(y, training=True)
    x_tilde = synthesis(y_tilde)

    # Rate: total number of bits divided by the number of pixels.
    log_likelihood_sum = (tf.reduce_sum(tf.log(y_likelihoods)) +
                          tf.reduce_sum(tf.log(z_likelihoods)))
    train_bpp = log_likelihood_sum / (-np.log(2) * pixels_per_batch)

    # Distortion: mean squared error, multiplied by 255^2 to correct for
    # rescaling.
    train_mse = tf.reduce_mean(tf.squared_difference(x, x_tilde)) * 255**2

    # Rate-distortion cost.
    train_loss = lmbda * train_mse + train_bpp

    # One training step = main update + auxiliary update + bottleneck update.
    step = tf.train.create_global_step()
    main_optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
    main_step = main_optimizer.minimize(train_loss, global_step=step)
    aux_optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
    aux_step = aux_optimizer.minimize(entropy_bottleneck.losses[0])
    train_op = tf.group(main_step, aux_step, entropy_bottleneck.updates[0])

    scalar_summaries = (
        ("loss", train_loss),
        ("bpp", train_bpp),
        ("mse", train_mse),
        ("lambda", level),
    )
    for tag, value in scalar_summaries:
        tf.summary.scalar(tag, value)
    image_summaries = (
        ("original", x),
        ("reconstruction", x_tilde),
    )
    for tag, image in image_summaries:
        tf.summary.image(tag, quantize_image(image))

    hooks = [
        tf.train.StopAtStepHook(last_step=args.last_step),
        tf.train.NanTensorHook(train_loss),
    ]
    session = tf.train.MonitoredTrainingSession(
        hooks=hooks,
        checkpoint_dir=args.checkpoint_dir,
        save_checkpoint_secs=300,
        save_summaries_secs=60)
    with session as sess:
        while not sess.should_stop():
            sess.run(train_op)
def detection_loss(cls_outputs, box_outputs, labels, params):
    """Computes total detection loss.

    Computes total detection loss including box and class loss from all
    levels. `labels` is only read: for the 'channels_first' data format the
    targets are transposed into local tensors, so the caller's dictionary is
    left untouched and the function can be called repeatedly on it.

    Args:
      cls_outputs: an OrderedDict with keys representing levels and values
        representing logits in [batch_size, height, width, num_anchors].
      box_outputs: an OrderedDict with keys representing levels and values
        representing box regression targets in
        [batch_size, height, width, num_anchors * 4].
      labels: the dictionary returned from the dataloader that includes
        groundtruth targets ('mean_num_positives', 'cls_targets_<level>' and
        'box_targets_<level>').
      params: the dictionary of training parameters; reads 'data_format',
        'num_classes', 'alpha', 'gamma', 'delta' and 'box_loss_weight'.

    Returns:
      total_loss: a scalar tensor, the class loss plus
        `params['box_loss_weight']` times the box loss.
      cls_loss: a scalar tensor representing total class loss.
      box_loss: a scalar tensor representing total box regression loss.
    """
    # Sum all positives in a batch for normalization and avoid zero
    # num_positives_sum, which would lead to inf loss during training.
    num_positives_sum = tf.reduce_sum(labels['mean_num_positives']) + 1.0
    channels_first = params['data_format'] == 'channels_first'

    cls_losses = []
    box_losses = []
    for level in cls_outputs.keys():
        cls_targets = labels['cls_targets_%d' % level]
        box_targets = labels['box_targets_%d' % level]
        if channels_first:
            # Local copies only; the entries of `labels` are not overwritten.
            cls_targets = tf.transpose(cls_targets, [0, 3, 1, 2])
            box_targets = tf.transpose(box_targets, [0, 3, 1, 2])

        # One-hot encoding for classification labels, with the class axis
        # folded into the anchor axis.
        cls_targets_at_level = tf.one_hot(cls_targets, params['num_classes'])
        if channels_first:
            bs, _, width, height, _ = (
                cls_targets_at_level.get_shape().as_list())
            cls_targets_at_level = tf.reshape(cls_targets_at_level,
                                              [bs, -1, width, height])
        else:
            bs, width, height, _, _ = (
                cls_targets_at_level.get_shape().as_list())
            cls_targets_at_level = tf.reshape(cls_targets_at_level,
                                              [bs, width, height, -1])

        cls_loss = _classification_loss(
            cls_outputs[level],
            cls_targets_at_level,
            num_positives_sum,
            alpha=params['alpha'],
            gamma=params['gamma'])
        if channels_first:
            cls_loss = tf.reshape(
                cls_loss, [bs, -1, width, height, params['num_classes']])
        else:
            cls_loss = tf.reshape(
                cls_loss, [bs, width, height, -1, params['num_classes']])
        # Targets equal to -2 contribute nothing to the class loss.
        cls_loss *= tf.cast(
            tf.expand_dims(tf.not_equal(cls_targets, -2), -1), tf.float32)
        cls_losses.append(tf.reduce_sum(cls_loss))

        box_losses.append(
            _box_loss(
                box_outputs[level],
                box_targets,
                num_positives_sum,
                delta=params['delta']))

    # Sum per level losses to total loss.
    cls_loss = tf.add_n(cls_losses)
    box_loss = tf.add_n(box_losses)
    total_loss = cls_loss + params['box_loss_weight'] * box_loss
    return total_loss, cls_loss, box_loss
def test_compress(args):
    """Evaluates the latest checkpoint on kodak/*.png at all 64 levels.

    For every trade-off level 0..63 and every image, one CSV row with the
    level, file name, bits per pixel, mean squared error and pixel count is
    written to "f6.csv" in the current directory.

    Args:
      args: Namespace providing `num_filters` and `checkpoint_dir`.
    """
    # Load input image and add batch dimension.
    fn = tf.placeholder(tf.string, [])
    x = read_png(fn)
    x = tf.expand_dims(x, 0)
    x.set_shape([1, None, None, 3])
    x_shape = tf.shape(x)

    # The random level only acts as a default; it is overridden through
    # feed_dict in the evaluation loop below.
    lmbda_level = tf.random_uniform([], minval=0, maxval=64, dtype=tf.int32)
    lmbda_onehot = tf.one_hot(tf.reshape(lmbda_level, [1]), depth=64)
    lmbda = 0.1 * tf.pow(2.0, tf.cast(lmbda_level, tf.float32) / 8.0 - 7.0)

    # Instantiate model.
    analysis_transform = AnalysisTransform(args.num_filters, lmbda_onehot)
    synthesis_transform = SynthesisTransform(args.num_filters, lmbda_onehot)
    hyper_analysis_transform = HyperAnalysisTransform(args.num_filters,
                                                      lmbda_onehot)
    hyper_synthesis_transform = HyperSynthesisTransform(
        args.num_filters, lmbda_onehot)
    entropy_bottleneck = tfc.EntropyBottleneck()

    # Transform and compress the image.
    y = analysis_transform(x)
    y_shape = tf.shape(y)
    z = hyper_analysis_transform(abs(y))
    z_hat, z_likelihoods = entropy_bottleneck(z, training=False)
    sigma = hyper_synthesis_transform(z_hat)
    sigma = sigma[:, :y_shape[1], :y_shape[2], :]
    scale_table = np.exp(
        np.linspace(np.log(SCALES_MIN), np.log(SCALES_MAX), SCALES_LEVELS))
    conditional_bottleneck = tfc.GaussianConditional(sigma, scale_table)
    # NOTE: the compressed strings, lmbda, psnr and msssim are built but not
    # fetched below; they are kept so the graph matches the original.
    side_string = entropy_bottleneck.compress(z)
    string = conditional_bottleneck.compress(y)

    # Transform the quantized image back.
    y_hat, y_likelihoods = conditional_bottleneck(y, training=False)
    x_hat = synthesis_transform(y_hat)
    x_hat = x_hat[:, :x_shape[1], :x_shape[2], :]

    num_pixels = tf.cast(tf.reduce_prod(tf.shape(x)[:-1]), dtype=tf.float32)

    # Total number of bits divided by number of pixels.
    eval_bpp = (tf.reduce_sum(tf.log(y_likelihoods)) + tf.reduce_sum(
        tf.log(z_likelihoods))) / (-np.log(2) * num_pixels)

    # Bring both images back to 0..255 range.
    x *= 255
    x_hat = tf.clip_by_value(x_hat, 0, 1)
    x_hat = tf.round(x_hat * 255)

    mse = tf.reduce_mean(tf.squared_difference(x, x_hat))
    psnr = tf.squeeze(tf.image.psnr(x_hat, x, 255))
    msssim = tf.squeeze(tf.image.ssim_multiscale(x_hat, x, 255))

    fetches = [lmbda_level, eval_bpp, mse, num_pixels]
    with tf.Session() as sess:
        # Load the latest model checkpoint.
        latest = tf.train.latest_checkpoint(checkpoint_dir=args.checkpoint_dir)
        tf.train.Saver().restore(sess, save_path=latest)

        # The image list does not depend on the level, so glob only once.
        filenames = glob.glob("kodak/*.png")

        # The context manager closes the file even if a run fails midway.
        with open("f6.csv", "w") as f:
            print("level, fn, bpp, mse, np", file=f)
            for i in range(64):
                for filename in filenames:
                    v_lmbda_level, v_eval_bpp, v_mse, v_num_pixels = sess.run(
                        fetches,
                        feed_dict={
                            fn: filename,
                            lmbda_level: i
                        })
                    print(
                        "%.2f, %s, %.4f, %.4f, %d" %
                        (v_lmbda_level, filename, v_eval_bpp, v_mse,
                         v_num_pixels),
                        file=f)
def onehot_labels(self):
    """Return `self.labels` one-hot encoded with depth `NUM_CLASSES`."""
    encoded = tf.one_hot(indices=self.labels, depth=NUM_CLASSES)
    return encoded
def __init__(self,
             env,
             q_func,
             optimizer_spec,
             session,
             exploration=LinearSchedule(1000000, 0.1),
             stopping_criterion=None,
             replay_buffer_size=1000000,
             batch_size=32,
             gamma=0.99,
             learning_starts=50000,
             learning_freq=4,
             frame_history_len=4,
             target_update_freq=10000,
             grad_norm_clipping=10,
             rew_file=None,
             lander=False):
    """Run Deep Q-learning algorithm.

    Builds the Q-network, the target network, the Bellman-error loss, the
    training and target-update ops and the replay buffer, then resets the
    environment.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on. Its observation space must be a `Box`
        and its action space a `Discrete`.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            img_in: tf.Tensor
                tensorflow tensor representing the input image
            num_actions: int
                number of actions
            scope: str
                scope in which all the model related variables
                should be created
            reuse: bool
                whether previously created variables should be reused.
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate
        schedule for the optimizer
    session: tf.Session
        tensorflow session to use.
    exploration: rl_algs.deepq.utils.schedules.Schedule
        schedule for probability of chosing random action.
    stopping_criterion: (env, t) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience
        replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    grad_norm_clipping: float or None
        If not None gradients' norms are clipped to this value.
    rew_file: str or None
        Name stored in `self.rew_file`. If None, a random
        '<uuid4>.pkl' name is generated.
    lander: bool
        If True, observations are fed as float32 and used unscaled;
        otherwise they are fed as uint8 and divided by 255. The flag is
        also forwarded to the replay buffer.
    """
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of the supported spaces.
    assert isinstance(env.observation_space, gym.spaces.Box)
    assert isinstance(env.action_space, gym.spaces.Discrete)

    self.target_update_freq = target_update_freq
    self.optimizer_spec = optimizer_spec
    self.batch_size = batch_size
    self.learning_freq = learning_freq
    self.learning_starts = learning_starts
    self.stopping_criterion = stopping_criterion
    self.env = env
    self.session = session
    self.exploration = exploration
    self.rew_file = str(
        uuid.uuid4()) + '.pkl' if rew_file is None else rew_file

    ###############
    # BUILD MODEL #
    ###############

    if len(self.env.observation_space.shape) == 1:
        # Low-dimensional observations (e.g. RAM): used as they are.
        input_shape = self.env.observation_space.shape
    else:
        # Images: the frame history is stacked along the channel axis.
        img_h, img_w, img_c = self.env.observation_space.shape
        input_shape = (img_h, img_w, frame_history_len * img_c)
    self.num_actions = self.env.action_space.n

    # Placeholders.
    obs_dtype = tf.float32 if lander else tf.uint8
    # current observation (or state)
    self.obs_t_ph = tf.placeholder(obs_dtype, [None] + list(input_shape))
    # current action
    self.act_t_ph = tf.placeholder(tf.int32, [None])
    # current reward
    self.rew_t_ph = tf.placeholder(tf.float32, [None])
    # next observation (or state)
    self.obs_tp1_ph = tf.placeholder(obs_dtype, [None] + list(input_shape))
    # End-of-episode mask: 1 if the next state ends the episode, in which
    # case there is no Q-value at the next state and the target is just
    # rew_t_ph, not rew_t_ph + gamma * q_tp1.
    self.done_mask_ph = tf.placeholder(tf.float32, [None])

    # Casting to float inside the graph lets image observations be fed as
    # uint8, which lowers data transfer times.
    if lander:
        obs_t_float = self.obs_t_ph
        obs_tp1_float = self.obs_tp1_ph
    else:
        obs_t_float = tf.cast(self.obs_t_ph, tf.float32) / 255.0
        obs_tp1_float = tf.cast(self.obs_tp1_ph, tf.float32) / 255.0

    # Online network on the current observation, target network on the next.
    self.q_t = q_func(obs_t_float,
                      self.num_actions,
                      scope="q_func",
                      reuse=False)
    self.q_tp1 = q_func(obs_tp1_float,
                        self.num_actions,
                        scope="target_q_func",
                        reuse=False)

    # Q-value of the action that was actually taken.
    max_q_t = tf.reduce_sum(
        self.q_t * tf.one_hot(self.act_t_ph, self.num_actions), axis=1)

    # Bellman target: reward plus the discounted best next-state Q-value of
    # the target network, dropped for terminal transitions.
    max_q = tf.reduce_max(self.q_tp1, axis=1)
    target = self.rew_t_ph + gamma * (1.0 - self.done_mask_ph) * max_q
    self.total_error = tf.reduce_mean(huber_loss(target - max_q_t))

    q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                    scope="q_func")
    target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope="target_q_func")

    # Optimization op (with gradient clipping); only the online network's
    # variables are trained.
    self.learning_rate = tf.placeholder(tf.float32, (),
                                        name="learning_rate")
    optimizer = self.optimizer_spec.constructor(
        learning_rate=self.learning_rate, **self.optimizer_spec.kwargs)
    self.train_fn = minimize_and_clip(optimizer,
                                      self.total_error,
                                      var_list=q_func_vars,
                                      clip_val=grad_norm_clipping)

    # update_target_fn is called periodically to copy the Q network into the
    # target Q network; variables are paired by sorted name.
    update_target_fn = [
        var_target.assign(var)
        for var, var_target in zip(
            sorted(q_func_vars, key=lambda v: v.name),
            sorted(target_q_func_vars, key=lambda v: v.name))
    ]
    self.update_target_fn = tf.group(*update_target_fn)

    # Replay buffer.
    self.replay_buffer = ReplayBuffer(replay_buffer_size,
                                      frame_history_len,
                                      lander=lander)
    self.replay_buffer_idx = None

    ###############
    # RUN ENV     #
    ###############
    self.model_initialized = False
    self.num_param_updates = 0
    self.mean_episode_reward = -float('nan')
    self.best_mean_episode_reward = -float('inf')
    self.last_obs = self.env.reset()
    self.log_every_n_steps = 10000

    self.start_time = time.time()
    self.t = 0
def lstm_decoder_infer(self,
                       inputs,
                       sequence_length,
                       hparams,
                       clss,
                       train,
                       initial_state=None,
                       bottleneck=None):
    """Decode step by step with the LSTM inside a `tf.while_loop`.

    Used in predict mode: each iteration feeds the latest sample (obtained by
    passing the logits produced so far through the targets top), together
    with the conditioning vector, through the pre-LSTM dense layer and the
    LSTM stack, until the decode length limit of 51 is reached.

    Args:
      inputs: Tensor from which only the batch size (first dimension) is
        read.
      sequence_length: Sequence lengths; flattened and one-hot encoded into
        the conditioning vector when `hparams.condition_on_sln` is set.
      hparams: Hyperparameters; reads `num_hidden_layers`, `use_cls`,
        `condition_on_sln`, `num_categories`, `bottleneck_bits` and
        `hidden_size`.
      clss: Class ids; flattened, replaced by zeros when `hparams.use_cls` is
        false, then one-hot encoded into the conditioning vector.
      train: Forwarded to `self.lstm_cell`.
      initial_state: Required. Per-layer LSTM states, each exposing `.c` and
        `.h`.
      bottleneck: Tensor that the class (and optionally length) one-hots are
        concatenated to along the last axis. NOTE(review): defaults to None
        but appears to be required in practice -- confirm with callers.

    Returns:
      A tuple `(logits, final_state)`: the accumulated logits after
      `common_layers.flatten4d3d`, and the final LSTM state as a tuple of
      `(c, h)` pairs, one per layer.

    Raises:
      ValueError: if `initial_state` is None.
    """
    # Fail before any graph ops are created.
    if initial_state is None:
        raise ValueError('initial state should be init from bottleneck!')

    # IN PREDICT MODE, RUN tf.while RNN
    max_decode_length = 51
    batch_size = common_layers.shape_list(inputs)[0]
    zero_pad, logits_so_far = self.create_initial_input_for_decode(
        batch_size)

    layers = rnn.MultiRNNCell([
        self.lstm_cell(hparams, train)
        for _ in range(hparams.num_hidden_layers)
    ])

    # Append the one-hot class (and optionally the one-hot sequence length)
    # to the bottleneck; the result is fed to the LSTM at every step.
    clss = tf.reshape(clss, [-1])
    if not hparams.use_cls:
        clss = tf.zeros_like(clss)
    if hparams.condition_on_sln:
        sln = tf.reshape(sequence_length, [-1])
        bottleneck = tf.concat(
            (bottleneck, tf.one_hot(clss, hparams.num_categories),
             tf.one_hot(sln, max_decode_length)), -1)
        # Width contributed by the sequence-length one-hot above.
        sln_offset = max_decode_length
    else:
        bottleneck = tf.concat(
            (bottleneck, tf.one_hot(clss, hparams.num_categories)), -1)
        sln_offset = 0
    conditioning_depth = (hparams.bottleneck_bits + hparams.num_categories +
                          sln_offset)

    def infer_step(logits_so_far, current_hidden):
        """Inference step of LSTM while loop."""
        # unflatten hidden:
        current_hidden = tuple(
            rnn.LSTMStateTuple(c=s[0], h=s[1]) for s in current_hidden)

        # put logits_so_far through top
        tm = self._problem_hparams.modality['targets']
        # need to reuse top params
        reset_scope = tf.variable_scope(tf.VariableScope(tf.AUTO_REUSE, ''),
                                        reuse=tf.AUTO_REUSE,
                                        auxiliary_name_scope=False)
        top_scope = tf.variable_scope(
            'svg_decoder/{}_modality'.format(tm), reuse=tf.AUTO_REUSE)
        with reset_scope, top_scope:
            samples_so_far = self.hparams.top['targets'](
                logits_so_far, None, self.hparams,
                self.problem_hparams.vocab_size)

        # Prepend a zero pad to the samples. This effectively shifts the
        # samples right but, unlike shift_right, does not remove the last
        # element, so an empty samples_so_far is non-empty after padding.
        samples_so_far = tf.concat([zero_pad, samples_so_far], axis=1)
        shifted_targets = common_layers.flatten4d3d(samples_so_far)
        # The very last element is the actual input to the rnn.
        shifted_targets = shifted_targets[:, -1:, :]

        # Tile the conditioning vector and append it to the step input.
        pre_tile_y = tf.reshape(bottleneck, [
            common_layers.shape_list(bottleneck)[0], 1, conditioning_depth
        ])
        overlay_x = tf.tile(
            pre_tile_y, [1, common_layers.shape_list(shifted_targets)[1], 1])
        step_inputs = tf.concat([shifted_targets, overlay_x], -1)

        seq_len_batch = tf.ones([common_layers.shape_list(step_inputs)[0]])

        # RUN PRE-LSTM LAYER
        with tf.variable_scope('pre_decoder', reuse=tf.AUTO_REUSE):
            step_inputs = tf.layers.dense(
                step_inputs, hparams.hidden_size, name='bottom')
            step_inputs = tf.nn.tanh(step_inputs)

        # RUN LSTM
        with tf.variable_scope('lstm_decoder', reuse=tf.AUTO_REUSE):
            next_step, next_state = tf.nn.dynamic_rnn(
                layers,
                step_inputs,
                seq_len_batch,
                initial_state=current_hidden,
                dtype=tf.float32,
                time_major=False)

        next_step = tf.expand_dims(next_step, [1])
        logits_so_far = tf.concat([logits_so_far, next_step], 1)

        # flatten state
        next_state = tuple((s.c, s.h) for s in next_state)

        return logits_so_far, next_state

    def while_exit_cond(logits_so_far, unused_current_hidden):
        """Keep looping while fewer than max_decode_length steps exist."""
        length = common_layers.shape_list(logits_so_far)[1]
        return length < max_decode_length

    # passing state must be flattened:
    initial_state = tuple((s.c, s.h) for s in initial_state)

    # actually run tf.while:
    logits, final_state = tf.while_loop(
        while_exit_cond,
        infer_step, [logits_so_far, initial_state],
        shape_invariants=[
            tf.TensorShape([None, None, 1, hparams.hidden_size]),
            tuple((s[0].get_shape(), s[1].get_shape())
                  for s in initial_state),
        ],
        back_prop=False,
        parallel_iterations=1)

    # logits should be returned in 3d mode:
    logits = common_layers.flatten4d3d(logits)

    return logits, final_state
import tensorflow.compat.v1 as tf
import numpy as np

# Graph-mode softmax classifier for the "zoo" data set: 16 input features,
# 7 classes.

# tf.placeholder only works in graph mode; under TensorFlow 2.x eager
# execution is on by default and has to be switched off first.
tf.disable_eager_execution()

path = 'https://raw.githubusercontent.com/hunkim/DeepLearningZeroToAll/master/data-04-zoo.csv'
xy = np.genfromtxt(path, delimiter=',', dtype=np.float32)
x_data = xy[:, 0:-1]  # every column but the last: features
y_data = xy[:, [-1]]  # last column, kept 2-D: class label

nb_classes = 7  # 0 ~ 6

X = tf.placeholder(tf.float32, [None, 16])
Y = tf.placeholder(tf.int32, [None, 1])  # 0 ~ 6

# one_hot adds a trailing axis ([None, 1] -> [None, 1, nb_classes]); the
# reshape drops the singleton axis again.
Y_one_hot = tf.one_hot(Y, nb_classes)
Y_one_hot = tf.reshape(Y_one_hot, [-1, nb_classes])

W = tf.Variable(tf.random_normal([16, nb_classes]), name='weight')
b = tf.Variable(tf.random_normal([nb_classes]), name='bias')

# tf.nn.softmax computes softmax activations:
# softmax = exp(logits) / reduce_sum(exp(logits), dim)
logits = tf.matmul(X, W) + b
hypothesis = tf.nn.softmax(logits)

# Cross entropy cost/loss
cost_i = tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                 labels=Y_one_hot)
cost = tf.reduce_mean(cost_i)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(cost)

prediction = tf.argmax(hypothesis, 1)
correct_prediction = tf.equal(prediction, tf.argmax(Y_one_hot, 1))
def train(flags):
  """Training entry point.

  Builds the episodic training graph (classification loss plus a separate
  confidence loss restricted to the 'fc_variance' variables), then runs the
  training loop with periodic summaries, checkpoints and evaluation.

  Args:
    flags: Configuration object. Attributes read here: log_dir, dataset,
      num_classes_train, lr_anneal, init_learning_rate, number_of_steps,
      n_lr_decay, lr_decay_rate, clip_gradient_norm, save_summaries_secs,
      eval_interval_steps, eval_interval_fine_steps, train_batch_size,
      num_shots_train, num_tasks_per_batch and
      number_of_steps_to_early_stop. The function also overwrites
      flags.pretrained_model_dir (set to flags.log_dir) and
      flags.eval_interval_secs (set to 0).

  Raises:
    ValueError: If flags.dataset has no augmentation defined in this function.
    Exception: If flags.lr_anneal is not one of 'const', 'pwc' or 'exp'.
  """
  log_dir = flags.log_dir
  flags.pretrained_model_dir = log_dir
  log_dir = os.path.join(log_dir, 'train')
  flags.eval_interval_secs = 0
  with tf.Graph().as_default():
    global_step = tf.Variable(
        0, trainable=False, name='global_step', dtype=tf.int64)
    global_step_confidence = tf.Variable(
        0, trainable=False, name='global_step_confidence', dtype=tf.int64)

    model = build_model(flags)
    (images_query_pl, labels_query_pl,
     images_support_pl, labels_support_pl) = build_episode_placeholder(flags)

    # Augments the input. Fail early on an unknown dataset instead of hitting
    # an unbound-variable error when the graph is built below.
    if flags.dataset in ('cifar10', 'cifar100'):
      augment_fn = data_loader.augment_cifar
    elif flags.dataset == 'tinyimagenet':
      augment_fn = data_loader.augment_tinyimagenet
    else:
      raise ValueError(
          'No training augmentation defined for dataset: %s' % flags.dataset)
    images_query_pl_aug = augment_fn(images_query_pl, is_training=True)
    images_support_pl_aug = augment_fn(images_support_pl, is_training=True)

    logits, logits_z = build_proto_train_graph(
        images_query=images_query_pl_aug,
        images_support=images_support_pl_aug,
        flags=flags,
        is_training=True,
        model=model)

    # Losses and optimizer
    ## Classification loss
    loss_classification = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=logits,
            labels=tf.one_hot(labels_query_pl, flags.num_classes_train)))

    # Confidence loss: cross entropy of logits_z on the misclassified queries.
    _, top_k_indices = tf.nn.top_k(logits, k=1)
    # Squeeze only the trailing k=1 axis so that a leading dimension of size 1
    # (e.g. a single query) is not collapsed as well.
    pred = tf.squeeze(top_k_indices, axis=-1)
    incorrect_mask = tf.math.logical_not(tf.math.equal(pred, labels_query_pl))
    incorrect_logits_z = tf.boolean_mask(logits_z, incorrect_mask)
    incorrect_labels_z = tf.boolean_mask(labels_query_pl, incorrect_mask)
    # Number of misclassified queries in the batch.
    signal_variance = tf.math.reduce_sum(tf.cast(incorrect_mask, tf.int32))
    loss_variance_incorrect = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            logits=incorrect_logits_z,
            labels=tf.one_hot(incorrect_labels_z, flags.num_classes_train)))
    loss_variance_zero = 0.0
    # With no misclassified query the mean above is taken over an empty
    # tensor, so fall back to a constant zero loss in that case.
    loss_confidence = tf.cond(
        tf.greater(signal_variance, 0),
        lambda: loss_variance_incorrect,
        lambda: loss_variance_zero)

    regu_losses = tf.losses.get_regularization_losses()
    loss = tf.add_n([loss_classification] + regu_losses)

    # Learning rate
    if flags.lr_anneal == 'const':
      learning_rate = flags.init_learning_rate
    elif flags.lr_anneal == 'pwc':
      learning_rate = get_pwc_learning_rate(global_step, flags)
    elif flags.lr_anneal == 'exp':
      lr_decay_step = flags.number_of_steps // flags.n_lr_decay
      learning_rate = tf.train.exponential_decay(
          flags.init_learning_rate,
          global_step,
          lr_decay_step,
          1.0 / flags.lr_decay_rate,
          staircase=True)
    else:
      raise Exception('Not implemented')

    # Optimizer
    optimizer = tf.train.MomentumOptimizer(
        learning_rate=learning_rate, momentum=0.9)
    optimizer_confidence = tf.train.MomentumOptimizer(
        learning_rate=learning_rate, momentum=0.9)

    train_op = contrib_slim.learning.create_train_op(
        total_loss=loss,
        optimizer=optimizer,
        global_step=global_step,
        clip_gradient_norm=flags.clip_gradient_norm)
    # The confidence loss only updates variables whose name contains
    # 'fc_variance'.
    variable_variance = [
        v for v in tf.trainable_variables() if 'fc_variance' in v.name
    ]
    train_op_confidence = contrib_slim.learning.create_train_op(
        total_loss=loss_confidence,
        optimizer=optimizer_confidence,
        global_step=global_step_confidence,
        clip_gradient_norm=flags.clip_gradient_norm,
        variables_to_train=variable_variance)

    tf.summary.scalar('loss', loss)
    tf.summary.scalar('loss_classification', loss_classification)
    tf.summary.scalar('loss_variance', loss_confidence)
    tf.summary.scalar('regu_loss', tf.add_n(regu_losses))
    tf.summary.scalar('learning_rate', learning_rate)
    # Merges all summaries except for pretrain
    summary = tf.summary.merge(
        tf.get_collection('summaries', scope='(?!pretrain).*'))

    # Gets datasets
    few_shot_data_train, test_dataset, train_dataset = get_train_datasets(flags)
    # Defines session and logging
    summary_writer_train = tf.summary.FileWriter(log_dir, flush_secs=1)
    saver = tf.train.Saver(max_to_keep=1, save_relative_paths=True)
    print(saver.saver_def.filename_tensor_name)
    print(saver.saver_def.restore_op_name)
    # pylint: disable=unused-variable
    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()
    supervisor = tf.train.Supervisor(
        logdir=log_dir,
        init_feed_dict=None,
        summary_op=None,
        init_op=tf.global_variables_initializer(),
        summary_writer=summary_writer_train,
        saver=saver,
        global_step=global_step,
        save_summaries_secs=flags.save_summaries_secs,
        save_model_secs=0)

    with supervisor.managed_session() as sess:
      # A positive global step is treated as a resumed run: continue with the
      # step after the stored one.
      checkpoint_step = sess.run(global_step)
      if checkpoint_step > 0:
        checkpoint_step += 1

      eval_interval_steps = flags.eval_interval_steps
      for step in range(checkpoint_step, flags.number_of_steps):
        # Time the episode sampling and feed construction. The timer has to
        # start before the batch is fetched, otherwise dt_batch is always ~0.
        t_batch = time.time()
        (images_query, labels_query,
         images_support, labels_support) = (
             few_shot_data_train.next_few_shot_batch(
                 query_batch_size_per_task=flags.train_batch_size,
                 num_classes_per_task=flags.num_classes_train,
                 num_supports_per_class=flags.num_shots_train,
                 num_tasks=flags.num_tasks_per_batch))

        feed_dict = {
            images_query_pl: images_query.astype(dtype=np.float32),
            labels_query_pl: labels_query,
            images_support_pl: images_support.astype(dtype=np.float32),
            labels_support_pl: labels_support
        }
        dt_batch = time.time() - t_batch

        # Run both train ops in one call. The fetched values are bound to
        # separate names so the graph tensors `loss` / `loss_confidence` are
        # not overwritten; the confidence value is not used afterwards.
        t_train = time.time()
        loss_value, _ = sess.run([train_op, train_op_confidence],
                                 feed_dict=feed_dict)
        dt_train = time.time() - t_train

        if step % 100 == 0:
          summary_str = sess.run(summary, feed_dict=feed_dict)
          summary_writer_train.add_summary(summary_str, step)
          summary_writer_train.flush()
          logging.info('step %d, loss : %.4g, dt: %.3gs, dt_batch: %.3gs', step,
                       loss_value, dt_train, dt_batch)

        # Switch to the fine evaluation interval in the second half of
        # training.
        if float(step) / flags.number_of_steps > 0.5:
          eval_interval_steps = flags.eval_interval_fine_steps

        if eval_interval_steps > 0 and step % eval_interval_steps == 0:
          saver.save(sess, os.path.join(log_dir, 'model'), global_step=step)
          eval(
              flags=flags,
              train_dataset=train_dataset,
              test_dataset=test_dataset)

        # Early stop: leave the loop once the step count exceeds half of the
        # schedule plus the configured extra steps.
        if float(
            step
        ) > 0.5 * flags.number_of_steps + flags.number_of_steps_to_early_stop:
          break
def get_prediction_module(self, bert_model, features, is_training,
                          percent_done):
  """Builds the start/end span prediction head and its loss.

  NOTE(review): several `tf.layers.dense` calls below are unnamed, so their
  variable names depend on creation order. Keep the statement order as is.

  Args:
    bert_model: Object exposing `get_sequence_output()`. The result is checked
      to be rank 3; its first two dimensions are used as batch size and
      sequence length.
    features: Mapping of input tensors. Keys read here: "input_mask",
      "segment_ids", and the task-prefixed `self.name + "_start_positions"`,
      `"_end_positions"` and `"_eid"`, plus `"_is_impossible"` when
      `self.config.answerable_classifier` is set.
    is_training: Python truth value. Only affects the graph when
      `self.config.joint_prediction` is set, where it selects gold start
      positions (training) or the top-k start beam (otherwise).
    percent_done: Not used in this method.

  Returns:
    A tuple `(losses, outputs)`. `losses` is the average of the start and end
    losses (each reduced over the last axis only), plus the weighted
    answerable loss when enabled. `outputs` is a dict holding that loss, the
    start/end logits, the answerable logit, the gold positions, the beam
    top-k log-probs/indices and the example ids.
  """
  final_hidden = bert_model.get_sequence_output()
  final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3)
  batch_size = final_hidden_shape[0]
  seq_length = final_hidden_shape[1]

  # Disabled experiment (BiLSTM / LSTM layers on top of the encoder output):
  # hidden_size = final_hidden_shape[2]
  # lstm_fw = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
  # lstm_bw = tf.keras.layers.LSTM(hidden_size, return_sequences=True, go_backwards=True)
  # biLSTM = tf.keras.layers.Bidirectional(lstm_fw, backward_layer=lstm_bw, merge_mode='concat')
  # final_hidden = biLSTM(final_hidden)
  # biLSTM = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100, return_sequences=True))
  # lstm = tf.keras.layers.LSTM(100, return_sequences=True)
  # linear = tf.keras.layers.Dense(2, activation=None)
  # final_hidden = biLSTM(final_hidden)
  # final_hidden = linear(final_hidden)

  # Mask of candidate answer positions: input_mask * segment_ids, with 1.0
  # added at index 0.
  # NOTE(review): presumably index 0 is the [CLS] token kept as the
  # "no answer" target - confirm against the input pipeline.
  answer_mask = tf.cast(features["input_mask"], tf.float32)
  answer_mask *= tf.cast(features["segment_ids"], tf.float32)
  answer_mask += tf.one_hot(0, seq_length)

  # One scalar start score per position (the size-1 last axis is squeezed).
  start_logits = tf.squeeze(tf.layers.dense(final_hidden, 1), -1)

  # Zero-filled defaults for the beam outputs; only overwritten in the
  # joint-prediction branch below.
  start_top_log_probs = tf.zeros([batch_size, self.config.beam_size])
  start_top_index = tf.zeros([batch_size, self.config.beam_size], tf.int32)
  end_top_log_probs = tf.zeros(
      [batch_size, self.config.beam_size, self.config.beam_size])
  end_top_index = tf.zeros(
      [batch_size, self.config.beam_size, self.config.beam_size], tf.int32)
  if self.config.joint_prediction:
    # 1000 * (mask - 1) is 0 where the mask is 1 and -1000 where it is 0, so
    # masked-out positions get a large negative logit.
    start_logits += 1000.0 * (answer_mask - 1)
    start_log_probs = tf.nn.log_softmax(start_logits)
    start_top_log_probs, start_top_index = tf.nn.top_k(
        start_log_probs, k=self.config.beam_size)

    if not is_training:
      # Condition the end prediction on each of the top-k start positions.
      # batch, beam, length, hidden
      end_features = tf.tile(tf.expand_dims(final_hidden, 1),
                             [1, self.config.beam_size, 1, 1])
      # batch, beam, length
      start_index = tf.one_hot(start_top_index,
                               depth=seq_length, axis=-1, dtype=tf.float32)
      # batch, beam, hidden
      start_features = tf.reduce_sum(
          tf.expand_dims(final_hidden, 1) *
          tf.expand_dims(start_index, -1), axis=-2)
      # batch, beam, length, hidden
      start_features = tf.tile(tf.expand_dims(start_features, 2),
                               [1, 1, seq_length, 1])
    else:
      # Condition on the gold start position: the one-hot product summed over
      # axis 1 selects its hidden vector, which is then tiled over seq_length.
      start_index = tf.one_hot(
          features[self.name + "_start_positions"], depth=seq_length,
          axis=-1, dtype=tf.float32)
      start_features = tf.reduce_sum(
          tf.expand_dims(start_index, -1) * final_hidden, axis=1)
      start_features = tf.tile(tf.expand_dims(start_features, 1),
                               [1, seq_length, 1])
      end_features = final_hidden

    final_repr = tf.concat([start_features, end_features], -1)
    final_repr = tf.layers.dense(final_repr, 512, activation=modeling.gelu,
                                 name="qa_hidden")
    # batch, beam, length (batch, length when training)
    end_logits = tf.squeeze(tf.layers.dense(final_repr, 1), -1,
                            name="qa_logits")
    if is_training:
      end_logits += 1000.0 * (answer_mask - 1)
    else:
      # Add a beam axis to the mask bias so it broadcasts over the beam.
      end_logits += tf.expand_dims(1000.0 * (answer_mask - 1), 1)

    if not is_training:
      end_log_probs = tf.nn.log_softmax(end_logits)
      end_top_log_probs, end_top_index = tf.nn.top_k(
          end_log_probs, k=self.config.beam_size)
      # The beam results are carried by end_top_*; end_logits is replaced by
      # zeros of shape [batch, length], which is what the end loss below sees.
      end_logits = tf.zeros([batch_size, seq_length])
  else:
    # Independent start/end prediction: a second dense layer scores end
    # positions and both logits get the same mask bias.
    end_logits = tf.squeeze(tf.layers.dense(final_hidden, 1), -1)
    start_logits += 1000.0 * (answer_mask - 1)
    end_logits += 1000.0 * (answer_mask - 1)

  def compute_loss(logits, positions):
    """Negative log-probability of `positions` under softmax(logits)."""
    one_hot_positions = tf.one_hot(
        positions, depth=seq_length, dtype=tf.float32)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    loss = -tf.reduce_sum(one_hot_positions * log_probs, axis=-1)
    return loss

  start_positions = features[self.name + "_start_positions"]
  end_positions = features[self.name + "_end_positions"]

  start_loss = compute_loss(start_logits, start_positions)
  end_loss = compute_loss(end_logits, end_positions)

  losses = (start_loss + end_loss) / 2.0

  # Default answerable logit is zeros; replaced when the classifier is on.
  answerable_logit = tf.zeros([batch_size])
  if self.config.answerable_classifier:
    # Hidden vector at sequence index 0.
    final_repr = final_hidden[:, 0]
    if self.config.answerable_uses_start_logits:
      # Add the start-probability-weighted sum of hidden states as an extra
      # feature, then project through a 512-unit GELU layer.
      start_p = tf.nn.softmax(start_logits)
      start_feature = tf.reduce_sum(
          tf.expand_dims(start_p, -1) * final_hidden, axis=1)
      final_repr = tf.concat([final_repr, start_feature], -1)
      final_repr = tf.layers.dense(final_repr, 512,
                                   activation=modeling.gelu)
    answerable_logit = tf.squeeze(tf.layers.dense(final_repr, 1), -1)
    answerable_loss = tf.nn.sigmoid_cross_entropy_with_logits(
        labels=tf.cast(features[self.name + "_is_impossible"], tf.float32),
        logits=answerable_logit)
    losses += answerable_loss * self.config.answerable_weight

  return losses, dict(
      loss=losses,
      start_logits=start_logits,
      end_logits=end_logits,
      answerable_logit=answerable_logit,
      start_positions=features[self.name + "_start_positions"],
      end_positions=features[self.name + "_end_positions"],
      start_top_log_probs=start_top_log_probs,
      start_top_index=start_top_index,
      end_top_log_probs=end_top_log_probs,
      end_top_index=end_top_index,
      eid=features[self.name + "_eid"],
  )