def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
  """Assembles the estimator spec for the detection model.

  Runs `model` on `features` and, depending on `mode`, returns either the raw
  per-level predictions or a TPUEstimatorSpec carrying the loss, the train op,
  the evaluation metrics and the scaffold used to restore variables.

  Args:
    features: the input image tensor with shape [batch_size, height, width, 3].
      The height and width are fixed and equal.
    labels: dictionary of input labels. It includes class targets and box
      targets which are dense label maps, generated by the get_input_fn
      function in data/dataloader.py.
    mode: the TPUEstimator mode: TRAIN, EVAL or PREDICT.
    params: dictionary of model hyperparameters. The default settings are in
      the default_hparams function in this file.
    model: callable returning the class logits and box regression outputs.
    variable_filter_fn: optional function that takes the trainable variables
      and returns the filtered list of variables to optimize.

  Returns:
    An EstimatorSpec with the predictions in PREDICT mode, otherwise the
    TPUEstimatorSpec to run training or evaluation.

  Raises:
    RuntimeError: if both ckpt and backbone_ckpt are set (checked when
      training starts from a checkpoint).
  """
  mode_keys = tf.estimator.ModeKeys
  training = mode == mode_keys.TRAIN
  evaluating = mode == mode_keys.EVAL

  if params['data_format'] == 'channels_first':
    features = tf.transpose(features, [0, 3, 1, 2])

  # Build the network; under bfloat16 the outputs are cast back to float32.
  if params['use_bfloat16']:
    with tf.tpu.bfloat16_scope():
      cls_outputs, box_outputs = model(
          features, config=hparams_config.Config(params))
      levels = cls_outputs.keys()
      for lvl in levels:
        cls_outputs[lvl] = tf.cast(cls_outputs[lvl], tf.float32)
        box_outputs[lvl] = tf.cast(box_outputs[lvl], tf.float32)
  else:
    cls_outputs, box_outputs = model(
        features, config=hparams_config.Config(params))
    levels = cls_outputs.keys()

  # Prediction needs neither labels nor losses, so it returns early.
  if mode == mode_keys.PREDICT:
    predictions = {'image': features}
    for lvl in levels:
      predictions.update({
          'cls_outputs_%d' % lvl: cls_outputs[lvl],
          'box_outputs_%d' % lvl: box_outputs[lvl],
      })
    return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

  # Learning rate and losses.
  update_learning_rate_schedule_parameters(params)
  global_step = tf.train.get_or_create_global_step()
  learning_rate = learning_rate_schedule(params, global_step)

  # Only total_loss is optimized; cls_loss and box_loss are kept for logging.
  det_loss, cls_loss, box_loss = detection_loss(cls_outputs, box_outputs,
                                                labels, params)
  l2loss = reg_l2_loss(params['weight_decay'])
  total_loss = det_loss + l2loss

  if training:
    for tag, value in (
        ('lrn_rate', learning_rate),
        ('trainloss/cls_loss', cls_loss),
        ('trainloss/box_loss', box_loss),
        ('trainloss/det_loss', det_loss),
        ('trainloss/l2_loss', l2loss),
        ('trainloss/loss', total_loss),
    ):
      utils.scalar(tag, value)

  moving_average_decay = params['moving_average_decay']
  if moving_average_decay:
    ema = tf.train.ExponentialMovingAverage(
        decay=moving_average_decay, num_updates=global_step)
    ema_vars = utils.get_ema_vars()

  train_op = None
  if training:
    optimizer = tf.train.MomentumOptimizer(
        learning_rate, momentum=params['momentum'])
    if params['use_tpu']:
      optimizer = tf.tpu.CrossShardOptimizer(optimizer)

    # Batch norm requires update_ops to be added as a train_op dependency.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    var_list = tf.trainable_variables()
    if variable_filter_fn:
      var_list = variable_filter_fn(var_list)

    if params.get('clip_gradients_norm', 0) > 0:
      logging.info('clip gradients norm by %f', params['clip_gradients_norm'])
      grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
      with tf.name_scope('clip'):
        raw_grads = [pair[0] for pair in grads_and_vars]
        grad_vars = [pair[1] for pair in grads_and_vars]
        clipped_grads, gnorm = tf.clip_by_global_norm(
            raw_grads, params['clip_gradients_norm'])
        utils.scalar('gnorm', gnorm)
        grads_and_vars = list(zip(clipped_grads, grad_vars))

      with tf.control_dependencies(update_ops):
        train_op = optimizer.apply_gradients(grads_and_vars, global_step)
    else:
      with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(
            total_loss, global_step, var_list=var_list)

    if moving_average_decay:
      # The EMA update runs after the optimizer step.
      with tf.control_dependencies([train_op]):
        train_op = ema.apply(ema_vars)

  eval_metrics = None
  if evaluating:

    def metric_fn(**kwargs):
      """Returns a dictionary that has the evaluation metrics."""
      batch_size = (
          params['batch_size'] * params['num_shards']
          if params['use_tpu'] else params['batch_size'])
      eval_anchors = anchors.Anchors(params['min_level'], params['max_level'],
                                     params['num_scales'],
                                     params['aspect_ratios'],
                                     params['anchor_scale'],
                                     params['image_size'])
      anchor_labeler = anchors.AnchorLabeler(eval_anchors,
                                             params['num_classes'])
      mean_cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
      mean_box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])

      if params.get('testdev_dir', None):
        logging.info('Eval testdev_dir %s', params['testdev_dir'])
        coco_metrics = coco_metric_fn(
            batch_size,
            anchor_labeler,
            params['val_json_file'],
            testdev_dir=params['testdev_dir'],
            disable_pyfun=params.get('disable_pyfun', None),
            **kwargs)
      else:
        logging.info('Eval val with groudtruths %s.', params['val_json_file'])
        coco_metrics = coco_metric_fn(batch_size, anchor_labeler,
                                      params['val_json_file'], **kwargs)

      # Losses first, then whatever the COCO evaluation reports.
      output_metrics = {'cls_loss': mean_cls_loss, 'box_loss': mean_box_loss}
      output_metrics.update(coco_metrics)
      return output_metrics

    def _repeat_over_batch(loss):
      """Tiles `loss` into a [batch_size, 1] tensor for metric_fn."""
      tiled = tf.tile(tf.expand_dims(loss, 0), [params['batch_size']])
      return tf.reshape(tiled, [params['batch_size'], 1])

    cls_loss_repeat = _repeat_over_batch(cls_loss)
    box_loss_repeat = _repeat_over_batch(box_loss)
    metric_fn_inputs = {
        'cls_loss_repeat': cls_loss_repeat,
        'box_loss_repeat': box_loss_repeat,
        'source_ids': labels['source_ids'],
        'groundtruth_data': labels['groundtruth_data'],
        'image_scales': labels['image_scales'],
    }
    add_metric_fn_inputs(params, cls_outputs, box_outputs, metric_fn_inputs)
    eval_metrics = (metric_fn, metric_fn_inputs)

  checkpoint = params.get('ckpt') or params.get('backbone_ckpt')

  if checkpoint and training:
    # Initialize the model from an EfficientDet or backbone checkpoint.
    if params.get('ckpt') and params.get('backbone_ckpt'):
      raise RuntimeError(
          '--backbone_ckpt and --checkpoint are mutually exclusive')

    if params.get('backbone_ckpt'):
      var_scope = params['backbone_name'] + '/'
      # The backbone name doubles as the default checkpoint scope.
      if params['ckpt_var_scope'] is None:
        ckpt_scope = params['backbone_name'] + '/'
      else:
        ckpt_scope = params['ckpt_var_scope'] + '/'
    else:
      # Load every var in the given checkpoint.
      var_scope = ckpt_scope = '/'

    def scaffold_fn():
      """Loads pretrained model through scaffold function."""
      logging.info('restore variables from %s', checkpoint)
      var_map = utils.get_ckpt_var_map(
          ckpt_path=checkpoint,
          ckpt_scope=ckpt_scope,
          var_scope=var_scope,
          var_exclude_expr=params.get('var_exclude_expr', None))
      tf.train.init_from_checkpoint(checkpoint, var_map)
      return tf.train.Scaffold()

  elif evaluating and moving_average_decay:

    def scaffold_fn():
      """Load moving average variables for eval."""
      logging.info('Load EMA vars with ema_decay=%f', moving_average_decay)
      restore_vars_dict = ema.variables_to_restore(ema_vars)
      saver = tf.train.Saver(restore_vars_dict)
      return tf.train.Scaffold(saver=saver)

  else:
    scaffold_fn = None

  return tf.estimator.tpu.TPUEstimatorSpec(
      mode=mode,
      loss=total_loss,
      train_op=train_op,
      eval_metrics=eval_metrics,
      host_call=utils.get_tpu_host_call(global_step, params),
      scaffold_fn=scaffold_fn)
def create_graph(self):
    """Builds the TF1 graph: placeholders, model, losses and train ops.

    Reads the configuration from attributes on `self` (BATCH_SIZE,
    FEATURE_SIZE, EMB_DIM, MAX_DEN, MAX_SEQ_LEN, MIDDLE_FEATURE_SIZE,
    STATE_SIZE, L2_NORM, LR, ANLP_LR, GRAD_CLIP, add_time_feature,
    DNN_MODEL) and stores the resulting graph nodes back on `self`:

    * placeholders: tf_x, tf_y, tf_bid_len, tf_market_price,
      tf_control_parameter;
    * intermediate tensors: tf_rnn_len, preds, mp_para, rate_result,
      final_survival_rate, predict;
    * losses and metrics: cost, anlp_node, com_cost, accuracy;
    * training ops: train_op, anlp_train_op, com_train_op (each is also
      added to a graph collection of the same name).

    Returns:
        None. The graph is built in the default TensorFlow graph.
    """
    BATCH_SIZE = self.BATCH_SIZE

    # --- Placeholders -----------------------------------------------------
    self.tf_x = tf.placeholder(
        tf.int32, [BATCH_SIZE, self.FEATURE_SIZE], name="tf_x")
    self.tf_y = tf.placeholder(tf.float32, [BATCH_SIZE, 2], name="tf_y")
    self.tf_bid_len = tf.placeholder(tf.int32, [BATCH_SIZE], name="tf_len")
    self.tf_market_price = tf.placeholder(
        tf.int32, [BATCH_SIZE], name="tf_market_price")
    # Two scalars weighting `cost` and `anlp_node` in the combined cost.
    self.tf_control_parameter = tf.placeholder(
        tf.float32, [2], name="tf_control_parameter")
    alpha = self.tf_control_parameter[0]
    beta = self.tf_control_parameter[1]
    self.tf_rnn_len = tf.maximum(self.tf_bid_len, self.tf_market_price) + 2

    # --- Embedding of the input features ----------------------------------
    embeddings = tf.Variable(self.init_matrix([self.MAX_DEN, self.EMB_DIM]))
    x_emds = tf.nn.embedding_lookup(embeddings, self.tf_x)
    flat_embeddings = tf.reshape(
        x_emds, [BATCH_SIZE, self.FEATURE_SIZE * self.EMB_DIM])

    # --- Per-time-step model input ----------------------------------------
    if self.add_time_feature:
        middle_layer = tf.layers.dense(
            flat_embeddings, self.MIDDLE_FEATURE_SIZE,
            tf.nn.relu)  # hidden layer

        def add_time(x):
            """Repeats `x` for every time step and appends the step index."""
            y = tf.reshape(
                tf.tile(x, [self.MAX_SEQ_LEN]),
                [self.MAX_SEQ_LEN, self.MIDDLE_FEATURE_SIZE])
            t = tf.reshape(tf.range(self.MAX_SEQ_LEN), [self.MAX_SEQ_LEN, 1])
            z = tf.concat([y, tf.cast(t, dtype=tf.float32)], 1)
            return z

        input_x = tf.map_fn(add_time, middle_layer)
    else:
        # Fix: input_x used to stay None on this path, which made both the
        # DNN branch (input_x[i]) and tf.nn.dynamic_rnn fail. Fall back to
        # repeating the flat embedding at every time step.
        input_x = tf.reshape(
            tf.tile(flat_embeddings, [1, self.MAX_SEQ_LEN]),
            [BATCH_SIZE, self.MAX_SEQ_LEN,
             self.FEATURE_SIZE * self.EMB_DIM])

    # --- Per-time-step prediction ------------------------------------------
    if self.DNN_MODEL:
        # One dense sigmoid layer per batch element (separate variables each).
        outlist = [
            tf.layers.dense(input_x[i], 1, tf.nn.sigmoid)
            for i in range(self.BATCH_SIZE)
        ]
        preds = tf.reshape(
            tf.stack(outlist, axis=0),
            [self.BATCH_SIZE, self.MAX_SEQ_LEN],
            name="preds")
    else:
        rnn_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=self.STATE_SIZE)
        outputs, _ = tf.nn.dynamic_rnn(
            rnn_cell,  # cell you have chosen
            input_x,  # input
            initial_state=None,  # the initial hidden state
            dtype=tf.float32,  # must given if set initial_state = None
            time_major=False,  # False: (batch, time step, input)
            sequence_length=self.tf_rnn_len)
        new_output = tf.reshape(
            outputs, [self.MAX_SEQ_LEN * BATCH_SIZE, self.STATE_SIZE])

        with tf.variable_scope('softmax'):
            W = tf.get_variable('W', [self.STATE_SIZE, 1])
            b = tf.get_variable(
                'b', [1], initializer=tf.constant_initializer(0))

        logits = tf.matmul(new_output, W) + b
        preds = tf.transpose(
            tf.nn.sigmoid(logits, name="preds"), name="preds")[0]

    self.preds = preds
    survival_rate = preds
    batch_rnn_survival_rate = tf.reshape(
        survival_rate, [BATCH_SIZE, self.MAX_SEQ_LEN])

    # Append bid length and market price as two extra columns so that
    # reduce_mul receives everything it needs in a single row.
    map_parameter = tf.concat([
        batch_rnn_survival_rate,
        tf.cast(tf.reshape(self.tf_bid_len, [BATCH_SIZE, 1]), tf.float32)
    ], 1)
    map_parameter = tf.concat([
        map_parameter,
        tf.cast(tf.reshape(self.tf_market_price, [BATCH_SIZE, 1]), tf.float32)
    ], 1)

    def reduce_mul(x):
        """Returns three prefix products of one row of `map_parameter`.

        The row holds MAX_SEQ_LEN per-step values followed by the bid
        length and the market price; the products run over the first
        bid_len, market_len + 1 and market_len values respectively.
        """
        bid_len = tf.cast(x[self.MAX_SEQ_LEN], dtype=tf.int32)
        market_len = tf.cast(x[self.MAX_SEQ_LEN + 1], dtype=tf.int32)
        survival_rate_last_one = tf.reduce_prod(x[0:bid_len])
        anlp_rate_last_one = tf.reduce_prod(x[0:market_len + 1])
        anlp_rate_last_two = tf.reduce_prod(x[0:market_len])
        ret = tf.stack([
            survival_rate_last_one, anlp_rate_last_one, anlp_rate_last_two
        ])
        return ret

    self.mp_para = map_parameter
    rate_result = tf.map_fn(
        reduce_mul, elems=map_parameter, name="rate_result")
    self.rate_result = rate_result

    # --- ANLP loss ----------------------------------------------------------
    # 1e-20 keeps the log finite when the two products coincide.
    log_minus = tf.log(
        tf.add(
            tf.transpose(rate_result)[2] - tf.transpose(rate_result)[1],
            1e-20))
    self.anlp_node = -tf.reduce_sum(log_minus) / self.BATCH_SIZE
    # Adding 0 only serves to give the node a stable name in the graph.
    self.anlp_node = tf.add(self.anlp_node, 0, name="anlp_node")

    # --- Cross-entropy loss -------------------------------------------------
    self.final_survival_rate = tf.transpose(rate_result)[0]
    final_dead_rate = tf.subtract(
        tf.constant(1.0, dtype=tf.float32), self.final_survival_rate)
    self.predict = tf.transpose(
        tf.stack([self.final_survival_rate, final_dead_rate]),
        name="predict")
    cross_entropy = -tf.reduce_sum(
        self.tf_y * tf.log(tf.clip_by_value(self.predict, 1e-10, 1.0)))

    tvars = tf.trainable_variables()
    lossL2 = tf.add_n([tf.nn.l2_loss(v) for v in tvars]) * self.L2_NORM
    cost = tf.add(cross_entropy, lossL2, name="cost") / self.BATCH_SIZE
    self.cost = tf.add(cost, 0, name="cost")

    # --- Optimizers and train ops -------------------------------------------
    optimizer = tf.train.AdamOptimizer(learning_rate=self.LR, beta2=0.99)
    optimizer_anlp = tf.train.AdamOptimizer(
        learning_rate=self.ANLP_LR, beta2=0.99)

    grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.cost, tvars),
        self.GRAD_CLIP,
    )
    self.train_op = optimizer.apply_gradients(
        zip(grads, tvars), name="train_op")
    tf.add_to_collection('train_op', self.train_op)

    anlp_grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.anlp_node, tvars),
        self.GRAD_CLIP,
    )
    self.anlp_train_op = optimizer_anlp.apply_gradients(
        zip(anlp_grads, tvars), name="anlp_train_op")
    tf.add_to_collection('anlp_train_op', self.anlp_train_op)

    self.com_cost = tf.add(alpha * self.cost, beta * self.anlp_node)
    com_grads, _ = tf.clip_by_global_norm(
        tf.gradients(self.com_cost, tvars),
        self.GRAD_CLIP,
    )
    # NOTE(review): this op reuses the name "train_op" (already taken above),
    # so TensorFlow will uniquify it. Left unchanged because renaming would
    # alter graph node names that external code may load - confirm.
    self.com_train_op = optimizer.apply_gradients(
        zip(com_grads, tvars), name="train_op")
    tf.add_to_collection('com_train_op', self.com_train_op)

    # --- Accuracy -------------------------------------------------------------
    correct_pred = tf.equal(
        tf.argmax(self.predict, 1), tf.argmax(self.tf_y, 1))
    self.accuracy = tf.reduce_mean(
        tf.cast(correct_pred, tf.float32), name="accuracy")
def fix_video_dims_and_concat_on_x_axis(x):
  """Reorders a rank-5 tensor and merges two of its axes into one.

  Axes (1, 3, 4) of `x` are moved to the front, the remaining axes (0, 2)
  are flattened into a single trailing axis, and that merged axis is then
  moved to position 1. `batch_size`, `frame_height` and `frame_channels` are
  taken from the enclosing scope.

  Args:
    x: a rank-5 tensor. Presumably laid out as
      [time, batch, width, height, channels] - TODO confirm against caller.

  Returns:
    A rank-4 tensor of shape [batch_size, -1, frame_height, frame_channels].
  """
  reordered = tf.transpose(x, [1, 3, 4, 0, 2])
  merged = tf.reshape(
      reordered, [batch_size, frame_height, frame_channels, -1])
  return tf.transpose(merged, [0, 3, 1, 2])
def evolved_transformer_decoder(decoder_input,
                                encoder_output,
                                decoder_self_attention_bias,
                                encoder_decoder_attention_bias,
                                hparams,
                                cache=None,
                                decode_loop_step=None,
                                name="decoder",
                                nonpadding=None,
                                save_weights_to=None,
                                make_image_summary=True,
                                losses=None):
  """Evolved Transformer decoder.

  See arxiv.org/abs/1901.11117 for more details.

  Each layer applies, in order: a self-attention branch (optionally summed
  with a first encoder-attention branch), two stacked separable-convolution
  stages, a second self-attention, an optional second encoder attention and a
  two-layer dense block. A final layer_preprocess is applied to the output of
  the last layer.

  Args:
    decoder_input: a Tensor.
    encoder_output: a Tensor, or None to skip both encoder-attention blocks.
    decoder_self_attention_bias: bias Tensor for self-attention (see
      common_attention.attention_bias()).
    encoder_decoder_attention_bias: bias Tensor for encoder-decoder attention
      (see common_attention.attention_bias()).
    hparams: hyperparameters for model.
    cache: dict, containing tensors which are the results of previous layers,
      used for fast decoding. It is indexed by "layer_%d" and then by the
      per-block scope names; the conv-branch entries are updated in place.
    decode_loop_step: An integer, step number of the decoding loop. Only used
      for inference on TPU.
    name: a string, used as the outer variable scope.
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding. This is used to mask out
      padding in convolutional layers. We generally only need this mask for
      "packed" datasets, because for ordinary datasets, no padding is ever
      followed by nonpadding.
    save_weights_to: an optional dictionary to capture attention weights for
      visualization; the weights tensor will be appended there under a string
      key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.
    losses: Not supported; the argument is discarded immediately.

  Returns:
    Decoder output tensor.
  """
  del losses

  num_trainable_top_decoder_layers = hparams.get(
      "num_trainable_top_decoder_layers", -1)  # -1 means train all weights.

  # When only the top layers are trainable, no gradient flows back into the
  # encoder output.
  if num_trainable_top_decoder_layers >= 0:
    encoder_output = tf.stop_gradient(encoder_output)

  attention_dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))

  with tf.variable_scope(name):
    hidden_state = decoder_input

    num_layers = hparams.num_decoder_layers or hparams.num_hidden_layers
    for layer in range(num_layers):
      # Cut the gradient at the input of the first trainable layer, so that
      # all layers below it stay frozen.
      if num_trainable_top_decoder_layers == num_layers - layer:
        hidden_state = tf.stop_gradient(hidden_state)
      layer_name = "layer_%d" % layer
      layer_cache = cache[layer_name] if cache is not None else None
      with tf.variable_scope(layer_name):

        # --- Block 1, left branch: self-attention ---------------------------
        # The head count comes from _capped_double_heads(hparams.num_heads)
        # rather than from hparams.num_heads directly.
        with tf.variable_scope(_SIXTEEN_HEAD_ATTENTION_NAME):
          residual_state = hidden_state
          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)

          attention_cache = layer_cache[
              _SIXTEEN_HEAD_ATTENTION_NAME] if layer_cache is not None else None
          left_state = common_attention.multihead_attention(
              hidden_state,
              None,
              decoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              _capped_double_heads(hparams.num_heads),
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              max_relative_position=hparams.max_relative_position,
              heads_share_relative_embedding=(
                  hparams.heads_share_relative_embedding),
              add_relative_to_values=hparams.add_relative_to_values,
              save_weights_to=save_weights_to,
              cache=attention_cache,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=attention_dropout_broadcast_dims,
              max_length=hparams.get("max_length"),
              decode_loop_step=decode_loop_step,
              vars_3d=hparams.get("attention_variables_3d"),
              activation_dtype=hparams.get("activation_dtype", "float32"),
              weight_dtype=hparams.get("weight_dtype", "float32"))

        # --- Block 1, right branch: first attention over the encoder --------
        if encoder_output is not None:
          with tf.variable_scope(_FIRST_ATTEND_TO_ENCODER_NAME):
            attention_cache = (
                layer_cache[_FIRST_ATTEND_TO_ENCODER_NAME]
                if layer_cache is not None else None)
            # Uses the same preprocessed hidden_state as the left branch.
            right_state = common_attention.multihead_attention(
                hidden_state,
                encoder_output,
                encoder_decoder_attention_bias,
                hparams.attention_key_channels or hparams.hidden_size,
                hparams.attention_value_channels or hparams.hidden_size,
                hparams.hidden_size,
                hparams.num_heads,
                hparams.attention_dropout,
                max_relative_position=hparams.max_relative_position,
                heads_share_relative_embedding=(
                    hparams.heads_share_relative_embedding),
                add_relative_to_values=hparams.add_relative_to_values,
                save_weights_to=save_weights_to,
                cache=attention_cache,
                make_image_summary=make_image_summary,
                dropout_broadcast_dims=attention_dropout_broadcast_dims,
                max_length=hparams.get("max_length"),
                vars_3d=hparams.get("attention_variables_3d"),
                activation_dtype=hparams.get("activation_dtype", "float32"),
                weight_dtype=hparams.get("weight_dtype", "float32"))

            # tf.nn.dropout takes the keep probability here, hence 1 - rate.
            left_state = tf.nn.dropout(left_state,
                                       1 - hparams.layer_prepostprocess_dropout)
            right_state = tf.nn.dropout(
                right_state, 1 - hparams.layer_prepostprocess_dropout)

            # Both branches are added to the residual directly; this path
            # does not go through layer_postprocess.
            hidden_state = residual_state + left_state + right_state

        else:
          hidden_state = common_layers.layer_postprocess(
              residual_state, left_state, hparams)

        # --- Block 2: two stacked separable-convolution stages --------------
        with tf.variable_scope(_CONV_BRANCHES_NAME):
          residual_state = hidden_state
          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)

          if nonpadding is not None:
            # Mask padding from conv layers.
            mask = tf.tile(
                tf.expand_dims(nonpadding, 2), [1, 1, hparams.hidden_size])
            hidden_state *= mask

          # Prepare the inputs of the two first-stage convolutions. Both use
          # "VALID" padding, so the context they need is supplied either by
          # the cache (decoding) or by padding the start of axis 1.
          if layer_cache:
            if decode_loop_step is None:
              # Append the new positions to the cache and keep only the last
              # _DECODER_LEFT_CONV_PADDING + 1 positions along axis 1.
              hidden_state = layer_cache[
                  _CONV_BRANCHES_FIRST_LAYER_NAME] = tf.concat(
                      [
                          layer_cache[_CONV_BRANCHES_FIRST_LAYER_NAME],
                          hidden_state
                      ],
                      axis=1)[:, -1 * _DECODER_LEFT_CONV_PADDING - 1:, :]
              left_state = hidden_state
              # The right branch only needs the trailing part of the window.
              right_state = hidden_state[:, _DECODER_LEFT_CONV_PADDING -
                                         _DECODER_RIGHT_CONV_PADDING:, :]
            else:
              # Inplace update is required for inference on TPU.
              # Inplace_ops only supports inplace_update on the first dimension.
              # NOTE(review): the index below multiplies only
              # decode_loop_step by tf.shape(hidden_state)[1], whereas the
              # second-stage cache further down multiplies
              # (decode_loop_step + padding). The two forms agree only when
              # tf.shape(hidden_state)[1] == 1 - confirm this is intended.
              tmp = tf.transpose(
                  layer_cache[_CONV_BRANCHES_FIRST_LAYER_NAME], perm=[1, 0, 2])
              tmp = tf.expand_dims(tmp, axis=1)
              tmp = inplace_ops.alias_inplace_update(
                  tmp,
                  decode_loop_step * tf.shape(hidden_state)[1] +
                  _DECODER_LEFT_CONV_PADDING,
                  tf.transpose(hidden_state, perm=[1, 0, 2]))
              tmp = tf.squeeze(tmp, axis=1)
              hidden_state = layer_cache[
                  _CONV_BRANCHES_FIRST_LAYER_NAME] = tf.transpose(
                      tmp, perm=[1, 0, 2])

              # Static batch size is required for the tf.slice sizes below.
              batch_size = hidden_state.shape.as_list()[0]
              left_state = tf.slice(hidden_state, [0, decode_loop_step, 0], [
                  batch_size, _DECODER_LEFT_CONV_PADDING + 1,
                  hparams.hidden_size
              ])
              right_state = tf.slice(hidden_state, [
                  0, decode_loop_step + _DECODER_LEFT_CONV_PADDING -
                  _DECODER_RIGHT_CONV_PADDING, 0
              ], [
                  batch_size, _DECODER_RIGHT_CONV_PADDING + 1,
                  hparams.hidden_size
              ])
          else:  # No caching.
            # Pad only the start of axis 1; nothing is appended at the end.
            left_state = tf.pad(
                hidden_state,
                paddings=[[0, 0], [_DECODER_LEFT_CONV_PADDING, 0], [0, 0]])
            right_state = tf.pad(
                hidden_state,
                paddings=[[0, 0], [_DECODER_RIGHT_CONV_PADDING, 0], [0, 0]])

          # Left branch: kernel 11, 2 * hidden_size channels, ReLU.
          left_output_dim = int(hparams.hidden_size * 2)
          separable_conv_11x1 = tf.layers.SeparableConv1D(
              left_output_dim,
              11,
              padding="VALID",
              name="separable_conv11x1",
              activation=tf.nn.relu)
          left_state = separable_conv_11x1.apply(left_state)
          left_state = tf.nn.dropout(left_state,
                                     1 - hparams.layer_prepostprocess_dropout)

          # Right branch: kernel 7, hidden_size / 2 channels, no activation.
          right_output_dim = int(hparams.hidden_size / 2)
          separable_conv_7x1_1 = tf.layers.SeparableConv1D(
              right_output_dim, 7, padding="VALID", name="separable_conv_7x1_1")
          right_state = separable_conv_7x1_1.apply(right_state)
          right_state = tf.nn.dropout(right_state,
                                      1 - hparams.layer_prepostprocess_dropout)
          # Zero-pad the channel axis so the branches can be summed.
          right_state = tf.pad(
              right_state,
              [[0, 0], [0, 0], [0, left_output_dim - right_output_dim]],
              constant_values=0)

          hidden_state = left_state + right_state

          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)

          if nonpadding is not None:
            # Mask padding from conv layers.
            mask = tf.tile(
                tf.expand_dims(nonpadding, 2), [1, 1, hparams.hidden_size * 2])
            hidden_state *= mask

          # Same context handling as above, for the second-stage convolution.
          if layer_cache:
            if decode_loop_step is None:
              hidden_state = layer_cache[
                  _CONV_BRANCHES_SECOND_LAYER_NAME] = tf.concat(
                      [
                          layer_cache[_CONV_BRANCHES_SECOND_LAYER_NAME],
                          hidden_state
                      ],
                      axis=1)[:, -1 * _DECODER_FINAL_CONV_PADDING - 1:, :]

            else:
              # Inplace update is required for inference on TPU.
              # Inplace_ops only supports inplace_update on the first dimension.
              tmp = tf.transpose(
                  layer_cache[_CONV_BRANCHES_SECOND_LAYER_NAME], perm=[1, 0, 2])
              tmp = tf.expand_dims(tmp, axis=1)
              tmp = inplace_ops.alias_inplace_update(
                  tmp, (decode_loop_step + _DECODER_FINAL_CONV_PADDING) *
                  tf.shape(hidden_state)[1],
                  tf.transpose(hidden_state, perm=[1, 0, 2]))
              tmp = tf.squeeze(tmp, axis=1)
              hidden_state = layer_cache[
                  _CONV_BRANCHES_SECOND_LAYER_NAME] = tf.transpose(
                      tmp, perm=[1, 0, 2])

              batch_size = hidden_state.shape.as_list()[0]
              hidden_state = tf.slice(hidden_state, [0, decode_loop_step, 0], [
                  batch_size, _DECODER_FINAL_CONV_PADDING + 1,
                  hparams.hidden_size * 2
              ])
          else:
            hidden_state = tf.pad(
                hidden_state,
                paddings=[[0, 0], [_DECODER_FINAL_CONV_PADDING, 0], [0, 0]])

          # Second stage: kernel 7, projecting back to hidden_size channels.
          separable_conv_7x1_2 = tf.layers.SeparableConv1D(
              hparams.hidden_size,
              7,
              padding="VALID",
              name="separable_conv_7x1_2")
          hidden_state = separable_conv_7x1_2.apply(hidden_state)

          hidden_state = common_layers.layer_postprocess(
              residual_state, hidden_state, hparams)

        # --- Block 3: second self-attention (hparams.num_heads heads) -------
        with tf.variable_scope(_VANILLA_ATTENTION_NAME):
          residual_state = hidden_state
          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)

          attention_cache = layer_cache[
              _VANILLA_ATTENTION_NAME] if layer_cache is not None else None
          hidden_state = common_attention.multihead_attention(
              hidden_state,
              None,
              decoder_self_attention_bias,
              hparams.attention_key_channels or hparams.hidden_size,
              hparams.attention_value_channels or hparams.hidden_size,
              hparams.hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              max_relative_position=hparams.max_relative_position,
              heads_share_relative_embedding=(
                  hparams.heads_share_relative_embedding),
              add_relative_to_values=hparams.add_relative_to_values,
              save_weights_to=save_weights_to,
              cache=attention_cache,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=attention_dropout_broadcast_dims,
              max_length=hparams.get("max_length"),
              decode_loop_step=decode_loop_step,
              vars_3d=hparams.get("attention_variables_3d"),
              activation_dtype=hparams.get("activation_dtype", "float32"),
              weight_dtype=hparams.get("weight_dtype", "float32"))
          hidden_state = common_layers.layer_postprocess(
              residual_state, hidden_state, hparams)

        # --- Block 4: second attention over the encoder ---------------------
        if encoder_output is not None:
          with tf.variable_scope(_SECOND_ATTEND_TO_ENCODER_NAME):
            residual_state = hidden_state
            hidden_state = common_layers.layer_preprocess(hidden_state, hparams)

            attention_cache = (
                layer_cache[_SECOND_ATTEND_TO_ENCODER_NAME]
                if layer_cache is not None else None)
            hidden_state = common_attention.multihead_attention(
                hidden_state,
                encoder_output,
                encoder_decoder_attention_bias,
                hparams.attention_key_channels or hparams.hidden_size,
                hparams.attention_value_channels or hparams.hidden_size,
                hparams.hidden_size,
                hparams.num_heads,
                hparams.attention_dropout,
                max_relative_position=hparams.max_relative_position,
                heads_share_relative_embedding=(
                    hparams.heads_share_relative_embedding),
                add_relative_to_values=hparams.add_relative_to_values,
                save_weights_to=save_weights_to,
                cache=attention_cache,
                make_image_summary=make_image_summary,
                dropout_broadcast_dims=attention_dropout_broadcast_dims,
                max_length=hparams.get("max_length"),
                vars_3d=hparams.get("attention_variables_3d"),
                activation_dtype=hparams.get("activation_dtype", "float32"),
                weight_dtype=hparams.get("weight_dtype", "float32"))
            hidden_state = common_layers.layer_postprocess(
                residual_state, hidden_state, hparams)

        # --- Block 5: dense block (4 * hidden_size with swish, then back) ---
        with tf.variable_scope("dense_layers"):
          residual_state = hidden_state
          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)

          hidden_state = tf.layers.dense(
              hidden_state,
              int(hparams.hidden_size * 4),
              activation=tf.nn.swish)
          hidden_state = tf.nn.dropout(hidden_state,
                                       1 - hparams.layer_prepostprocess_dropout)

          hidden_state = common_layers.layer_preprocess(hidden_state, hparams)

          hidden_state = tf.layers.dense(hidden_state, hparams.hidden_size)
          hidden_state = common_layers.layer_postprocess(
              residual_state, hidden_state, hparams)

    decoder_output = common_layers.layer_preprocess(hidden_state, hparams)
    # With zero trainable layers the whole decoder output is frozen.
    if num_trainable_top_decoder_layers == 0:
      decoder_output = tf.stop_gradient(decoder_output)
    return decoder_output
def discrete_bottleneck(self, x):
  """Discretization bottleneck for latent variables.

  Looks up the entries of `self.means` selected for the sliced input,
  accumulates the bottleneck loss (updating the means through moving
  averages when `hparams.ema` is set, otherwise through the gradient-based
  `q_loss`), and derives both the integer code and the dense tensor handed
  to the decoder.

  Args:
    x: Input to the discretization bottleneck.

  Returns:
    A dict with the keys "dense" (embedding to pass to the decoder),
    "discrete" (the discrete latent), "loss" and "embed" (the embedding
    function).
  """
  hp = self.hparams
  sliced = self.slice_hidden(x)
  loss = 0
  one_hot, nearest_means, q_loss, e_loss = self.embedding_lookup(
      sliced, self.means)

  if hp.ema:
    tf.logging.info("Using EMA with beta = {}".format(hp.beta))

    # Moving average of how often each entry of each block is selected.
    block_counts = tf.reduce_sum(
        tf.reshape(one_hot, shape=[-1, hp.num_blocks, hp.block_v_size]),
        axis=0)
    ema_count = moving_averages.assign_moving_average(
        self.ema_count, block_counts, hp.decay, zero_debias=False)

    # Moving average of the sum of the inputs assigned to each entry.
    dw = tf.matmul(
        tf.transpose(one_hot, perm=[1, 2, 0]),
        tf.transpose(sliced, perm=[1, 0, 2]))
    ema_means = moving_averages.assign_moving_average(
        self.ema_means, dw, hp.decay, zero_debias=False)

    # Smooth the counts with epsilon before dividing the sums by them.
    total = tf.reduce_sum(ema_count, axis=-1, keep_dims=True)
    ema_count = ((ema_count + hp.epsilon) /
                 (total + 2**hp.z_size * hp.epsilon) * total)
    ema_means = ema_means / tf.expand_dims(ema_count, axis=-1)

    # The loss term is only produced once the means have been assigned.
    with tf.control_dependencies([e_loss]):
      update_means = tf.assign(self.means, ema_means)
      with tf.control_dependencies([update_means]):
        loss = loss + hp.beta * e_loss
  else:
    # Use a gradient based loss for learning the cluster centers
    loss = loss + (q_loss + hp.beta * e_loss)

  # Index of the selected entry, re-encoded through its bit representation.
  means_idx = tf.argmax(one_hot, axis=-1)
  bits_per_block = int(hp.z_size // hp.num_blocks)
  means_bits = self.int_to_bit(means_idx, num_bits=bits_per_block, base=2)
  x_discrete = self.bit_to_int(
      tf.to_int32(means_bits), num_bits=hp.z_size, base=2)

  # Bring both results back to the shape of the input.
  shape_x = common_layers.shape_list(x)
  x_discrete = tf.reshape(x_discrete, shape_x[:-1])
  nearest_means = tf.reshape(nearest_means, shape=shape_x)

  # Forward pass uses the means; stop_gradient makes the backward pass
  # treat this step as the identity on x.
  h1 = x + tf.stop_gradient(nearest_means - x)
  h2 = tf.layers.dense(tf.nn.relu(h1), hp.filter_size, name="vch2")
  res = tf.layers.dense(tf.nn.relu(h2), hp.hidden_size, name="vcfin")

  return {
      "dense": res,
      "discrete": x_discrete,
      "loss": loss,
      "embed": partial(self.embed),
  }
def resample_feature_map(feat,
                         name,
                         target_height,
                         target_width,
                         target_num_channels,
                         apply_bn=False,
                         is_training=None,
                         conv_after_downsample=False,
                         use_native_resize_op=False,
                         pooling_type=None,
                         use_tpu=False,
                         data_format='channels_last'):
  """Resamples a feature map to the target spatial size and channel count.

  A feature map larger than the target in both dimensions is pooled down; one
  that is no larger in both dimensions is upsampled with nearest neighbour
  (or left at its size when it already matches). A 1x1 convolution adjusts
  the number of channels when it differs from the target.

  Args:
    feat: input feature map; its static height, width and channel count must
      all be known.
    name: suffix of the 'resample_{}' variable scope.
    target_height: output height.
    target_width: output width.
    target_num_channels: output number of channels.
    apply_bn: whether batch norm follows the 1x1 convolution.
    is_training: training flag for batch norm; required when apply_bn is set.
    conv_after_downsample: when downsampling, apply the 1x1 convolution after
      pooling instead of before it.
    use_native_resize_op: always upsample with
      tf.image.resize_nearest_neighbor.
    pooling_type: 'max', 'avg' or None (None means max pooling).
    use_tpu: forwarded to utils.batch_norm_act.
    data_format: 'channels_first' or 'channels_last'.

  Returns:
    The resampled feature map.

  Raises:
    ValueError: if a needed static dimension is None, if apply_bn is set
      without is_training, if pooling_type is unknown, or if the input is
      larger than the target in one dimension only.
  """
  channels_first = data_format == 'channels_first'
  static_shape = feat.get_shape().as_list()
  if channels_first:
    _, num_channels, height, width = static_shape
  else:
    _, height, width, num_channels = static_shape

  if any(dim is None for dim in (height, width, num_channels)):
    raise ValueError(
        'shape[1] or shape[2] or shape[3] of feat is None (shape:{}).'.format(
            feat.shape))
  if apply_bn and is_training is None:
    raise ValueError('If BN is applied, need to provide is_training')

  def _maybe_apply_1x1(tensor):
    """Apply 1x1 conv to change layer width if necessary."""
    if num_channels == target_num_channels:
      return tensor
    tensor = tf.layers.conv2d(
        tensor,
        filters=target_num_channels,
        kernel_size=(1, 1),
        padding='same',
        data_format=data_format)
    if apply_bn:
      tensor = utils.batch_norm_act(
          tensor,
          is_training_bn=is_training,
          act_type=None,
          data_format=data_format,
          use_tpu=use_tpu,
          name='bn')
    return tensor

  with tf.variable_scope('resample_{}'.format(name)):
    if height > target_height and width > target_width:
      # Downsampling. With conv_after_downsample the 1x1 convolution runs on
      # the smaller, pooled map for efficiency.
      if not conv_after_downsample:
        feat = _maybe_apply_1x1(feat)

      strides = [
          int((height - 1) // target_height + 1),
          int((width - 1) // target_width + 1),
      ]
      if pooling_type in ('max', None):
        # Use max pooling in default.
        pool_fn = tf.layers.max_pooling2d
      elif pooling_type == 'avg':
        pool_fn = tf.layers.average_pooling2d
      else:
        raise ValueError('Unknown pooling type: {}'.format(pooling_type))
      feat = pool_fn(
          inputs=feat,
          pool_size=[strides[0] + 1, strides[1] + 1],
          strides=strides,
          padding='SAME',
          data_format=data_format)

      if conv_after_downsample:
        feat = _maybe_apply_1x1(feat)

    elif height <= target_height and width <= target_width:
      feat = _maybe_apply_1x1(feat)
      if height < target_height or width < target_width:
        height_scale = target_height // height
        width_scale = target_width // width
        exact_multiple = (
            target_height % height == 0 and target_width % width == 0)
        if use_native_resize_op or not exact_multiple:
          # The native resize op is applied in channels-last layout.
          if channels_first:
            feat = tf.transpose(feat, [0, 2, 3, 1])
          feat = tf.image.resize_nearest_neighbor(
              feat, [target_height, target_width])
          if channels_first:
            feat = tf.transpose(feat, [0, 3, 1, 2])
        else:
          feat = nearest_upsampling(
              feat,
              height_scale=height_scale,
              width_scale=width_scale,
              data_format=data_format)

    else:
      raise ValueError(
          'Incompatible target feature map size: target_height: {},'
          'target_width: {}'.format(target_height, target_width))

  return feat
def update_placeholder_shape_and_add_transpose(node: Node):
    """
    Switch 4D placeholders of the node's sub-graph from NHWC to NCHW and insert
    the transposes that restore NHWC for the consumers.

    For every name in ``node['input_nodes_names']`` the placeholder shape is read
    from ``node['pbs']``. A 4D shape is permuted to NCHW in place, a transpose
    node named ``<placeholder>_transpose`` is added to the sub-graph and the input
    references are rewired to it. The shape that is finally used (permuted or
    unchanged) is appended to ``node['real_input_dims']``.

    :param node: node to operate on.
    :return: None
    """
    try:
        import tensorflow.compat.v1 as tf_v1
        # TensorFlow 2 environment: turn eager execution off right away
        tf_v1.disable_eager_execution()
    except ImportError:
        import tensorflow as tf_v1
    from openvino.tools.mo.front.common.layout import convert_shape, nhwc_to_nchw_permute, nchw_to_nhwc_permute
    from openvino.tools.mo.front.tf.extractors.utils import tf_tensor_shape
    from openvino.tools.mo.front.tf.partial_infer.tf import add_node_def_to_subgraph, update_input_in_pbs

    tf_v1.reset_default_graph()

    # (old input name, new input name) pairs applied once all nodes are added
    rewires = []

    # constants holding the transpose permutations
    to_nhwc_const = tf_v1.constant(nchw_to_nhwc_permute, dtype=tf_v1.int32, name=nchw_to_nhwc_constant_name)
    to_nchw_const = tf_v1.constant(nhwc_to_nchw_permute, dtype=tf_v1.int32, name=nhwc_to_nchw_constant_name)

    input_names = node['input_nodes_names']
    for input_name in input_names:
        # stand-in input for the transpose; it must be unique per placeholder
        # and is swapped for the real placeholder in the rewiring step below
        dummy = tf_v1.constant(value=[[[[1]]]], dtype=tf_v1.float32, name='random_dummy_name_' + input_name)

        placeholder = node['pbs'][input_name]
        current_shape = tf_tensor_shape(placeholder.attr['shape'].shape)

        # TODO think about better check that transpose is required
        if len(current_shape) != 4:
            node['real_input_dims'].append(current_shape)
            continue

        nchw_shape = convert_shape(current_shape, nhwc_to_nchw_permute)
        for axis in range(len(current_shape)):
            placeholder.attr['shape'].shape.dim[axis].size = nchw_shape[axis]

        transpose_name = placeholder.name + '_transpose'
        to_nhwc = tf_v1.transpose(dummy, to_nhwc_const, transpose_name)  # NCHW -> NHWC

        # the transpose goes into the GraphDef right after the placeholders
        add_node_def_to_subgraph(node, to_nhwc.op.node_def, transpose_name, len(input_names))
        rewires.extend([
            (placeholder.name, transpose_name),
            (dummy.name, placeholder.name),
        ])
        node['real_input_dims'].append(nchw_shape)

    add_node_def_to_subgraph(node, to_nhwc_const.op.node_def)
    add_node_def_to_subgraph(node, to_nchw_const.op.node_def)

    # point the original input names at their transposed replacements
    for old_name, new_name in rewires:
        update_input_in_pbs(node, old_name, new_name)
def random_mask2(shape, k):
    """Draw a random float mask marking the k largest samples of each column.

    Gaussian noise of the given 2-D `shape` is sampled and, for every column,
    entries greater than or equal to that column's k-th largest value are set
    to 1.0 and all others to 0.0.

    Args:
      shape: shape of the mask; the indexing below requires two dimensions.
      k: number of top entries to keep per column.

    Returns:
      A float tensor of shape `shape` holding zeros and ones.
    """
    # Work on the transpose so that top_k, which ranks along the last axis,
    # ranks the entries of each original column.
    noise_t = tf.transpose(tf.random_normal(shape=shape))
    top_values = tf.nn.top_k(noise_t, k)[0]
    threshold = top_values[:, k - 1]
    keep = tf.greater_equal(noise_t, tf.expand_dims(threshold, 1))
    return tf.transpose(tf.to_float(keep))
def create_de_model(bert_config, is_training, input_ids_1, input_mask_1,
                    segment_ids_1, input_ids_2, input_masks_2, segment_ids_2,
                    num_candidates, labels, use_one_hot_embeddings):
    """Creates a ranking model using cosine and dual encoder representations.

    Queries (inputs `*_1`) and passages (inputs `*_2`) are encoded separately
    with `encode_block`. Two scores are derived from the encodings:

    * `logits`: each query scored against its own candidates only.
    * an in-batch similarity matrix (every query against every passage, and
      against the passages of all shards on TPU during training) that feeds
      the softmax cross-entropy loss.

    Both scores take, per query vector, the maximum masked dot product over
    passage vectors and sum these maxima over query vectors.

    Args:
      bert_config: forwarded to `encode_block`.
      is_training: enables dropout and, with `FLAGS.use_tpu`, cross-shard
        gathering of passages and labels.
      input_ids_1: query token ids.
      input_mask_1: query input mask.
      segment_ids_1: query segment ids.
      input_ids_2: passage token ids.
      input_masks_2: passage input mask.
      segment_ids_2: passage segment ids.
      num_candidates: number of candidate passages per query.
      labels: relevance labels, cast to float32.
      use_one_hot_embeddings: forwarded to `encode_block`.

    Returns:
      A tuple `(per_example_loss, logits)`.
    """
    query_len = FLAGS.max_seq_length_query
    passage_len = FLAGS.max_seq_length - FLAGS.max_seq_length_query

    query_ids = tf.reshape(input_ids_1, [-1, query_len])
    query_segments = tf.reshape(segment_ids_1, [-1, query_len])
    query_input_mask = tf.reshape(input_mask_1, [-1, query_len])

    batch_size = tf.shape(query_input_mask)[0]

    passage_ids = tf.reshape(input_ids_2, [-1, passage_len])
    passage_segments = tf.reshape(segment_ids_2, [-1, passage_len])
    passage_input_mask = tf.reshape(input_masks_2, [-1, passage_len])

    # [batch_size, num_candidates]
    labels = tf.dtypes.cast(labels, tf.float32)

    # [batch_size, num_vec_query, hidden_size], [batch_size, num_vec_query]
    output_layer_1, mask_1 = encode_block(
        bert_config, query_ids, query_input_mask, query_segments,
        use_one_hot_embeddings, FLAGS.num_vec_query, is_training)

    output_layer_2, mask_2 = encode_block(
        bert_config, passage_ids, passage_input_mask, passage_segments,
        use_one_hot_embeddings, FLAGS.num_vec_passage, is_training)

    # Spread each query's labels over the in-batch candidate axis: the labels
    # are tiled once per query and zeroed outside the query's own block.
    own_block = tf.expand_dims(tf.eye(batch_size), axis=2)
    own_block = tf.tile(own_block, [1, 1, num_candidates])
    own_block = tf.reshape(own_block, [batch_size, -1])
    own_block = tf.cast(own_block, tf.float32)

    labels = tf.tile(labels, [1, batch_size])
    labels = tf.multiply(labels, own_block)

    # Per-query scores against the query's own candidates.
    passages_by_query = tf.reshape(
        output_layer_2,
        [batch_size, num_candidates, FLAGS.num_vec_passage, -1])
    passage_mask_by_query = tf.reshape(
        mask_2, [batch_size, num_candidates, FLAGS.num_vec_passage])
    pair_mask = tf.einsum("BQ,BCP->BCQP",
                          tf.cast(mask_1, tf.float32),
                          tf.cast(passage_mask_by_query, tf.float32))
    logits = tf.einsum("BQH,BCPH->BCQP", output_layer_1, passages_by_query)
    logits = tf.multiply(logits, pair_mask)
    logits = tf.reduce_max(logits, axis=-1)
    logits = tf.reduce_sum(logits, axis=-1)

    if FLAGS.use_tpu and is_training:
        num_shards = tpu_utils.num_tpu_shards()

        output_layer_2 = tpu_utils.cross_shard_concat(output_layer_2)

        # The mask goes through the cross-shard concat as float32 and is cast
        # to bool afterwards.
        mask_2 = tpu_utils.cross_shard_concat(tf.cast(mask_2, tf.float32))
        mask_2 = tf.cast(mask_2, tf.bool)

        labels = tpu_utils.cross_shard_pad(labels)

        tf.logging.info("Global batch size: %s", tensor_utils.shape(labels, 0))
        tf.logging.info("Num shards: %s", num_shards)
        tf.logging.info("Number of candidates in batch: %s",
                        tensor_utils.shape(output_layer_2, 0))

        # Regroup the padded labels so that each local query owns one row
        # spanning the candidates of all shards.
        labels = tf.reshape(labels, [num_shards, batch_size, -1])
        labels = tf.transpose(labels, perm=[1, 0, 2])
        labels = tf.reshape(labels, [batch_size, -1])

    with tf.variable_scope("loss"):
        if is_training:
            output_layer_1 = tf.nn.dropout(output_layer_1,
                                           keep_prob=FLAGS.dropout)
            output_layer_2 = tf.nn.dropout(output_layer_2,
                                           keep_prob=FLAGS.dropout)

        cosine_similarity = tf.einsum("AQH,BPH->ABQP", output_layer_1,
                                      output_layer_2)

        # Broadcast the two vector masks against each other to [A, B, Q, P].
        query_valid = tf.expand_dims(tf.expand_dims(mask_1, 2), 1)
        passage_valid = tf.expand_dims(tf.expand_dims(mask_2, 1), 0)
        mask = tf.cast(tf.logical_and(query_valid, passage_valid), tf.float32)

        cosine_similarity = tf.multiply(cosine_similarity, mask)
        cosine_similarity = tf.reduce_max(cosine_similarity, axis=-1)
        cosine_similarity = tf.reduce_sum(cosine_similarity, axis=-1)

        per_example_loss = tf.losses.softmax_cross_entropy(
            labels, cosine_similarity)

    return (per_example_loss, logits)
def loss_som(self):
    """Computes the SOM loss.

    For every SOM node the four grid neighbours (up, down, right, left, with
    wrap-around at the borders) are located, the log of `self.q_ng` at those
    neighbours is summed, weighted by `self.q` (entries not above 0.1 are
    zeroed and gradients through the weights are stopped), summed over nodes
    and averaged over the first axis.

    Node index `k` is decomposed row-major as `k = row * som_dim[1] + col`.
    The same stride `som_dim[1]` is used to rebuild neighbour indices, so
    rectangular maps are handled consistently; for square maps the result is
    unchanged.

    Returns:
      A scalar tensor: the negated mean of the weighted neighbour
      log-assignments.
    """
    num_rows, num_cols = self.som_dim[0], self.som_dim[1]
    num_nodes = num_rows * num_cols

    k = tf.range(num_nodes)
    k_1 = k // num_cols  # row index, in [0, num_rows)
    k_2 = k % num_cols  # column index, in [0, num_cols)

    k1_not_top = tf.less(k_1, tf.constant(num_rows - 1, dtype=tf.int32))
    k1_not_bottom = tf.greater(k_1, tf.constant(0, dtype=tf.int32))
    k2_not_right = tf.less(k_2, tf.constant(num_cols - 1, dtype=tf.int32))
    k2_not_left = tf.greater(k_2, tf.constant(0, dtype=tf.int32))

    # Neighbour coordinates; border nodes wrap around to the opposite side.
    k1_up = tf.where(k1_not_top, tf.add(k_1, 1),
                     tf.zeros(tf.shape(k_1), dtype=tf.dtypes.int32))
    k1_down = tf.where(
        k1_not_bottom, tf.subtract(k_1, 1),
        tf.ones(tf.shape(k_1), dtype=tf.dtypes.int32) * (num_rows - 1))
    k2_right = tf.where(k2_not_right, tf.add(k_2, 1),
                        tf.zeros(tf.shape(k_2), dtype=tf.dtypes.int32))
    k2_left = tf.where(
        k2_not_left, tf.subtract(k_2, 1),
        tf.ones(tf.shape(k_2), dtype=tf.dtypes.int32) * (num_cols - 1))

    # Back to linear node indices, using the same row-major layout as above.
    k_up = k1_up * num_cols + k_2
    k_down = k1_down * num_cols + k_2
    k_right = k_1 * num_cols + k2_right
    k_left = k_1 * num_cols + k2_left

    # NOTE(review): assumes self.q_ng and self.q are [batch, num_nodes] -
    # confirm against where they are built.
    q_t = tf.transpose(self.q_ng)

    def _neighbour_q(neighbour_index):
        """Returns q_ng re-ordered so position k holds node k's neighbour."""
        gathered = tf.gather_nd(q_t,
                                tf.reshape(neighbour_index, [num_nodes, 1]))
        return tf.transpose(gathered)

    q_up = _neighbour_q(k_up)
    q_down = _neighbour_q(k_down)
    q_right = _neighbour_q(k_right)
    q_left = _neighbour_q(k_left)

    q_neighbours = tf.concat([
        tf.expand_dims(q_up, -1),
        tf.expand_dims(q_down, -1),
        tf.expand_dims(q_right, -1),
        tf.expand_dims(q_left, -1)
    ], axis=2)
    q_neighbours = tf.reduce_sum(tf.math.log(q_neighbours), axis=-1)

    # Only assignments above 0.1 contribute as weights.
    mask = tf.greater(self.q, 0.1 * tf.ones_like(self.q))
    new_q = tf.multiply(self.q, tf.cast(mask, tf.float32))

    q_n = tf.math.multiply(q_neighbours, tf.stop_gradient(new_q))
    q_n = tf.reduce_sum(q_n, axis=-1)
    qq = tf.math.negative(tf.reduce_mean(q_n))
    return qq
def joint_extraction_model_fn(features, labels, mode, params):
    """Runs the node-level sequence labeling model.

    Embeds the text of every node (words plus a character CNN), encodes it
    with a bidirectional LSTM, concatenates the context features enabled in
    `params`, encodes the resulting node sequence and predicts one tag per
    node, optionally decoding with a CRF.

    Args:
      features: nested tuple of input tensors; its layout depends on
        `params["circle_features"]` (see the unpacking below).
      labels: gold tags, mapped to ids through the `params["tags"]`
        vocabulary file; not used in PREDICT mode.
      mode: a `tf_estimator.ModeKeys` value.
      params: dict with hyperparameters, feature switches and the paths of
        the vocabulary / embedding files.

    Returns:
      A `tf_estimator.EstimatorSpec` carrying predictions in PREDICT mode,
      loss and train op in TRAIN mode, and loss and eval metrics otherwise.
    """
    logging.info("joint_extraction_model_fn")
    inputs = features  # Arg "features" is the overall inputs.

    # Read vocabs and inputs.
    dropout = params["dropout"]
    if params["circle_features"]:
        (nnodes, friend_has_label, (words, nwords),
         (prev_text_words, n_prev_text_words),
         (chars_list, chars_len_list), (partner_words, _),
         (friends_words, n_friends_words), (friends_fix, friends_var),
         (leaf_type_list, goldmine_feat_list), (_, _),
         (node_xpath_list, node_xpath_len_list),
         (attributes, attributes_plus_none), (position_list)) = inputs
    else:
        (nnodes, (words, nwords), (prev_text_words, n_prev_text_words),
         (chars_list, chars_len_list),
         (leaf_type_list, goldmine_feat_list), (_, _),
         (node_xpath_list, node_xpath_len_list), (attributes),
         (position_list)) = inputs

    # nnodes, the number of nodes in each page;
    #    shape is [?]; length is the number of pages.
    # words, nwords are the node_text feature, shape is [?, ?, ?]
    #    the first two dimension is the batch * pages,
    #    the last one is the maximum length of the word lists
    # prev_text_words, n_prev_text_words, similar as above for previous
    #    nodes' text
    # chars_list, chars_len_list, shape is [?,?,?,?] also for node_text
    #    features; the additional dim is for the length of the character
    #    sequences.
    # friends_words, shape is [?, ?, ?], gathers all the words from different
    #    friends of one node.
    # friends_fix, friends_var, shapes are [?, ?, ?, ?]
    #    the first two dimension is the batch * pages,
    #    the last two are the maximum length of friend nodes and words.

    nnodes = merge_first_two_dims(nnodes)
    training = (mode == tf_estimator.ModeKeys.TRAIN)
    vocab_words = _index_table_from_file(
        params["words"], num_oov_buckets=params["num_oov_buckets"])
    with tf.gfile.Open(params["tags"]) as f:
        indices = [idx for idx, tag in enumerate(f) if tag.strip() != "none"]
        num_tags = len(indices) + 1  # Make "None" as the tag with the last index.

    # NodeText Char Embeddings.
    with tf.gfile.Open(params["chars"]) as f:
        num_chars = sum(1 for _ in f) + params["num_oov_buckets"]
    vocab_chars = _index_table_from_file(
        params["chars"], num_oov_buckets=params["num_oov_buckets"])
    char_ids = vocab_chars.lookup(chars_list)
    variable = tf.get_variable("chars_embeddings",
                               [num_chars + 1, params["dim_chars"]],
                               tf.float32)
    char_embeddings = tf.nn.embedding_lookup(variable, char_ids)
    char_embeddings = tf.layers.dropout(
        char_embeddings, rate=dropout, training=training)
    logging.info("char_embeddings.shape: %s", char_embeddings.shape)
    # Char 1d convolution.
    weights = tf.sequence_mask(chars_len_list)
    char_embeddings = masked_conv1d_and_max(
        char_embeddings, weights, params["filters"], params["kernel_size"])
    logging.info("char_embeddings.shape after CNN: %s", char_embeddings.shape)

    # Word Embeddings.
    word_ids = vocab_words.lookup(words)
    # Read the embedding matrix inside the `with` so the file handle is
    # closed once the array has been materialised.
    with tf.gfile.Open(params["glove"], "rb") as glove_file:
        glove = np.load(glove_file)["embeddings"]  # np.array
    # One extra all-zero row is appended below the pretrained vectors.
    variable = np.vstack([glove, [[0.] * params["dim_word_embedding"]]])
    # To finetune the GloVe embedding by setting trainable as True.
    variable = tf.Variable(variable, dtype=tf.float32, trainable=True)
    word_embeddings = tf.nn.embedding_lookup(variable, word_ids)
    logging.info("word_embeddings.shape: %s", word_embeddings.shape)

    # Prev_Text Representations.
    prev_text_word_ids = vocab_words.lookup(prev_text_words)
    prev_text_word_embeddings = tf.nn.embedding_lookup(variable,
                                                       prev_text_word_ids)
    if params["use_prev_text_lstm"]:
        # PREV_text LSTM.
        logging.info("prev_text_representation using lstm")
        prev_t = merge_first_two_dims(prev_text_word_embeddings)
        # Seq * batch * input
        prev_t = tf.transpose(prev_t, perm=[1, 0, 2])  # Need time-major.
        prev_output_fw, prev_output_bw = _bidirectional_lstm(
            prev_t, params["lstm_size"],
            merge_first_two_dims(n_prev_text_words))
        prev_output = tf.concat([prev_output_fw, prev_output_bw], axis=-1)
        prev_output = tf.reduce_mean(prev_output, 0)
        prev_output = tf.layers.dropout(
            prev_output, rate=dropout, training=training)
        logging.info("prev_output.shape (after reduce_mean): %s",
                     prev_output.shape)
        context_representation = split_first_two_dims_by_example(
            prev_output, prev_text_word_embeddings)
        logging.info("context_representation.shape (after split): %s",
                     context_representation.shape)
    else:
        logging.info("prev_text_word_embeddings.shape: %s",
                     prev_text_word_embeddings.shape)
        context_representation = tf.reduce_mean(prev_text_word_embeddings, 2)
        logging.info("context_representation.shape: %s",
                     context_representation.shape)

    if params["circle_features"]:
        partner_embeddings, circle_representation = circle_feature_modeling(
            variable, vocab_words, partner_words, friends_words,
            n_friends_words, friends_fix, friends_var, word_embeddings,
            dropout, training, params)
        # The circle representation replaces the previous-text context.
        context_representation = circle_representation

        if params["use_friend_semantic"]:
            friends_ids = vocab_words.lookup(friends_words)
            friend_embeddings = tf.nn.embedding_lookup(variable, friends_ids)

    if params["use_xpath_lstm"]:
        h_output = xpath_feature_modeling(node_xpath_list,
                                          node_xpath_len_list, training,
                                          params)
        context_representation = tf.concat(
            [h_output, context_representation], axis=2)

    if params["use_position_embedding"]:
        position_representation = position_modeling(position_list, params)
        context_representation = tf.concat(
            [context_representation, position_representation], axis=2)

    # Text Embeddings: Concatenate Word and Char and Feature Embeddings.
    embeddings = tf.concat([word_embeddings, char_embeddings], axis=-1)
    embeddings = tf.layers.dropout(embeddings, rate=dropout, training=training)
    logging.info("embeddings.shape: %s", embeddings.shape)

    # LSTM inside node texts.
    t = merge_first_two_dims(embeddings)
    t = tf.transpose(t, perm=[1, 0, 2])  # Need time-major.
    output_fw, output_bw = _bidirectional_lstm(
        t, params["lstm_size"], merge_first_two_dims(nwords))
    output = tf.concat([output_fw, output_bw], axis=-1)
    output = tf.reduce_mean(output, 0)
    output = tf.layers.dropout(output, rate=dropout, training=training)
    logging.info("output.shape (after reduce_mean): %s", output.shape)
    output = split_first_two_dims_by_example(output, embeddings)
    logging.info("output.shape (after split): %s", output.shape)

    node_seq_input = tf.concat([output, context_representation], axis=2)
    logging.info("output.shape (after + prev): %s", node_seq_input.shape)

    # Leaf Type Features.
    if params["add_leaf_types"]:
        with tf.gfile.Open(params["leaf_types"]) as f:
            num_leaf_types = sum(1 for _ in f) + params["num_oov_buckets"]
        vocab_leaf_types = _index_table_from_file(
            params["leaf_types"], num_oov_buckets=params["num_oov_buckets"])
        leaf_type_ids = vocab_leaf_types.lookup(leaf_type_list)
        leaf_variable = tf.get_variable(
            "leaf_type_embeddings",
            [num_leaf_types + 1, params["dim_leaf_type"]], tf.float32)
        leaf_type_embeddings = tf.nn.embedding_lookup(leaf_variable,
                                                      leaf_type_ids)
        leaf_type_embeddings = tf.layers.dropout(
            leaf_type_embeddings, rate=dropout, training=training)
        # Log the tensor the message names (previously logged
        # char_embeddings.shape).
        logging.info("leaf_type_embeddings.shape: %s",
                     leaf_type_embeddings.shape)
        logging.info("node_seq_input.shape before leaf: %s",
                     node_seq_input.shape)
        node_seq_input = tf.concat([node_seq_input, leaf_type_embeddings],
                                   axis=2)
        logging.info("node_seq_input.shape after leaf: %s",
                     node_seq_input.shape)

    # Goldmine Feat Embeddings.
    if params["add_goldmine"]:
        vocab_goldmine_features = _index_table_from_file(
            params["goldmine_features"], num_oov_buckets=1)
        goldmine_feature_variable = tf.get_variable(
            "goldmine_feature_embeddings", [8 + 1, params["dim_goldmine"]],
            tf.float32)
        goldmine_feat_ids = vocab_goldmine_features.lookup(goldmine_feat_list)
        goldmine_feat_embeddings = tf.nn.embedding_lookup(
            goldmine_feature_variable, goldmine_feat_ids)
        goldmine_feat_embeddings = tf.reduce_sum(goldmine_feat_embeddings, 2)
        logging.info("goldmine_feat_embeddings.shape: %s",
                     goldmine_feat_embeddings.shape)
        node_seq_input = tf.concat([node_seq_input, goldmine_feat_embeddings],
                                   axis=2)
        logging.info("node_seq_input.shape after goldmine: %s",
                     node_seq_input.shape)

    # Node-level LSTM modeling.
    if params["node_encoder"] == "lstm":
        # Node-Sequence-LSTM.
        n_t = tf.transpose(node_seq_input, perm=[1, 0, 2])  # Need time-major.
        node_output_fw, node_output_bw = _bidirectional_lstm(
            n_t, params["node_lstm_size"], nnodes)
        node_seq_output = tf.concat([node_output_fw, node_output_bw], axis=-1)
        node_seq_output = tf.transpose(node_seq_output, perm=[1, 0, 2])
    elif params["node_encoder"] == "cnn":
        node_weights = tf.sequence_mask(nnodes)
        node_seq_output = masked_conv1d_and_max(
            node_seq_input,
            node_weights,
            params["node_filters"],
            params["node_kernel_size"],
            reducemax=False)
    elif params["node_encoder"] == "transformer":
        # Node-Sequence-Transformer.
        node_seq_output = transformer_encoding(node_seq_input, nnodes, params,
                                               mode)
    else:
        # Any other value: no node-level encoder.
        node_seq_output = node_seq_input

    logging.info("node_seq_input.shape after encoder: %s",
                 node_seq_output.shape)

    if params["node_encoder"] != "transformer":
        # Add the dropout layer if the encoder is not a transformer.
        node_seq_output = tf.layers.dropout(
            node_seq_output, rate=dropout, training=training)

    if params["use_friends_discrete_feature"] and params["circle_features"]:
        friend_has_label = tf.expand_dims(friend_has_label, axis=-1)
        node_seq_output = tf.concat([node_seq_output, friend_has_label],
                                    axis=-1)
        logging.info("node_seq_input.shape after friend_has_label: %s",
                     node_seq_output.shape)
        # NOTE(review): this projection is assumed to belong to the
        # friend_has_label branch - confirm the nesting against the caller's
        # checkpoints.
        node_seq_output = tf.layers.dense(node_seq_output,
                                          params["last_hidden_layer_size"])

    logits = tf.layers.dense(node_seq_output, num_tags, name="label_dense_1")

    if params["semantic_encoder"] and params["circle_features"]:
        partner_similarity_emb = semantic_similarity(
            variable, vocab_words, partner_embeddings, attributes, params)
        node_seq_output = tf.concat(
            [node_seq_output, tf.nn.softmax(partner_similarity_emb)], axis=-1)
        logging.info("node_seq_output.shape after semantic encoder: %s",
                     node_seq_output.shape)

        if params["use_friend_semantic"]:
            friends_similarity_emb = semantic_similarity(
                variable, vocab_words, friend_embeddings, attributes, params)
            node_seq_output = tf.concat(
                [node_seq_output, friends_similarity_emb], axis=-1)

        # With the semantic encoder on, the logits computed above are
        # replaced by the head that matches the training objective.
        if params["objective"] == "classification":
            node_seq_output = tf.layers.dense(
                node_seq_output,
                params["dim_word_embedding"],
                activation="relu")
            node_seq_output = tf.layers.dense(
                node_seq_output, params["last_hidden_layer_size"])
            logging.info("node_seq_output.shape after semantic encoder: %s",
                         node_seq_output.shape)
            logits = tf.layers.dense(node_seq_output, num_tags,
                                     name="label_dense_2")
        elif params["objective"] == "semantic_scorer":
            logits = semantic_scorer(attributes_plus_none, node_seq_output,
                                     params)
        elif params["objective"] == "binary_scorer":
            logits = binary_scorer(attributes_plus_none, node_seq_output,
                                   training, params)

    if params["use_crf"]:
        # CRF Layer.
        logging.info("logits.shape: %s", logits.shape)
        crf_params = tf.get_variable("crf", [num_tags, num_tags],
                                     dtype=tf.float32)
        pred_ids, _ = tfa.text.crf.crf_decode(logits, crf_params, nnodes)
        logging.info("pred_ids.shape: %s", pred_ids.shape)
    else:
        pred_ids = tf.argmax(logits, 2)
        logging.info("pred_ids.shape: %s", pred_ids.shape)

    # Predict for new sentences in target set.
    if mode == tf_estimator.ModeKeys.PREDICT:
        reverse_vocab_tags = _index_table_from_file(params["tags"], 1)
        pred_strings = reverse_vocab_tags.lookup(
            tf.strings.as_string(pred_ids))
        predictions = {
            "pred_ids": pred_ids,
            "tags": pred_strings,
            "scores": tf.nn.softmax(logits),
            "raw_scores": logits,
        }
        # Store the intermediate weights.
        # NOTE(review): partner_similarity_emb and circle_representation only
        # exist when params["circle_features"] is set; the two lookups below
        # raise NameError otherwise - confirm the flags are always combined.
        if params["semantic_encoder"]:
            predictions["similarity"] = partner_similarity_emb
        if params["friend_encoder"]:
            predictions["friends_embs"] = circle_representation
        if params["extract_node_emb"]:
            predictions["node_embs"] = node_seq_output
        return tf_estimator.EstimatorSpec(mode, predictions=predictions)

    vocab_tags = _index_table_from_file(params["tags"], 1)
    tags = vocab_tags.lookup(labels)
    # Log the tensor the message names (previously logged logits.shape).
    logging.info("tags.shape: %s", tags.shape)
    logging.info(
        "Parameter size: %s",
        np.sum([
            np.prod(v.get_shape().as_list()) for v in tf.trainable_variables()
        ]))

    if params["use_crf"]:
        log_likelihood, _ = tfa.text.crf.crf_log_likelihood(
            logits, tags, nnodes, crf_params)
        loss = tf.reduce_mean(-log_likelihood)
    else:
        loss = tf.losses.sparse_softmax_cross_entropy(labels=tags,
                                                      logits=logits)

    # Processing the metrics.
    weights = tf.sequence_mask(nnodes)
    metrics = {
        "acc":
            tf.metrics.accuracy(tags, pred_ids, weights),
        "precision":
            seq_tagging_metric_util.precision(tags, pred_ids, num_tags,
                                              indices, weights),
        "recall":
            seq_tagging_metric_util.recall(tags, pred_ids, num_tags, indices,
                                           weights),
        "f1":
            seq_tagging_metric_util.f1(tags, pred_ids, num_tags, indices,
                                       weights),
    }
    for metric_name, op in metrics.items():
        # Each metric is a pair; the second element is what gets summarised.
        tf.summary.scalar(metric_name, op[1])

    if mode == tf_estimator.ModeKeys.TRAIN:
        with tf.name_scope("train_scope"):
            optimizer = tf.train.AdamOptimizer()
            train_op = optimizer.minimize(
                loss, global_step=tf.train.get_or_create_global_step())
        return tf_estimator.EstimatorSpec(
            mode=mode, loss=loss, train_op=train_op)
    return tf_estimator.EstimatorSpec(
        mode=mode, loss=loss, eval_metric_ops=metrics)
def _up_convolve_transpose_explicit(self, inputs, kernel, prepadding):
    """Upsampled convolution via a transpose convolution with EXPLICIT padding.

    Only the output values that are actually needed are computed. Supported
    for rank 1 and rank 2 when the layer is not channel separable; every other
    configuration is delegated to `self._raise_notimplemented()`.

    Args:
      inputs: input tensor; integer inputs are cast to float32 for the op and
        the rounded result is cast to `self.accum_dtype`.
      kernel: convolution kernel whose last two dimensions are swapped before
        use.
      prepadding: per spatial dimension, a pair of pre-padding amounts applied
        before/after; only read when `self.padding` is not "valid".

    Returns:
      The convolved tensor, strided by `self.strides_down` if any of those
      strides exceeds 1.
    """
    rank = self._rank
    cast_needed = inputs.dtype.is_integer

    # conv2d_backprop_input expects the output and input channels in reversed
    # order, so the last two kernel dimensions are swapped.
    kernel = tf.transpose(kernel, list(range(rank)) + [rank + 1, rank])

    def _output_length(length, stride, support, pad_total):
        """Spatial output size of the equivalent padded conv2d call."""
        if self.extra_pad_end:
            return length * stride + ((support - 1) - pad_total)
        return length * stride + ((support - 1) - (stride - 1) - pad_total)

    # Explicit padding of the equivalent conv2d call and the resulting output
    # shape, taking any pre-padding into account.
    input_shape = tf.shape(inputs)
    padding = (rank + 2) * [(0, 0)]
    output_shape = [input_shape[0]] + (rank + 1) * [None]
    if self.data_format == "channels_last":
        spatial_axes = range(1, rank + 1)
        output_shape[-1] = self.filters
    else:
        spatial_axes = range(2, rank + 2)
        output_shape[1] = self.filters

    for i, axis in enumerate(spatial_axes):
        support = self.kernel_support[i]
        stride = self.strides_up[i]
        if self.padding == "valid":
            padding[axis] = 2 * (support - 1,)
        else:  # same
            pad_before, pad_after = prepadding[i][0], prepadding[i][1]
            padding[axis] = (
                pad_before * stride + support // 2,
                pad_after * stride + (support - 1) // 2,
            )
        output_shape[axis] = _output_length(
            input_shape[axis], stride, support, sum(padding[axis]))

    data_format = self._op_data_format
    strides = self._padded_tuple(self.strides_up, 1)

    def _backprop(op_inputs, op_kernel, op_shape, op_strides, op_padding,
                  op_format):
        """Runs the 2D transpose convolution with the integer casts around it."""
        if cast_needed:
            op_inputs = tf.cast(op_inputs, tf.float32)
        result = tf.nn.conv2d_backprop_input(
            op_shape,
            op_kernel,
            op_inputs,
            strides=op_strides,
            padding=op_padding,
            data_format=op_format)
        if cast_needed:
            result = tf.cast(tf.math.round(result), self.accum_dtype)
        return result

    # Compute convolution.
    if rank == 1 and not self.channel_separable:
        # There's no 1D equivalent to conv2d_backprop_input, so a dummy
        # dimension is inserted, the 2D op is used, and the dimension is
        # squeezed out again.
        extradim = {"channels_first": 2, "channels_last": 1}[self.data_format]
        data_format = data_format.replace("W", "HW")
        strides = (strides[:extradim] + (strides[extradim],) +
                   strides[extradim:])
        padding = padding[:extradim] + [(0, 0)] + padding[extradim:]
        output_shape = output_shape[:extradim] + [1] + output_shape[extradim:]
        kernel = tf.expand_dims(kernel, 0)
        inputs = tf.expand_dims(inputs, extradim)
        outputs = _backprop(inputs, kernel, output_shape, strides, padding,
                            data_format)
        outputs = tf.squeeze(outputs, [extradim])
    elif rank == 2 and not self.channel_separable:
        outputs = _backprop(inputs, kernel, output_shape, strides, padding,
                            data_format)
    else:
        self._raise_notimplemented()

    # Perform downsampling if it is requested.
    if any(stride > 1 for stride in self.strides_down):
        slices = tuple(
            slice(None, None, stride) for stride in self.strides_down)
        slices = self._padded_tuple(slices, slice(None))
        outputs = outputs[slices]

    return outputs
def _up_convolve_transpose_valid(self, inputs, kernel, prepadding):
    """Upsampled convolution via a transpose convolution in VALID mode.

    The transpose convolution is computed without padding into a larger
    temporary, and the border values that are not needed are cropped away
    afterwards. This is relatively inefficient, since many of the values
    computed at the boundaries are discarded.

    Args:
      inputs: input tensor.
      kernel: convolution kernel.
      prepadding: per spatial dimension, a pair of pre-padding amounts applied
        before/after; only read when `self.padding` is not "valid".

    Returns:
      The cropped (and, per `self.strides_down`, strided) result.
      Unsupported combinations of rank, channel separability, filters and
      strides are delegated to `self._raise_notimplemented()`.
    """
    rank = self._rank
    separable = self.channel_separable

    # Transpose convolutions expect the output and input channels in reversed
    # order, which is implemented by swapping those kernel dimensions. For
    # channel separable convolutions only one filter per channel is
    # supported, so the last dimension has length one, which happens to be
    # the layout the op expects; the transpose is skipped in that case.
    if not separable:
        kernel = tf.transpose(kernel, list(range(rank)) + [rank + 1, rank])

    def _temp_length(length, stride, support):
        """Spatial size of the uncropped transpose convolution output."""
        if self.extra_pad_end:
            return length * stride + (support - 1)
        return length * stride + ((support - 1) - (stride - 1))

    # Compute shape of temporary.
    input_shape = tf.shape(inputs)
    temp_shape = [input_shape[0]] + (rank + 1) * [None]
    if self.data_format == "channels_last":
        spatial_axes = range(1, rank + 1)
        temp_shape[-1] = input_shape[-1] if separable else self.filters
    else:
        spatial_axes = range(2, rank + 2)
        temp_shape[1] = input_shape[1] if separable else self.filters

    for i, axis in enumerate(spatial_axes):
        temp_shape[axis] = _temp_length(
            input_shape[axis], self.strides_up[i], self.kernel_support[i])

    data_format = self._op_data_format
    strides = self._padded_tuple(self.strides_up, 1)

    def _run_1d_as_2d(backprop_op):
        """Runs a 2D backprop op on 1D data through a dummy extra dimension."""
        extradim = {"channels_first": 2, "channels_last": 1}[self.data_format]
        strides_2d = (strides[:extradim] + (strides[extradim],) +
                      strides[extradim:])
        shape_2d = temp_shape[:extradim] + [1] + temp_shape[extradim:]
        kernel_2d = tf.expand_dims(kernel, 0)
        inputs_2d = tf.expand_dims(inputs, extradim)
        result = backprop_op(
            shape_2d,
            kernel_2d,
            inputs_2d,
            strides=strides_2d,
            padding="VALID",
            data_format=data_format.replace("W", "HW"))
        return tf.squeeze(result, [extradim])

    # Compute convolution.
    if rank == 1 and not separable:
        # There's no 1D equivalent to conv2d_backprop_input.
        outputs = _run_1d_as_2d(tf.nn.conv2d_backprop_input)
    elif rank == 1 and separable and self.filters == 1:
        # There's no 1D equivalent to depthwise_conv2d_native_backprop_input.
        outputs = _run_1d_as_2d(tf.nn.depthwise_conv2d_native_backprop_input)
    elif rank == 2 and not separable:
        outputs = tf.nn.conv2d_backprop_input(
            temp_shape,
            kernel,
            inputs,
            strides=strides,
            padding="VALID",
            data_format=data_format)
    elif (rank == 2 and separable and self.filters == 1 and
          self.strides_up[0] == self.strides_up[1]):
        outputs = tf.nn.depthwise_conv2d_native_backprop_input(
            temp_shape,
            kernel,
            inputs,
            strides=strides,
            padding="VALID",
            data_format=data_format)
    elif rank == 3 and not separable:
        outputs = tf.nn.conv3d_transpose(
            inputs,
            kernel,
            temp_shape,
            strides=strides,
            padding="VALID",
            data_format=data_format)
    else:
        self._raise_notimplemented()

    # Perform crop, taking into account any pre-padding that was applied.
    slices = (rank + 2) * [slice(None)]
    for i, axis in enumerate(spatial_axes):
        support = self.kernel_support[i]
        if self.padding == "valid":
            # Drop `kernel_support - 1` samples on both sides, which leaves
            # only samples computed without any padding.
            crop_before = crop_after = support - 1
        else:  # same
            # Drop half a kernel size plus the pre-padding on each side.
            crop_before = prepadding[i][0] * self.strides_up[i]
            crop_before += support // 2
            crop_after = prepadding[i][1] * self.strides_up[i]
            crop_after += (support - 1) // 2
        step = self.strides_down[i]
        slices[axis] = slice(
            crop_before if crop_before > 0 else None,
            -crop_after if crop_after > 0 else None,
            step if step > 1 else None)

    # Skip the slicing op entirely when nothing is cropped or strided.
    needs_slicing = any(
        s.start is not None or s.stop is not None or s.step is not None
        for s in slices)
    if needs_slicing:
        outputs = outputs[tuple(slices)]

    return outputs
shape=[1, 1], dtype=tf.float32, initializer=tf.zeros_initializer()) ## Forward prop Z1 = tf.add(tf.matmul(W1, x), b1) A1 = tf.nn.relu(Z1) Z2 = tf.add(tf.matmul(W2, A1), b2) A2 = tf.nn.relu(Z2) Z3 = tf.add(tf.matmul(W3, A2), b3) ## Compute Cost cost = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits(logits=tf.transpose(Z3), labels=tf.transpose(y))) ## Backward Prop back_prop = tf.train.GradientDescentOptimizer(learning_rate=.01).minimize(cost) with tf.Session() as sess: init = tf.global_variables_initializer() sess.run(init) cost_main = [] for i in range(2000): _, cost_iter = sess.run([back_prop, cost], feed_dict={ x: X_train, y: y_train })
def _generate_detections_tf(cls_outputs,
                            box_outputs,
                            anchor_boxes,
                            indices,
                            classes,
                            image_id,
                            image_scale,
                            min_score_thresh=MIN_SCORE_THRESH,
                            max_boxes_to_draw=MAX_DETECTIONS_PER_IMAGE,
                            soft_nms_sigma=0.0,
                            iou_threshold=0.5,
                            use_native_nms=True):
    """Generates detections with model outputs and anchors.

    Args:
      cls_outputs: class logits of the N anchors selected by top-k over all
        feature levels (k being MAX_DETECTION_POINTS); a sigmoid is applied
        here to obtain scores.
      box_outputs: [N, 4] box regression outputs of the selected anchors.
      anchor_boxes: anchors of all feature levels; the rows listed in
        `indices` are gathered before decoding.
      indices: [N] indices from the top-k selection.
      classes: [N] class predictions of the selected anchors.
      image_id: an integer number to specify the image id.
      image_scale: a float tensor representing the scale between original
        image and input image for the detector. It is used to rescale
        detections for evaluating with the original groundtruth annotations.
      min_score_thresh: A float representing the threshold for deciding when
        to remove boxes based on score. Only used by the native NMS path.
      max_boxes_to_draw: Max number of boxes to keep. Only used by the native
        NMS path.
      soft_nms_sigma: A scalar float representing the Soft NMS sigma
        parameter; see Bodla et al, https://arxiv.org/abs/1704.04503. When
        `soft_nms_sigma=0.0` (which is default), standard (hard) NMS is used.
        Only used by the native NMS path.
      iou_threshold: A float representing the threshold for deciding whether
        boxes overlap too much with respect to IOU.
      use_native_nms: a bool that indicates whether to use native nms.

    Returns:
      detections: detection results in a tensor with each row representing
        [image_id, y, x, height, width, score, class]
    """
    logging.info('Using tf version of post-processing.')
    selected_anchors = tf.gather(anchor_boxes, indices)

    scores = tf.math.sigmoid(cls_outputs)
    # Apply the box regression to the anchors; the decoder is fed the
    # transposed ([4, N]) tensors.
    boxes = decode_box_outputs_tf(
        tf.transpose(box_outputs, [1, 0]),
        tf.transpose(selected_anchors, [1, 0]))

    if use_native_nms:
        logging.info('Using native nms.')
        top_detection_idx, scores = tf.image.non_max_suppression_with_scores(
            boxes,
            scores,
            max_boxes_to_draw,
            iou_threshold=iou_threshold,
            score_threshold=min_score_thresh,
            soft_nms_sigma=soft_nms_sigma)
        boxes = tf.gather(boxes, top_detection_idx)
    else:
        logging.info('Using customized nms.')
        # The custom NMS works on rows of box coordinates followed by the
        # score.
        score_column = tf.expand_dims(scores, axis=1)
        all_detections = tf.concat([boxes, score_column], axis=1)
        top_detection_idx = nms_tf(all_detections, iou_threshold)
        kept = tf.gather(all_detections, top_detection_idx)
        scores = kept[:, 4]
        boxes = kept[:, :4]

    height = boxes[:, 2] - boxes[:, 0]
    width = boxes[:, 3] - boxes[:, 1]

    image_ids = tf.cast(
        tf.repeat(image_id, tf.size(top_detection_idx)), tf.float32)
    # Reported class ids are the gathered predictions shifted up by one.
    class_ids = tf.cast(tf.gather(classes, top_detection_idx) + 1, tf.float32)

    columns = [
        image_ids,
        boxes[:, 0] * image_scale,
        boxes[:, 1] * image_scale,
        height * image_scale,
        width * image_scale,
        scores,
        class_ids,
    ]
    return tf.stack(columns, axis=1)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, multilabel, sent_rels, sentiment,
                 entailment_rels, entailment, corr_rels, correlation):
  """Creates a BERT classifier whose loss includes label-relation penalties.

  The classification loss (sigmoid cross entropy when `multilabel` is true,
  softmax cross entropy otherwise) is averaged over the batch. Three
  regularizers are then added: each is a scalar weight times the mean of the
  squared pairwise differences between per-label probabilities, masked
  elementwise by a relation matrix.

  Args:
    bert_config: configuration passed to `modeling.BertModel`.
    is_training: whether to build the training graph; enables 0.1 dropout on
      the pooled output.
    input_ids: passed to `modeling.BertModel` as `input_ids`.
    input_mask: passed to `modeling.BertModel` as `input_mask`.
    segment_ids: passed to `modeling.BertModel` as `token_type_ids`.
    labels: target labels; cast to float32 and fed to the cross-entropy op, so
      they are presumably one-/multi-hot over `num_labels` -- confirm against
      the caller.
    num_labels: number of output labels.
    multilabel: selects sigmoid (True) or softmax (False) outputs and loss.
    sent_rels: relation matrix multiplied elementwise with the pairwise
      squared distances for the sentiment regularizer.
    sentiment: scalar weight of the sentiment regularizer.
    entailment_rels: relation matrix for the entailment regularizer.
    entailment: scalar weight of the entailment regularizer.
    corr_rels: relation matrix for the correlation regularizer.
    correlation: scalar weight of the correlation regularizer.

  Returns:
    A tuple (loss, per_example_loss, logits, probabilities), where `loss` is
    the mean classification loss plus the three regularizers.
  """
  bert = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids)

  # Classification is done on the entire segment, hence the pooled output.
  # For token-level output, use model.get_sequence_output() instead.
  pooled = bert.get_pooled_output()
  hidden_size = pooled.shape[-1].value

  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    if is_training:
      # keep_prob=0.9, i.e. 0.1 dropout.
      pooled = tf.nn.dropout(pooled, keep_prob=0.9)

    logits = tf.nn.bias_add(
        tf.matmul(pooled, output_weights, transpose_b=True), output_bias)

    # Float labels serve both the single-label and the multilabel loss.
    labels = tf.cast(labels, tf.float32)

    if not multilabel:
      probabilities = tf.nn.softmax(logits, axis=-1)
      per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
          labels=labels, logits=logits)
    else:
      probabilities = tf.nn.sigmoid(logits)
      tf.logging.info("num_labels:{};logits:{};labels:{}".format(
          num_labels, logits, labels))
      per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(
          labels=labels, logits=logits)
    loss = tf.reduce_mean(per_example_loss)

    # Pairwise squared differences between label probabilities.
    # Example with batch size 1 and predictions [0.1, 0.2, 0.3]: tiling the
    # predictions and subtracting the transposed copy yields, before squaring,
    #   [ 0.0] [-.1] [-.2]   <- target 1 minus every target
    #   [ 0.1] [ 0.0] [-.1]
    #   [ 0.2] [ 0.1] [ 0.0]
    expanded = tf.expand_dims(probabilities, 1)
    tiled = tf.tile(expanded, [1, num_labels, 1])
    expanded_t = tf.transpose(expanded, perm=[0, 2, 1])
    dists = tf.square(tf.subtract(tiled, expanded_t))
    dists = tf.transpose(dists, perm=[0, 2, 1])

    def _relation_penalty(strength, relations):
      """Returns strength * mean(dists * relations)."""
      return tf.multiply(
          tf.constant(strength),
          tf.reduce_mean(
              tf.multiply(dists, tf.constant(relations, dtype=tf.float32))))

    # Regularization based on label-relation priors, added in a fixed order.
    regularizers = (
        ("sentiment_regularization", sentiment, sent_rels),
        ("entailment_regularization", entailment, entailment_rels),
        ("correlation_regularization", correlation, corr_rels),
    )
    for summary_name, strength, relations in regularizers:
      penalty = _relation_penalty(strength, relations)
      tf.summary.scalar(summary_name, penalty)
      loss += penalty

    tf.summary.scalar("loss", loss)

    return (loss, per_example_loss, logits, probabilities)
def attention_layer(x_flat, attention_mask, batch_size, seq_length,
                    size_per_head=512, num_attention_heads=1, *,
                    cache=None, initializer_range=0.02,
                    hidden_dropout_prob=0.1,
                    attention_probs_dropout_prob=0.1, do_cache=False):
    """
    Masked multi-head self-attention over a flattened sequence batch.

    :param x_flat: Tensor input, should be [batch_size*seq_length, dim]
    :param attention_mask: Attention mask to use of size
        [seq_length, seq_length+cached_length]
    :param batch_size: batch size used to reshape the projections and output
    :param seq_length: sequence length used to reshape the projections and
        output
    :param size_per_head: dim = size_per_head * num_attention_heads
    :param num_attention_heads: dim = size_per_head * num_attention_heads
    :param cache: Optionally some past (cached) things of size
        [batch, 2, heads, sequence, features], where 2 is [k, v]. Cached keys
        and values are prepended to the new ones along the sequence axis.
    :param initializer_range: passed to the projection helper and to the
        initializer of the output projection
    :param hidden_dropout_prob: dropout applied to the projected output
    :param attention_probs_dropout_prob: currently unused; dropout on the
        attention probabilities is disabled in this implementation
    :param do_cache: True if we should return cache
    :return: A tuple of (1) a tensor of shape [batch_size*seq_length, dim] and
        (2) "cached_keys_and_values", the new keys and values stacked on
        axis 1 (presumably
        [batch_size, 2, num_attention_heads, seq_length, size_per_head], given
        the projection shape noted below), or None when do_cache is False
    :raises ValueError: if dim != size_per_head * num_attention_heads
    """
    flat_rows, dim = get_shape_list(x_flat, expected_rank=2)

    # NOTE: there is deliberately no check that
    # flat_rows == batch_size * seq_length; it had to be removed because of
    # the generation script. The cache shape is not validated either.
    model_dim = size_per_head * num_attention_heads
    if dim != model_dim:
        raise ValueError(
            "passed in a tensor of shape {} when size_per_head={} and num_attention_heads={}".format(
                (flat_rows, dim), size_per_head, num_attention_heads))

    def _project(layer_name):
        """Projects x_flat; result is documented upstream as
        [batch_size, num_attention_heads, seq_length, size_per_head]."""
        return _attention_projection_and_transpose(
            x_flat,
            batch_size=batch_size,
            seq_length=seq_length,
            num_attention_heads=num_attention_heads,
            size_per_head=size_per_head,
            name=layer_name,
            initializer_range=initializer_range)

    query = _project('query_layer')
    key = _project('key_layer')
    value = _project('value_layer')

    # Only the freshly computed keys/values go into the returned cache.
    if do_cache:
        cached_keys_and_values = tf.stack([key, value], axis=1)
    else:
        cached_keys_and_values = None

    # Prepend whatever was relevant from the cache along the sequence axis.
    if cache is not None:
        past_key, past_value = tf.unstack(cache, axis=1)
        key = tf.concat([past_key, key], axis=-2)
        value = tf.concat([past_value, value], axis=-2)

    # [batch_size, num_attention_heads, seq_length, size_per_head] times
    # [batch_size, num_attention_heads, size_per_head, seq_length+cached_length]
    # -> [batch_size, num_attention_heads, seq_length, seq_length+cached_length]
    scale = 1.0 / math.sqrt(float(size_per_head))
    attention_scores = tf.multiply(
        tf.matmul(query, key, transpose_b=True), scale)
    attention_scores = mask_attention_for_ltr(attention_scores, attention_mask)
    attention_probs = tf.nn.softmax(attention_scores)

    # NOTE: dropout on attention_probs (dropping entire tokens to attend to,
    # as in the original Transformer paper) is intentionally not applied here.

    # [batch_size, num_attention_heads, seq_length, seq_length+cached_length]
    # times
    # [batch_size, num_attention_heads, seq_length+cached_length, size_per_head]
    # -> [batch_size, num_attention_heads, seq_length, size_per_head]
    context_layer = tf.matmul(attention_probs, value)

    # -> [batch_size, seq_length, num_attention_heads, size_per_head]
    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
    context_layer = tf.reshape(
        context_layer, [batch_size * seq_length, model_dim])

    context_layer_projected = tf.layers.dense(
        context_layer,
        model_dim,
        kernel_initializer=create_initializer(initializer_range),
        name='context_projection_layer')
    context_layer_projected = dropout(
        context_layer_projected, hidden_dropout_prob)

    return context_layer_projected, cached_keys_and_values
def _generate_detections_tf(cls_outputs,
                            box_outputs,
                            anchor_boxes,
                            indices,
                            classes,
                            image_id,
                            image_scale,
                            num_classes,
                            min_score_thresh=0.2,
                            max_boxes_to_draw=50,
                            soft_nms_sigma=0.0,
                            iou_threshold=0.5,
                            use_native_nms=False):
  """Generates detections with per-class NMS from model outputs and anchors.

  Boxes are decoded once, then for every class id in [0, num_classes) the
  anchors predicted as that class are run through NMS on their own. The
  per-class results are concatenated, sorted by score, and truncated to
  `max_boxes_to_draw` rows.

  Args:
    cls_outputs: class logits of the N anchors kept by the top-k selection
      (sigmoid is applied here). Assumed to be 1-D, since the per-class scores
      are expanded to [M, 1] before being concatenated with boxes -- TODO
      confirm with caller.
    box_outputs: box regression outputs for the same N anchors. It is
      transposed with perm [1, 0] before decoding, so it must be rank 2
      (documented upstream as [N, 4]).
    anchor_boxes: anchors on all feature levels; the rows listed in `indices`
      are gathered before decoding.
    indices: indices from the top-k selection, used to gather `anchor_boxes`.
    classes: class prediction for each selected anchor, compared against
      0..num_classes-1. The emitted class id is this value plus one.
    image_id: id of the image; repeated into the first column of each row.
    image_scale: factor multiplied into y, x, height and width so detections
      can be compared with annotations of the original image.
    num_classes: an integer number of classes to loop over.
    min_score_thresh: score threshold forwarded to the native NMS op. The
      customized NMS path does not use it.
    max_boxes_to_draw: per-class limit for the native NMS op, and the limit on
      the total number of rows returned.
    soft_nms_sigma: Soft-NMS sigma forwarded to the native NMS op (see Bodla
      et al, https://arxiv.org/abs/1704.04503). The customized NMS path does
      not use it.
    iou_threshold: IoU threshold used by both NMS paths.
    use_native_nms: if True use
      `tf.image.non_max_suppression_with_scores`, otherwise `nms_tf`.

  Returns:
    detections: a float tensor with one row per kept box, laid out as
      [image_id, y, x, height, width, score, class], sorted by descending
      score.
  """
  anchor_boxes = tf.gather(anchor_boxes, indices)

  scores = tf.math.sigmoid(cls_outputs)
  # Apply the predicted box regression to the selected anchors.
  boxes = decode_box_outputs_tf(
      tf.transpose(box_outputs, [1, 0]), tf.transpose(anchor_boxes, [1, 0]))

  def _append_class_detections(detections, class_id, cls_indices):
    """Runs NMS for one class and appends its rows to `detections`."""
    boxes_cls = tf.gather(boxes, cls_indices)
    scores_cls = tf.gather(scores, cls_indices)
    # Select top-scoring boxes in each class and apply non-maximum suppression
    # (nms) for boxes in the same class. The selected boxes from each class are
    # then concatenated for the final detection outputs.
    if use_native_nms:
      top_detection_idx, scores_cls = tf.image.non_max_suppression_with_scores(
          boxes_cls,
          scores_cls,
          max_boxes_to_draw,
          iou_threshold=iou_threshold,
          score_threshold=min_score_thresh,
          soft_nms_sigma=soft_nms_sigma)
      scores_cls = tf.expand_dims(scores_cls, axis=1)
      boxes_cls = tf.gather(boxes_cls, top_detection_idx)
      top_detections_cls = tf.concat([boxes_cls, scores_cls], axis=1)
    else:
      scores_cls = tf.expand_dims(scores_cls, axis=1)
      all_detections_cls = tf.concat([boxes_cls, scores_cls], axis=1)
      top_detection_idx = nms_tf(all_detections_cls, iou_threshold)
      top_detections_cls = tf.gather(all_detections_cls, top_detection_idx)

    height = top_detections_cls[:, 2] - top_detections_cls[:, 0]
    width = top_detections_cls[:, 3] - top_detections_cls[:, 1]
    top_detections_cls = tf.stack([
        top_detections_cls[:, 0] * image_scale,
        top_detections_cls[:, 1] * image_scale,
        height * image_scale,
        width * image_scale,
        top_detections_cls[:, 4],
    ], axis=-1)

    # Prepend the image id column and append the (1-based) class id column.
    num_kept = tf.size(top_detection_idx)
    top_detections_cls = tf.stack(
        [
            tf.cast(tf.repeat(image_id, num_kept), tf.float32),
            *tf.unstack(top_detections_cls, 5, axis=1),
            tf.repeat(class_id + 1.0, num_kept),
        ],
        axis=1)

    return tf.concat([detections, top_detections_cls], axis=0)

  detections = tf.constant([], tf.float32, [0, 7])
  for c in range(num_classes):
    indices_cls = tf.squeeze(tf.where_v2(tf.equal(classes, c)), axis=-1)
    # Skip the class when no selected anchor was predicted as it. The test
    # must look at the per-class indices: the global top-k `indices` is the
    # same for every class and would never trigger the skip for one class.
    # Loop values are bound as lambda defaults to avoid late binding.
    detections = tf.cond(
        tf.equal(tf.size(indices_cls), 0),
        lambda dets=detections: dets,
        lambda dets=detections, class_id=c, cls_idx=indices_cls: (
            _append_class_detections(dets, class_id, cls_idx)))

  # Column -2 is the score; keep the best `max_boxes_to_draw` rows overall.
  indices_final = tf.argsort(detections[:, -2], direction='DESCENDING')
  detections = tf.gather(
      detections, indices_final[:max_boxes_to_draw], name='detection')
  return detections
def resnet_model_fn(features, labels, mode, params):
  """The model_fn for ResNet to be used with TPUEstimator.

  Args:
    features: `Tensor` of batched images, or a dict holding that tensor under
      the key 'feature'. If transpose_input is enabled, it is transposed to
      device layout and reshaped to 1D tensor.
    labels: `Tensor` of labels for the data samples
    mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`
    params: `dict` of parameters passed to the model from the TPUEstimator,
      `params['batch_size']` is always provided and should be used as the
      effective batch size.

  Returns:
    A `TPUEstimatorSpec` for the model. In PREDICT mode a plain
    `EstimatorSpec` with 'classes' and 'probabilities' predictions is
    returned instead.

  Raises:
    ValueError: if `params['precision']` is neither 'bfloat16' nor 'float32',
      if `params['dropblock_groups']` contains a group outside 1..4, if
      `params['enable_lars']` is set in TRAIN mode (LARS is not expected in
      the IGT experiments), or if `FLAGS.optimizer` is not supported.
  """
  if isinstance(features, dict):
    features = features['feature']

  # In most cases, the default data format NCHW instead of NHWC should be
  # used for a significant performance boost on GPU/TPU. NHWC should be used
  # only if the network needs to be run on CPU since the pooling operations
  # are only supported on NHWC.
  if params['data_format'] == 'channels_first':
    assert not params['transpose_input']  # channels_first only for GPU
    features = tf.transpose(features, [0, 3, 1, 2])

  if params['transpose_input'] and mode != tf_estimator.ModeKeys.PREDICT:
    image_size = tf.sqrt(tf.shape(features)[0] / (3 * tf.shape(labels)[0]))
    features = tf.reshape(features, [image_size, image_size, 3, -1])
    features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC

  # Normalize the image to zero mean and unit variance.
  features -= tf.constant(MEAN_RGB, shape=[1, 1, 3], dtype=features.dtype)
  features /= tf.constant(STDDEV_RGB, shape=[1, 1, 3], dtype=features.dtype)

  # DropBlock keep_prob for the 4 block groups of ResNet architecture.
  # None means applying no DropBlock at the corresponding block group.
  dropblock_keep_probs = [None] * 4
  if params['dropblock_groups']:
    # Scheduled keep_prob for DropBlock: decreases linearly from 1 to
    # params['dropblock_keep_prob'] as the global step approaches train_steps.
    train_steps = tf.cast(params['train_steps'], tf.float32)
    current_step = tf.cast(tf.train.get_global_step(), tf.float32)
    current_ratio = current_step / train_steps
    dropblock_keep_prob = (1 - current_ratio *
                           (1 - params['dropblock_keep_prob']))

    # Computes DropBlock keep_prob for different block groups of ResNet.
    dropblock_groups = [
        int(x) for x in params['dropblock_groups'].split(',')
    ]
    for block_group in dropblock_groups:
      if block_group < 1 or block_group > 4:
        raise ValueError(
            'dropblock_groups should be a comma separated list of integers '
            'between 1 and 4 (dropblcok_groups: {}).'.format(
                params['dropblock_groups']))
      # Earlier groups drop less: the drop rate shrinks by 4x per group.
      dropblock_keep_probs[block_group - 1] = 1 - (
          (1 - dropblock_keep_prob) / 4.0**(4 - block_group))

  # This nested function allows us to avoid duplicating the logic which
  # builds the network, for different values of --precision.
  def build_network():
    network = resnet_model.resnet_v1(
        resnet_depth=params['resnet_depth'],
        num_classes=params['num_label_classes'],
        dropblock_size=params['dropblock_size'],
        dropblock_keep_probs=dropblock_keep_probs,
        data_format=params['data_format'])
    return network(inputs=features,
                   is_training=(mode == tf_estimator.ModeKeys.TRAIN))

  if params['precision'] == 'bfloat16':
    with contrib_tpu.bfloat16_scope():
      logits = build_network()
    logits = tf.cast(logits, tf.float32)
  elif params['precision'] == 'float32':
    logits = build_network()
  else:
    # Fail here with a clear message instead of leaving `logits` unbound and
    # crashing later with an unrelated error.
    raise ValueError(
        "Unsupported precision: {!r} (expected 'bfloat16' or "
        "'float32').".format(params['precision']))

  if mode == tf_estimator.ModeKeys.PREDICT:
    predictions = {
        'classes': tf.argmax(logits, axis=1),
        'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
    }
    return tf_estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        export_outputs={
            'classify': tf_estimator.export.PredictOutput(predictions)
        })

  # If necessary, in the model_fn, use params['batch_size'] instead the batch
  # size flags (--train_batch_size or --eval_batch_size).
  batch_size = params['batch_size']  # pylint: disable=unused-variable

  # Calculate loss, which includes softmax cross entropy and L2 regularization.
  one_hot_labels = tf.one_hot(labels, params['num_label_classes'])
  cross_entropy = tf.losses.softmax_cross_entropy(
      logits=logits,
      onehot_labels=one_hot_labels,
      label_smoothing=params['label_smoothing'])

  # Add weight decay to the loss for non-batch-normalization variables.
  loss = cross_entropy + params['weight_decay'] * tf.add_n([
      tf.nn.l2_loss(v)
      for v in tf.trainable_variables()
      if 'batch_normalization' not in v.name
  ])

  host_call = None
  if mode == tf_estimator.ModeKeys.TRAIN:
    # Compute the current epoch and associated learning rate from global_step.
    global_step = tf.train.get_global_step()
    steps_per_epoch = params['num_train_images'] / params['train_batch_size']
    current_epoch = (tf.cast(global_step, tf.float32) / steps_per_epoch)
    # LARS is a large batch optimizer. LARS enables higher accuracy at batch 16K
    # and larger batch sizes.
    if params['enable_lars']:
      learning_rate = 0.0
      optimizer = lars_util.init_lars_optimizer(current_epoch, params)
      raise ValueError(
          'LARS unexpected in the context of IGT experiments.')
    else:
      learning_rate = linear_learning_rate_schedule(params, global_step)

      if FLAGS.optimizer == 'momentum':
        tf.logging.info('Using MomentumOptimizer ({}).'.format(
            params['momentum']))
        optimizer = tf.train.MomentumOptimizer(
            learning_rate=learning_rate,
            momentum=params['momentum'],
            use_nesterov=False)
      elif FLAGS.optimizer == 'adam':
        tf.logging.info('Using AdamOptimizer')
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
      elif FLAGS.optimizer == 'eigt':
        tf.logging.info('Using ExpIgtOptimizer {} tail: {}'.format(
            FLAGS.igt_optimizer, FLAGS.tail_fraction))
        optimizer = exp_igt_optimizer.ExpIgtOptimizer(
            learning_rate,
            tail_fraction=FLAGS.tail_fraction,
            optimizer=FLAGS.igt_optimizer)
      else:
        raise ValueError('{} is not a supported optimizer'.format(
            FLAGS.optimizer))

    if params['use_tpu']:
      # When using TPU, wrap the optimizer with CrossShardOptimizer which
      # handles synchronization details between different TPU cores. To the
      # user, this should look like regular synchronous training.
      optimizer = contrib_tpu.CrossShardOptimizer(optimizer)

    # Batch normalization requires UPDATE_OPS to be added as a dependency to
    # the train operation.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
      train_op = optimizer.minimize(loss, global_step)

    if not params['skip_host_call']:

      def host_call_fn(gs, loss, lr, ce):
        """Training host call. Creates scalar summaries for training metrics.

        This function is executed on the CPU and should not directly reference
        any Tensors in the rest of the `model_fn`. To pass Tensors from the
        model to the `metric_fn`, provide as part of the `host_call`. See
        https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
        for more information.

        Arguments should match the list of `Tensor` objects passed as the second
        element in the tuple passed to `host_call`.

        Args:
          gs: `Tensor` with shape `[batch]` for the global_step
          loss: `Tensor` with shape `[batch]` for the training loss.
          lr: `Tensor` with shape `[batch]` for the learning_rate.
          ce: `Tensor` with shape `[batch]` for the current_epoch.

        Returns:
          List of summary ops to run on the CPU host.
        """
        gs = gs[0]
        # Host call fns are executed params['iterations_per_loop'] times after
        # one TPU loop is finished, setting max_queue value to the same as
        # number of iterations will make the summary writer only flush the data
        # to storage once per loop.
        with summary.create_file_writer(
            get_model_dir(params),
            max_queue=params['iterations_per_loop']).as_default():
          with summary.always_record_summaries():
            summary.scalar('loss', loss[0], step=gs)
            summary.scalar('learning_rate', lr[0], step=gs)
            summary.scalar('current_epoch', ce[0], step=gs)

            return summary.all_summary_ops()

      # To log the loss, current learning rate, and epoch for Tensorboard, the
      # summary op needs to be run on the host CPU via host_call. host_call
      # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
      # dimension. These Tensors are implicitly concatenated to
      # [params['batch_size']].
      gs_t = tf.reshape(global_step, [1])
      loss_t = tf.reshape(loss, [1])
      lr_t = tf.reshape(learning_rate, [1])
      ce_t = tf.reshape(current_epoch, [1])

      host_call = (host_call_fn, [gs_t, loss_t, lr_t, ce_t])

  else:
    train_op = None

  eval_metrics = None
  scaffold_fn = None
  if mode == tf_estimator.ModeKeys.EVAL:

    def metric_fn(labels, logits):
      """Evaluation metric function. Evaluates accuracy.

      This function is executed on the CPU and should not directly reference
      any Tensors in the rest of the `model_fn`. To pass Tensors from the model
      to the `metric_fn`, provide as part of the `eval_metrics`. See
      https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
      for more information.

      Arguments should match the list of `Tensor` objects passed as the second
      element in the tuple passed to `eval_metrics`.

      Args:
        labels: `Tensor` with shape `[batch]`.
        logits: `Tensor` with shape `[batch, num_classes]`.

      Returns:
        A dict of the metrics to return from evaluation.
      """
      predictions = tf.argmax(logits, axis=1)
      top_1_accuracy = tf.metrics.accuracy(labels, predictions)
      in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
      top_5_accuracy = tf.metrics.mean(in_top_5)

      return {
          'top_1_accuracy': top_1_accuracy,
          'top_5_accuracy': top_5_accuracy,
      }

    eval_metrics = (metric_fn, [labels, logits])

    if FLAGS.mode == 'eval_igt' and FLAGS.igt_eval_mode == 'true':
      tf.logging.info('Using true param loading saver.')

      def scaffold_fn_true_params():
        """Returns a scaffold that loads the true values into vars."""
        var_mapping = {}
        trainable_vars = set(tf.trainable_variables())
        for var in tf.global_variables():
          # Trainable variables are restored from their '/true_param'
          # checkpoint entries; everything else keeps its own name.
          if var in trainable_vars:
            var_mapping[var.op.name + '/true_param'] = var
          else:
            var_mapping[var.op.name] = var

        tf.logging.info('Mapping: {}'.format(var_mapping))
        saver = tf.train.Saver(var_list=var_mapping, sharded=True)
        return tf.train.Scaffold(saver=saver)

      scaffold_fn = scaffold_fn_true_params

  return contrib_tpu.TPUEstimatorSpec(
      mode=mode,
      loss=loss,
      train_op=train_op,
      host_call=host_call,
      eval_metrics=eval_metrics,
      scaffold_fn=scaffold_fn)
def _build_network(self, layers):
    """Stacks the configured layers on top of the normalized input tensor.

    The input is transposed with perm [0, 2, 3, 1] and divided by the slice
    `network[:, :, -1, 0]` (last window step, first feature) broadcast over
    the last two axes. Each entry of `layers` is then applied in order.

    Args:
        layers: iterable of dicts. Each dict has a "type" key selecting the
            layer kind, plus the kind-specific keys read below (for example
            "neuron_number", "filter_number", "regularizer",
            "weight_decay").

    Returns:
        The tensor produced by the last layer.

    Raises:
        ValueError: if a layer's "type" is not one of the supported kinds.
    """
    network = tf.transpose(self.input_tensor, [0, 2, 3, 1])
    # [batch, assets, window, features]
    network = network / network[:, :, -1, 0, None, None]

    for layer_number, layer in enumerate(layers):
        kind = layer["type"]

        if kind == "DenseLayer":
            network = tflearn.layers.core.fully_connected(
                network,
                int(layer["neuron_number"]),
                layer["activation_function"],
                regularizer=layer["regularizer"],
                weight_decay=layer["weight_decay"])
            self.add_layer_to_dict(kind, network)

        elif kind == "DropOut":
            network = tflearn.layers.core.dropout(
                network, layer["keep_probability"])

        elif kind == "EIIE_Dense":
            # Convolve across the whole axis-2 extent with a "valid" kernel.
            width = network.get_shape()[2]
            network = tflearn.layers.conv_2d(
                network,
                int(layer["filter_number"]),
                [1, width],
                [1, 1],
                "valid",
                layer["activation_function"],
                regularizer=layer["regularizer"],
                weight_decay=layer["weight_decay"])
            self.add_layer_to_dict(kind, network)

        elif kind == "ConvLayer":
            network = tflearn.layers.conv_2d(
                network,
                int(layer["filter_number"]),
                allint(layer["filter_shape"]),
                allint(layer["strides"]),
                layer["padding"],
                layer["activation_function"],
                regularizer=layer["regularizer"],
                weight_decay=layer["weight_decay"])
            self.add_layer_to_dict(kind, network)

        elif kind == "MaxPooling":
            network = tflearn.layers.conv.max_pool_2d(
                network, layer["strides"])

        elif kind == "AveragePooling":
            network = tflearn.layers.conv.avg_pool_2d(
                network, layer["strides"])

        elif kind == "LocalResponseNormalization":
            network = tflearn.layers.normalization.local_response_normalization(
                network)

        elif kind == "EIIE_Output":
            width = network.get_shape()[2]
            network = tflearn.layers.conv_2d(
                network, 1, [1, width],
                padding="valid",
                regularizer=layer["regularizer"],
                weight_decay=layer["weight_decay"])
            self.add_layer_to_dict(kind, network)
            network = network[:, :, 0, 0]
            # Constant column of ones prepended before the softmax.
            btc_bias = tf.ones((self.input_num, 1))
            self.add_layer_to_dict(kind, network)
            network = tf.concat([btc_bias, network], 1)
            network = tflearn.layers.core.activation(
                network, activation="softmax")
            self.add_layer_to_dict(kind, network, weights=False)

        elif kind == "Output_WithW":
            network = tflearn.flatten(network)
            network = tf.concat([network, self.previous_w], axis=1)
            network = tflearn.fully_connected(
                network,
                self._rows + 1,
                activation="softmax",
                regularizer=layer["regularizer"],
                weight_decay=layer["weight_decay"])

        elif kind == "EIIE_Output_WithW":
            width = network.get_shape()[2]
            height = network.get_shape()[1]
            features = network.get_shape()[3]
            network = tf.reshape(
                network,
                [self.input_num, int(height), 1, int(width * features)])
            # Append the previous weights as one extra channel.
            w = tf.reshape(self.previous_w, [-1, int(height), 1, 1])
            network = tf.concat([network, w], axis=3)
            network = tflearn.layers.conv_2d(
                network, 1, [1, 1],
                padding="valid",
                regularizer=layer["regularizer"],
                weight_decay=layer["weight_decay"])
            self.add_layer_to_dict(kind, network)
            network = network[:, :, 0, 0]
            # A learned scalar bias (initialized to zero), tiled over the
            # batch, takes the place of a constant column here.
            btc_bias = tf.get_variable(
                "btc_bias", [1, 1],
                dtype=tf.float32,
                initializer=tf.zeros_initializer)
            btc_bias = tf.tile(btc_bias, [self.input_num, 1])
            network = tf.concat([btc_bias, network], 1)
            self.voting = network
            self.add_layer_to_dict('voting', network, weights=False)
            network = tflearn.layers.core.activation(
                network, activation="softmax")
            self.add_layer_to_dict('softmax_layer', network, weights=False)

        elif kind in ("EIIE_LSTM", "EIIE_RNN"):
            network = tf.transpose(network, [0, 2, 3, 1])
            if kind == "EIIE_LSTM":
                recurrent_layer = tflearn.layers.lstm
                scope_name = "lstm" + str(layer_number)
            else:
                recurrent_layer = tflearn.layers.simple_rnn
                scope_name = "rnn" + str(layer_number)

            # One recurrent pass per slice of the last axis; every pass
            # after the first reuses the variables in the same scope.
            per_row_outputs = []
            for i in range(self._rows):
                per_row_outputs.append(
                    recurrent_layer(
                        network[:, :, :, i],
                        int(layer["neuron_number"]),
                        dropout=layer["dropouts"],
                        scope=scope_name,
                        reuse=(i > 0)))
            network = tf.stack(per_row_outputs)
            network = tf.transpose(network, [1, 0, 2])
            network = tf.reshape(
                network, [-1, self._rows, 1, int(layer["neuron_number"])])

        else:
            raise ValueError("the layer {} not supported.".format(kind))

    return network
def _scan_step_fn(state, example, packed_length, queue_size, spacing,
                  num_sequences, token_dtype):  # pylint: disable=g-doc-args
  """Transform function used by tf.data.experimental.scan to process an example.

  This is written as a stateless function rather than a class method because we
  trace it with AutoGraph (in order to simplify the conditional), and this way
  we don't have to worry about handling re-tracing semantics.

  If at least one queue slot has enough space left for every sequence of the
  example, the example is added to a fitting slot and a dummy output is
  produced. Otherwise the contents of the slot at `top_index` are emitted, that
  slot is reset, the example is placed into it, and `top_index` advances.

  Args:
    See the SequenceDatasetPacker class. In addition, as used here: `state` is
    the tuple (availability, contents, top_index), and `example` is an iterable
    of tensors whose shapes are concatenated to obtain the sequence lengths.

  Returns:
    The updated queue state, and either a packed example or a dummy sequence
    which will be filtered out downstream. Concretely the second element is the
    pair (tf.logical_not(any_can_fit), output_contents): the flag is True only
    when a slot was emitted, and output_contents is a (1, num_sequences) tensor
    of -1 when it is False.
  """

  # Convert TensorArray tuples to lists since we'll need to replace them.
  availability, contents, top_index = state

  # One length per sequence of the example.
  lengths = tf.concat([tf.shape(i) for i in example], axis=0)

  # A slot can take the example only if every sequence fits (reduce over
  # axis 1 of the stacked availability).
  start_availability = availability.stack()
  can_fit = tf.reduce_all(tf.greater_equal(start_availability, lengths), axis=1)
  any_can_fit = tf.reduce_any(can_fit, axis=0)

  # AutoGraph will convert this block to a tf.cond
  if any_can_fit:
    # This indicates where in the FFD queue rotation a given index sits
    shifted_range = (
        tf.range(queue_size, dtype=INDEX_DTYPE) - top_index) % queue_size

    # Mark any indices which cannot accommodate the current example.
    exclusion_mask = tf.cast(tf.logical_not(can_fit), INDEX_DTYPE) * queue_size

    # Index in [0, queue_size) in which to place the sample. Note, this index
    # is the position in the actual TensorArray, not the index of the FFD queue.
    queue_index = (tf.reduce_min(shifted_range + exclusion_mask) +
                   top_index) % queue_size

    # NOTE(taylorrobie): We emit a non-empty Tensor for downstream checks.
    output_contents = -tf.ones((1, num_sequences), dtype=token_dtype)

  else:
    # No slot fits: emit the rows belonging to the slot at top_index.
    index_range = top_index * packed_length + tf.range(packed_length)
    output_contents = contents.gather(index_range)

    # Reset the queue state.
    availability = availability.write(
        top_index, packed_length * tf.ones((num_sequences,), dtype=INDEX_DTYPE))
    empty_contents = tf.zeros((packed_length, num_sequences * 2),
                              dtype=token_dtype)
    contents = contents.scatter(index_range, empty_contents)

    # The freshly emptied slot receives the current example.
    queue_index = top_index
    top_index = (top_index + 1) % queue_size

  # Reserve space for the example plus `spacing` in the chosen slot.
  pre_assign_availability = availability.read(queue_index)
  space_left = pre_assign_availability - lengths - spacing
  availability = availability.write(queue_index, space_left)

  # ============================================================================
  # == Update contents =========================================================
  # ============================================================================
  # Consider the following case for a seq-to-seq packing:
  #   (padding is represented as underscores)
  #
  #   Queue starting state:
  #     [1, 3, 2, 4, 6, 1, _, _, _, _, _, ...]
  #     [5, 9, _, _, _, _, _, _, _, _, _, ...]
  #
  #   Examples:
  #     [4, 2, 4], [3]
  #
  #   Desired new queue state:
  #     [1, 3, 2, 4, 6, 1, _, _, 4, 2, 4, _, _, ...]
  #     [5, 9, _, _, 3, _, _, _, _, _, _, _, _, ...]
  #
  # This could be accomplished by creating a TensorArray for each of the two
  # sequences, and scattering into the respective arrays. However TensorArray
  # writes are extremely expensive relative to other operations. So instead we
  # store the contents in a single TensorArray of shape (packed_length, 2), and
  # we pad and concatenate the examples such that they can be added in a single
  # assign:
  #
  #              [_, _, _, _, 4, 2, 4]
  #              [3, _, _, _, _, _, _]
  #                        +
  #  [1, 3, 2, 4, 6, 1, _, _, _, _, _, ...]
  #  [5, 9, _, _, _, _, _, _, _, _, _, ...]
  #
  # And in practice, the extra work of padding is negligible compared to
  # the gain from vectorizing the TensorArray assign. We also store a bit mask
  # denoting where sequences start which is used to compute segment and
  # position metadata:
  #
  #              [_, _, _, _, 1, _, _]
  #              [1, _, _, _, _, _, _]
  #                        +
  #  [1, _, _, _, _, _, _, _, _, _, _, ...]
  #  [1, _, _, _, _, _, _, _, _, _, _, ...]
  #
  # Both the contents and the mask are concatenated in the same TensorArray
  # for performance.

  # Per-sequence write window inside the slot, and the smallest row range
  # [leftmost, rightmost) that covers all of the windows.
  start_index = packed_length - pre_assign_availability
  end_index = start_index + lengths
  leftmost = tf.reduce_min(start_index, axis=0)
  rightmost = tf.reduce_max(end_index, axis=0)
  delta = rightmost - leftmost
  # Pad every sequence on both sides so that all of them span `delta` rows.
  pad_indices = [tf.stack((start_index[i] - leftmost, rightmost - end_index[i]))
                 for i in range(num_sequences)]

  padded_examples = [tf.pad(ex, padding[tf.newaxis, :])
                     for ex, padding in zip(example, pad_indices)]
  padded_examples = tf.transpose(tf.stack(padded_examples))
  # One-hot marker at the row where each sequence starts.
  mask_update = tf.one_hot(start_index - leftmost, delta,
                           dtype=contents.dtype, axis=0)

  content_update = tf.concat([padded_examples, mask_update], axis=1)

  index_range = (queue_index * packed_length +  # Offset into the right section.
                 tf.range(delta, dtype=INDEX_DTYPE) + leftmost)
  # Add the update onto the existing rows (read-modify-write via
  # gather + scatter).
  contents = contents.scatter(index_range, contents.gather(index_range) +
                              content_update)

  state = (availability, contents, top_index)
  return state, (tf.logical_not(any_can_fit), output_contents)
def _forward(self, input_indxs, outpt_indxs, scores, weights):
    """Build the graph for the forward pass.

    Creates the word, bias and (when `self._covariate_size > 0`) covariate
    parameters through `self._weight_initializer`, looks up the rows selected
    by the index tensors and sums the bias, word and covariate terms into an
    estimated score. The loss is either the w2v loss (when `self._use_w2v`
    is set) or half the weighted sum of squared differences to `scores`.

    Args:
      input_indxs: int32 or int64 tensor for input labels
      outpt_indxs: int32 or int64 tensor for outpt labels
      scores: float32 tensor for co-occurrence score
      weights: float32 tensor for loss weights

    Returns:
      loss: a univariate tensor giving the loss from the batch
    """
    # Initialize input/outpt word (node) parameters
    self._default_scope = tf.get_variable_scope()
    init_width = 0.5 / (self._vector_size + self._covariate_size)
    self._word['input'] = self._weight_initializer(
        'word_input', init_width, self._vocab_size, self._vector_size)
    self._word['outpt'] = self._weight_initializer(
        'word_outpt', init_width, self._vocab_size, self._vector_size)

    # Initialize input/outpt bias parameters
    self._bias['input'] = self._weight_initializer(
        'bias_input', init_width, self._vocab_size, 1)
    self._bias['outpt'] = self._weight_initializer(
        'bias_outpt', init_width, self._vocab_size, 1)

    if self._covariate_size > 0:
        # Initialize input/outpt cvrt transformation parameters
        self._cvrt_transformation['input'] = self._weight_initializer(
            'cvrt_input', init_width, self._covariate_data.shape[1],
            self._covariate_size)
        self._cvrt_transformation['outpt'] = self._weight_initializer(
            'cvrt_outpt', init_width, self._covariate_data.shape[1],
            self._covariate_size)

        # Project the covariate data with the transformation parameters
        self._cvrt['input'] = tf.matmul(self._covariate_data_tensor,
                                        self._cvrt_transformation['input'])
        self._cvrt['outpt'] = tf.matmul(self._covariate_data_tensor,
                                        self._cvrt_transformation['outpt'])

        if self._use_monet:
            # Compute covariate svd
            _, self._u, _ = tf.linalg.svd(
                self._cvrt['input'] + self._cvrt['outpt'])
            # Project base word vecs and get word vecs. stop_gradient keeps
            # the projected vectors out of back-propagation.
            self._projected_word_input = tf.stop_gradient(
                self._word['input'] - self._db_level * tf.matmul(
                    self._u,
                    tf.matmul(tf.transpose(self._u), self._word['input'])))
            self._projected_word_outpt = tf.stop_gradient(
                self._word['outpt'] - self._db_level * tf.matmul(
                    self._u,
                    tf.matmul(tf.transpose(self._u), self._word['outpt'])))

    # Get loss input word vectors
    if self._use_monet:
        self._input_word_vecs = tf.nn.embedding_lookup(
            self._projected_word_input, input_indxs)
        self._outpt_word_vecs = tf.nn.embedding_lookup(
            self._projected_word_outpt, outpt_indxs)
    else:
        self._input_word_vecs = tf.nn.embedding_lookup(
            self._word['input'], input_indxs)
        self._outpt_word_vecs = tf.nn.embedding_lookup(
            self._word['outpt'], outpt_indxs)

    # Get loss input bias vectors
    self._input_bias_vecs = tf.nn.embedding_lookup(self._bias['input'],
                                                   input_indxs)
    self._outpt_bias_vecs = tf.nn.embedding_lookup(self._bias['outpt'],
                                                   outpt_indxs)

    # The word term is built exactly once; it was previously created twice
    # with the same expression, leaving an unused duplicate op in the graph.
    self._word_pred = tf.reduce_sum(
        tf.multiply(self._input_word_vecs, self._outpt_word_vecs), axis=1)
    self._bias_pred = tf.reduce_sum(
        self._input_bias_vecs + self._outpt_bias_vecs, axis=1)
    estimated_score = self._bias_pred
    estimated_score += self._word_pred

    # Add covariate terms
    if self._covariate_size > 0:
        self._input_cvrt_vecs = tf.nn.embedding_lookup(
            self._cvrt['input'], input_indxs)
        self._outpt_cvrt_vecs = tf.nn.embedding_lookup(
            self._cvrt['outpt'], outpt_indxs)
        self._cvrt_pred = tf.reduce_sum(
            tf.multiply(self._input_cvrt_vecs, self._outpt_cvrt_vecs),
            axis=1)
        estimated_score += self._cvrt_pred
    else:
        self._cvrt_pred = tf.constant(0.0)

    self._scores = scores
    self._est_score = estimated_score

    if self._use_w2v:
        loss = self._compute_w2v_loss(input_indxs)
    else:
        diff = estimated_score - scores
        self._diff = diff
        loss = tf.reduce_sum(tf.multiply(weights, tf.square(diff))) / 2
    return loss
def get_reg_loss(tfs):
    """Return `tfs.loss` plus every regularization term that is enabled.

    A term is enabled by the presence of its key in
    `tfs.sys_para.reg_coeffs`; each coefficient is divided by
    `tfs.sys_para.steps` before use. Supported keys: 'amplitude',
    'envelope', 'dwdt', 'd2wdt2', 'bandpass' (together with 'band'),
    'forbidden_coeff_list' (together with 'states_forbidden_list' and the
    optional 'forbid_dressed') and 'speed_up'.

    Args:
      tfs: object exposing `loss`, `ops_weight`, `inter_vecs`, `sys_para`
        and the other attributes read below.

    Returns:
      The regularized loss tensor.

    Raises:
      ValueError: if 'bandpass' is requested while `sys_para.use_gpu` is
        false.
    """
    sys_para = tfs.sys_para
    coeffs = sys_para.reg_coeffs

    # Regularizer
    with tf.name_scope('reg_errors'):
        reg_loss = tfs.loss

        # amplitude
        if 'amplitude' in coeffs:
            amp_reg_alpha = coeffs['amplitude'] / float(sys_para.steps)
            reg_loss = reg_loss + amp_reg_alpha * tf.nn.l2_loss(
                tfs.ops_weight)

        # gaussian envelope
        if 'envelope' in coeffs:
            reg_alpha = coeffs['envelope'] / float(sys_para.steps)
            reg_loss = reg_loss + reg_alpha * tf.nn.l2_loss(
                tf.multiply(tfs.tf_one_minus_gaussian_envelope,
                            tfs.ops_weight))

        # Both derivative penalties work on the control weights padded with
        # two zero columns on each side. The padded tensor used to be built
        # only inside the 'dwdt' branch, so requesting 'd2wdt2' alone raised
        # NameError; build it whenever either term is enabled.
        if 'dwdt' in coeffs or 'd2wdt2' in coeffs:
            zeros_for_training = tf.zeros([sys_para.ops_len, 2])
            new_weights = tf.concat([tfs.ops_weight, zeros_for_training], 1)
            new_weights = tf.concat([zeros_for_training, new_weights], 1)

        # Limiting the dwdt of control pulse
        if 'dwdt' in coeffs:
            dwdt_reg_alpha = coeffs['dwdt'] / float(sys_para.steps)
            reg_loss = reg_loss + dwdt_reg_alpha * tf.nn.l2_loss(
                (new_weights[:, 1:]
                 - new_weights[:, :sys_para.steps + 3]) / sys_para.dt)

        # Limiting the d2wdt2 of control pulse
        if 'd2wdt2' in coeffs:
            d2wdt2_reg_alpha = coeffs['d2wdt2'] / float(sys_para.steps)
            reg_loss = reg_loss + d2wdt2_reg_alpha * tf.nn.l2_loss(
                (new_weights[:, 2:]
                 - 2 * new_weights[:, 1:sys_para.steps + 3]
                 + new_weights[:, :sys_para.steps + 2])
                / (sys_para.dt ** 2))

        # bandpass filter on the control
        if 'bandpass' in coeffs:
            # currently does not support bandpass reg for CPU
            # (no CPU kernel for FFT)
            if not sys_para.use_gpu:
                raise ValueError(
                    'currently does not support bandpass reg for CPU (no CPU kernel for FFT)'
                )
            bandpass_reg_alpha = coeffs['bandpass'] / float(sys_para.steps)
            tf_u = tf.cast(tfs.ops_weight, dtype=tf.complex64)
            # NOTE(review): tf.complex_abs / tf.fft look like legacy
            # TensorFlow names -- confirm the pinned version provides them.
            tf_fft = tf.complex_abs(tf.fft(tf_u))
            band = np.array(coeffs['band'])
            band_id = (band * sys_para.total_time).astype(int)
            half_id = int(sys_para.steps / 2)
            # Penalize spectral magnitude outside [band_id[0], band_id[1])
            # within the first half of the spectrum.
            fft_loss = bandpass_reg_alpha * (
                tf.reduce_sum(tf_fft[:, 0:band_id[0]])
                + tf.reduce_sum(tf_fft[:, band_id[1]:half_id]))
            reg_loss = reg_loss + fft_loss

        # Limiting the access to forbidden states
        if 'forbidden_coeff_list' in coeffs:
            if sys_para.is_dressed:
                v_sorted = tf.constant(
                    c_to_r_mat(
                        np.reshape(
                            sort_ev(sys_para.v_c, sys_para.dressed_id),
                            [len(sys_para.dressed_id),
                             len(sys_para.dressed_id)])),
                    dtype=tf.float32)

            for inter_vec in tfs.inter_vecs:
                if sys_para.is_dressed and (
                        'forbid_dressed' in coeffs
                        and coeffs['forbid_dressed']):
                    inter_vec = tf.matmul(tf.transpose(v_sorted), inter_vec)
                for inter_reg_alpha_coeff, state in zip(
                        coeffs['forbidden_coeff_list'],
                        coeffs['states_forbidden_list']):
                    inter_reg_alpha = inter_reg_alpha_coeff / float(
                        sys_para.steps)
                    # Sum of squares of rows `state` and
                    # `state_num + state` of the vector.
                    forbidden_state_pop = (
                        tf.square(inter_vec[state, :])
                        + tf.square(
                            inter_vec[sys_para.state_num + state, :]))
                    reg_loss = reg_loss + inter_reg_alpha * tf.nn.l2_loss(
                        forbidden_state_pop)

        # Speeding up the gate time
        if 'speed_up' in coeffs:
            speed_up_reg_alpha = coeffs['speed_up'] / float(sys_para.steps)
            # Repeat the target vectors along a new middle axis, once per
            # time step (steps + 1 copies).
            target_vecs_all_timestep = tf.tile(
                tf.reshape(
                    tfs.target_vecs,
                    [2 * sys_para.state_num, 1, len(tfs.inter_vecs)]),
                [1, sys_para.steps + 1, 1])
            target_vecs_inner_product = tfs.get_inner_product_3D(
                tfs.inter_vecs_packed, target_vecs_all_timestep)
            reg_loss = reg_loss + speed_up_reg_alpha * tf.nn.l2_loss(
                sys_para.steps + 1 - target_vecs_inner_product)

        return reg_loss
def compute_cost(Z3, Y):
    """Return the mean softmax cross-entropy between `Z3` and `Y`.

    Both arguments are transposed before being passed to
    `tf.nn.softmax_cross_entropy_with_logits` (`Z3` as logits, `Y` as
    labels); the resulting losses are averaged into a single tensor.
    """
    losses = tf.nn.softmax_cross_entropy_with_logits(
        logits=tf.transpose(Z3), labels=tf.transpose(Y))
    return tf.reduce_mean(losses)
def model_fn(features, labels, mode, params):
    """The model_fn to be used with TPUEstimator.

    Builds the network selected by `FLAGS.model_name`, the softmax
    cross-entropy loss with L2 weight decay and, depending on `mode`, the
    training op, the evaluation metrics or the prediction outputs.

    Args:
      features: `Tensor` of batched images. A dict is also accepted, in which
        case its 'feature' entry is used.
      labels: `Tensor` of one hot labels for the data samples
      mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`
      params: `dict` of parameters passed to the model from the TPUEstimator,
        `params['batch_size']` is always provided and should be used as the
        effective batch size. 'use_bfloat16' and 'steps_per_epoch' are also
        read.

    Returns:
      A `TPUEstimatorSpec` for the model. In PREDICT mode a plain
      `tf.estimator.EstimatorSpec` is returned instead.
    """
    if isinstance(features, dict):
        features = features['feature']

    # In most cases, the default data format NCHW instead of NHWC should be
    # used for a significant performance boost on GPU. NHWC should be used
    # only if the network needs to be run on CPU since the pooling operations
    # are only supported on NHWC. TPU uses XLA compiler to figure out best
    # layout.
    if FLAGS.data_format == 'channels_first':
        assert not FLAGS.transpose_input    # channels_first only for GPU
        features = tf.transpose(features, [0, 3, 1, 2])
        # Shape used to broadcast the per-channel mean/stddev constants.
        stats_shape = [3, 1, 1]
    else:
        stats_shape = [1, 1, 3]

    # Fall back to the model's own input size when the flag is unset.
    input_image_size = FLAGS.input_image_size
    if not input_image_size:
        input_image_size = model_builder_factory.get_model_input_size(
            FLAGS.model_name)

    if FLAGS.transpose_input and mode != tf.estimator.ModeKeys.PREDICT:
        features = tf.reshape(features,
                              [input_image_size, input_image_size, 3, -1])
        features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    has_moving_average_decay = (FLAGS.moving_average_decay > 0)
    # This is essential, if using a keras-derived model.
    tf.keras.backend.set_learning_phase(is_training)
    logging.info('Using open-source implementation.')

    # Collect only the overrides that were actually provided through flags.
    override_params = {}
    if FLAGS.batch_norm_momentum is not None:
        override_params['batch_norm_momentum'] = FLAGS.batch_norm_momentum
    if FLAGS.batch_norm_epsilon is not None:
        override_params['batch_norm_epsilon'] = FLAGS.batch_norm_epsilon
    if FLAGS.dropout_rate is not None:
        override_params['dropout_rate'] = FLAGS.dropout_rate
    if FLAGS.survival_prob is not None:
        override_params['survival_prob'] = FLAGS.survival_prob
    if FLAGS.data_format:
        override_params['data_format'] = FLAGS.data_format
    if FLAGS.num_label_classes:
        override_params['num_classes'] = FLAGS.num_label_classes
    if FLAGS.depth_coefficient:
        override_params['depth_coefficient'] = FLAGS.depth_coefficient
    if FLAGS.width_coefficient:
        override_params['width_coefficient'] = FLAGS.width_coefficient

    def normalize_features(features, mean_rgb, stddev_rgb):
        """Normalize the image given the means and stddevs."""
        features -= tf.constant(mean_rgb, shape=stats_shape, dtype=features.dtype)
        features /= tf.constant(stddev_rgb, shape=stats_shape, dtype=features.dtype)
        return features

    def build_model():
        """Build model using the model_name given through the command line."""
        model_builder = model_builder_factory.get_model_builder(
            FLAGS.model_name)
        normalized_features = normalize_features(features,
                                                 model_builder.MEAN_RGB,
                                                 model_builder.STDDEV_RGB)
        # The second value returned by build_model is not used here.
        logits, _ = model_builder.build_model(normalized_features,
                                              model_name=FLAGS.model_name,
                                              training=is_training,
                                              override_params=override_params,
                                              model_dir=FLAGS.model_dir)
        return logits

    if params['use_bfloat16']:
        # Build under the bfloat16 scope, then cast the logits to float32.
        with tf.tpu.bfloat16_scope():
            logits = tf.cast(build_model(), tf.float32)
    else:
        logits = build_model()

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'classes': tf.argmax(logits, axis=1),
            'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
        }
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classify': tf.estimator.export.PredictOutput(predictions)
            })

    # If necessary, in the model_fn, use params['batch_size'] instead the batch
    # size flags (--train_batch_size or --eval_batch_size).
    batch_size = params['batch_size']    # pylint: disable=unused-variable

    # Calculate loss, which includes softmax cross entropy and L2
    # regularization.
    cross_entropy = tf.losses.softmax_cross_entropy(
        logits=logits,
        onehot_labels=labels,
        label_smoothing=FLAGS.label_smoothing)

    # Add weight decay to the loss for non-batch-normalization variables.
    loss = cross_entropy + FLAGS.weight_decay * tf.add_n([
        tf.nn.l2_loss(v)
        for v in tf.trainable_variables()
        if 'batch_normalization' not in v.name
    ])

    global_step = tf.train.get_global_step()
    if has_moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(
            decay=FLAGS.moving_average_decay, num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    host_call = None
    restore_vars_dict = None
    if is_training:
        # Compute the current epoch and associated learning rate from
        # global_step.
        current_epoch = (tf.cast(global_step, tf.float32) /
                         params['steps_per_epoch'])

        # The base learning rate is scaled linearly with the batch size,
        # relative to a batch of 256.
        scaled_lr = FLAGS.base_learning_rate * (FLAGS.train_batch_size / 256.0)
        logging.info('base_learning_rate = %f', FLAGS.base_learning_rate)
        learning_rate = utils.build_learning_rate(
            scaled_lr,
            global_step,
            params['steps_per_epoch'],
            decay_epochs=FLAGS.lr_decay_epoch,
            warmup_epochs=FLAGS.lr_warmup_epochs,
            decay_factor=FLAGS.lr_decay_factor,
            lr_decay_type=FLAGS.lr_schedule,
            total_steps=FLAGS.train_steps)
        optimizer = utils.build_optimizer(
            learning_rate,
            optimizer_name=FLAGS.optimizer,
            lars_weight_decay=FLAGS.lars_weight_decay,
            lars_epsilon=FLAGS.lars_epsilon)
        if FLAGS.use_tpu:
            # When using TPU, wrap the optimizer with CrossShardOptimizer which
            # handles synchronization details between different TPU cores. To
            # the user, this should look like regular synchronous training.
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)

        # Batch normalization requires UPDATE_OPS to be added as a dependency
        # to the train operation.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step)

        if has_moving_average_decay:
            # The returned train_op becomes the EMA update, which only runs
            # after the optimizer step.
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

        if not FLAGS.skip_host_call:
            def host_call_fn(gs, lr, ce):
                """Training host call. Creates scalar summaries for training metrics.

                This function is executed on the CPU and should not directly
                reference any Tensors in the rest of the `model_fn`. To pass
                Tensors from the model to the `metric_fn`, provide as part of
                the `host_call`. See
                https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
                for more information.

                Arguments should match the list of `Tensor` objects passed as
                the second element in the tuple passed to `host_call`.

                Args:
                  gs: `Tensor` with shape `[batch]` for the global_step
                  lr: `Tensor` with shape `[batch]` for the learning_rate.
                  ce: `Tensor` with shape `[batch]` for the current_epoch.

                Returns:
                  List of summary ops to run on the CPU host.
                """
                gs = gs[0]
                # Host call fns are executed FLAGS.iterations_per_loop times
                # after one TPU loop is finished, setting max_queue value to
                # the same as number of iterations will make the summary
                # writer only flush the data to storage once per loop.
                with tf2.summary.create_file_writer(
                        FLAGS.model_dir,
                        max_queue=FLAGS.iterations_per_loop).as_default():
                    with tf2.summary.record_if(True):
                        tf2.summary.scalar('learning_rate', lr[0], step=gs)
                        tf2.summary.scalar('current_epoch', ce[0], step=gs)
                        return tf.summary.all_v2_summary_ops()

            # To log the loss, current learning rate, and epoch for
            # Tensorboard, the summary op needs to be run on the host CPU via
            # host_call. host_call expects [batch_size, ...] Tensors, thus
            # reshape to introduce a batch dimension. These Tensors are
            # implicitly concatenated to [params['batch_size']].
            gs_t = tf.reshape(global_step, [1])
            lr_t = tf.reshape(learning_rate, [1])
            ce_t = tf.reshape(current_epoch, [1])

            host_call = (host_call_fn, [gs_t, lr_t, ce_t])
    else:
        train_op = None
        if has_moving_average_decay:
            # Load moving average variables for eval.
            restore_vars_dict = ema.variables_to_restore(ema_vars)

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:
        def metric_fn(labels, logits):
            """Evaluation metric function. Evaluates accuracy.

            This function is executed on the CPU and should not directly
            reference any Tensors in the rest of the `model_fn`. To pass
            Tensors from the model to the `metric_fn`, provide as part of the
            `eval_metrics`. See
            https://www.tensorflow.org/api_docs/python/tf/estimator/tpu/TPUEstimatorSpec
            for more information.

            Arguments should match the list of `Tensor` objects passed as the
            second element in the tuple passed to `eval_metrics`.

            Args:
              labels: `Tensor` with shape `[batch, num_classes]`.
              logits: `Tensor` with shape `[batch, num_classes]`.

            Returns:
              A dict of the metrics to return from evaluation.
            """
            # One-hot labels are reduced to class ids before comparison.
            labels = tf.argmax(labels, axis=1)
            predictions = tf.argmax(logits, axis=1)
            top_1_accuracy = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            top_5_accuracy = tf.metrics.mean(in_top_5)

            return {
                'top_1_accuracy': top_1_accuracy,
                'top_5_accuracy': top_5_accuracy,
            }

        eval_metrics = (metric_fn, [labels, logits])

    # Logged for information only; the value is not used afterwards.
    num_params = np.sum([np.prod(v.shape) for v in tf.trainable_variables()])
    logging.info('number of trainable parameters: %d', num_params)

    def _scaffold_fn():
        # Saver built from restore_vars_dict, i.e. the mapping produced by
        # ema.variables_to_restore when moving averages are enabled.
        saver = tf.train.Saver(restore_vars_dict)
        return tf.train.Scaffold(saver=saver)

    if has_moving_average_decay and not is_training:
        # Only apply scaffold for eval jobs.
        scaffold_fn = _scaffold_fn
    else:
        scaffold_fn = None

    return tf.estimator.tpu.TPUEstimatorSpec(mode=mode,
                                             loss=loss,
                                             train_op=train_op,
                                             host_call=host_call,
                                             eval_metrics=eval_metrics,
                                             scaffold_fn=scaffold_fn)
# Plot errors with respect to number of epochs plt.plot(errors) plt.ylabel('Root Mean Squared Error') plt.xlabel('Number of Epochs') plt.savefig('pics/result.png') # We can now predict movies that an arbitrarily selected user might like by feeding in the user's watched # movie preferences into the RBM and then reconstructing the input # Selecting the input user inputUser = [trX[850]] # Feed in the user and reconstructing the input hh0 = tf.nn.sigmoid(tf.matmul(v0, W) + hb) vv1 = tf.nn.sigmoid(tf.matmul(hh0, tf.transpose(W)) + vb) feed = sess.run(hh0, feed_dict={v0: inputUser, W: prv_w, hb: prv_hb}) rec = sess.run(vv1, feed_dict={hh0: feed, W: prv_w, vb: prv_vb}) # We can then list the 25 most recommended movies for our mock user by sorting it by their scores given by our model scored_movies_df_850 = movies_df scored_movies_df_850["Recommendation Score"] = rec[0] print("\n") print(scored_movies_df_850.sort_values(["Recommendation Score"], ascending=False).head(25)) # Now we recommend some movies that the user has not yet watched print("\n") print(merged_df.iloc[850]) # Now we can find all the movies that our mock user has watched before movies_df_850 = merged_df[merged_df['UserID'] == 2562]
def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
    """Performs multi-headed attention from `from_tensor` to `to_tensor`.

    Args:
      from_tensor: float Tensor of shape [batch_size, from_seq_length,
        from_width].
      to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
      attention_mask: (optional) int32 Tensor of shape [batch_size,
        from_seq_length, to_seq_length]. The values should be 1 or 0. The
        attention scores will effectively be set to -infinity for any
        positions in the mask that are 0, and will be unchanged for positions
        that are 1.
        NOTE(review): the mask is reshaped to [batch_size, 1, to_seq_length,
        1] below, which only succeeds when it holds
        batch_size * to_seq_length elements -- confirm the expected shape.
      num_attention_heads: int. Number of attention heads.
      query_act: (optional) Activation function for the query transform.
      key_act: (optional) Activation function for the key transform.
      value_act: (optional) Activation function for the value transform.
      attention_probs_dropout_prob: (optional) float. Dropout probability of
        the attention probabilities.
      initializer_range: float. Range of the weight initializer.
      batch_size: (Optional) int. If the input is 2D, this might be the batch
        size of the 3D version of the `from_tensor` and `to_tensor`.
      from_seq_length: (Optional) If the input is 2D, this might be the seq
        length of the 3D version of the `from_tensor`.
      to_seq_length: (Optional) If the input is 2D, this might be the seq
        length of the 3D version of the `to_tensor`.

    Returns:
      float Tensor of shape [batch_size, from_seq_length,
        num_attention_heads, size_per_head].

    Raises:
      ValueError: Any of the arguments or tensor shapes are invalid.
    """
    from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
    to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

    if len(from_shape) != len(to_shape):
        raise ValueError(
            "The rank of `from_tensor` must match the rank of `to_tensor`.")

    # The width is the last dimension for both accepted ranks. Indexing
    # position 2 directly raised IndexError for rank-2 inputs before the
    # argument checks below could run.
    size_per_head = int(from_shape[-1] / num_attention_heads)

    if len(from_shape) == 3:
        batch_size = from_shape[0]
        from_seq_length = from_shape[1]
        to_seq_length = to_shape[1]
    elif len(from_shape) == 2:
        if (batch_size is None or from_seq_length is None or
                to_seq_length is None):
            raise ValueError(
                "When passing in rank 2 tensors to attention_layer, the values "
                "for `batch_size`, `from_seq_length`, and `to_seq_length` "
                "must all be specified.")

    # Scalar dimensions referenced here:
    #   B = batch size (number of sequences)
    #   F = `from_tensor` sequence length
    #   T = `to_tensor` sequence length
    #   N = `num_attention_heads`
    #   H = `size_per_head`

    # `query_layer` = [B, F, N, H]
    q = dense_layer_3d(from_tensor, num_attention_heads, size_per_head,
                       create_initializer(initializer_range), query_act,
                       "query")

    # `key_layer` = [B, T, N, H]
    k = dense_layer_3d(to_tensor, num_attention_heads, size_per_head,
                       create_initializer(initializer_range), key_act, "key")

    # `value_layer` = [B, T, N, H]
    v = dense_layer_3d(to_tensor, num_attention_heads, size_per_head,
                       create_initializer(initializer_range), value_act,
                       "value")

    # Swap the sequence and head axes: [B, N, F or T, H].
    q = tf.transpose(q, [0, 2, 1, 3])
    k = tf.transpose(k, [0, 2, 1, 3])
    v = tf.transpose(v, [0, 2, 1, 3])

    if attention_mask is not None:
        attention_mask = tf.reshape(
            attention_mask, [batch_size, 1, to_seq_length, 1])

    # `new_embeddings` = [B, N, F, H]
    new_embeddings = dot_product_attention(q, k, v, attention_mask,
                                           attention_probs_dropout_prob)

    # Back to [B, F, N, H].
    return tf.transpose(new_embeddings, [0, 2, 1, 3])
def _model_output(inputs, data_format): """Maybe convert from channels_first (NCHW) back to channels_last (NHWC).""" if data_format == 'channels_first': return tf.transpose(a=inputs, perm=[0, 2, 3, 1]) else: return inputs
def output(self):
    """Build the logits, top-k indices, losses, metrics and summaries.

    Item logits are the product of `self.predict_behavior_emb` with the
    transposed item lookup table, plus the scattered reconsume scores
    weighted by `self.predict_is_reconsume`. Sets `self.item_result`,
    `self.indices{1,5,10,30,50}`, `self.loss_origin`, `self.loss_reconsume`,
    `self.precision`, `self.recall` and `self.loss`, registers scalar
    summaries and finally calls `self.cal_gradient` on the trainable
    variables.
    """
    with tf.name_scope('CrossEntropyLoss'):
        regularized = (
            self.item_list_emb,
            self.category_list_emb,
            self.position_list_emb,
            self.user_embedding,
            self.reconsume_lst_embedding,
        )
        l2_norm = tf.add_n([tf.nn.l2_loss(emb) for emb in regularized])
        regulation_rate = self.FLAGS.regulation_rate

        item_lookup_table_T = tf.transpose(
            self.embedding.item_emb_lookup_table)
        # Disabled alternative that used a separate output projection:
        # self.output_w = variable_scope.get_variable("output_w", shape=[self.num_units, self.num_units],
        #                                             dtype=self.predict_behavior_emb.dtype)
        # logits = tf.matmul(self.predict_behavior_emb, self.output_w)
        logits = tf.matmul(self.predict_behavior_emb, item_lookup_table_T)

        # Row index of every (sample, position) pair, flattened to a column.
        batch_rows = tf.reshape(
            tf.range(0, self.now_bacth_data_size, delta=1), [-1, 1])
        batch_rows = tf.tile(batch_rows, [1, self.max_len])
        batch_rows = tf.reshape(batch_rows, [-1, 1])

        # Positions beyond seq_length are redirected to column `item_count`.
        masks = tf.sequence_mask(self.seq_length, maxlen=self.max_len)
        padding_ids = (1 - tf.to_int32(masks)) * self.embedding.item_count
        mask_item_list = tf.where(masks, self.item_list, padding_ids)
        item_cols = tf.reshape(mask_item_list, [-1, 1])

        scatter_indices = tf.concat([batch_rows, item_cols], axis=1)
        reconsume_scores = tf.sparse_to_dense(
            sparse_indices=scatter_indices,
            sparse_values=tf.reshape(self.reconsume_scores, [-1, ]),
            output_shape=(self.now_bacth_data_size,
                          self.embedding.item_count + 3),
            validate_indices=False)

        # TODO re-score
        reconsume_gate = tf.expand_dims(self.predict_is_reconsume, axis=-1)
        logits = logits + reconsume_gate * reconsume_scores
        self.item_result = logits

        # TODO for speed
        for k in (1, 5, 10, 30, 50):
            setattr(self, 'indices%d' % k,
                    tf.nn.top_k(self.item_result, k).indices)

        log_probs = tf.nn.log_softmax(logits)
        label_ids = tf.reshape(self.target[0], [-1])
        one_hot_labels = tf.one_hot(
            label_ids, depth=self.embedding.item_count + 3, dtype=tf.float32)
        self.loss_origin = -tf.reduce_sum(log_probs * one_hot_labels,
                                          axis=[-1])

        # loss reconsume
        reconsume_prob = tf.reshape(self.predict_is_reconsume, [-1, 1])
        reconsume_pair = tf.concat([1 - reconsume_prob, reconsume_prob],
                                   axis=-1)
        reconsume_labels = tf.one_hot(
            tf.to_int32(self.is_reconsume), depth=2, dtype=tf.float32)
        self.loss_reconsume = tf.nn.softmax_cross_entropy_with_logits(
            labels=reconsume_labels, logits=reconsume_pair)
        predicted_flags = tf.argmax(reconsume_pair, axis=-1,
                                    output_type=tf.int32)
        self.precision = tf.metrics.precision(labels=self.is_reconsume,
                                              predictions=predicted_flags)
        self.recall = tf.metrics.recall(labels=self.is_reconsume,
                                        predictions=predicted_flags)

        self.loss = regulation_rate * l2_norm + tf.reduce_mean(self.loss_origin) + \
            tf.reduce_mean(self.loss_reconsume)

        tf.summary.scalar('Training Cross Entropy Loss',
                          tf.reduce_mean(self.loss_origin))
        tf.summary.scalar('Training Reconsume Loss',
                          tf.reduce_mean(self.loss_reconsume))
        tf.summary.scalar('normalized Training Loss', self.loss)
        tf.summary.scalar('l2_norm', l2_norm)
        tf.summary.scalar('Learning_rate', self.learning_rate)
    self.cal_gradient(tf.trainable_variables())
def add_metric_fn_inputs(params,
                         cls_outputs,
                         box_outputs,
                         metric_fn_inputs,
                         max_detection_points=anchors.MAX_DETECTION_POINTS):
    """Selects top-k predictions and adds the selected to metric_fn_inputs.

    The per-level outputs are flattened and concatenated over all levels,
    then either pruned to the `max_detection_points` highest class scores or
    reduced to the best class per anchor. `cls_outputs` and `box_outputs`
    are only read; the results are written into `metric_fn_inputs` under the
    keys 'cls_outputs_all', 'box_outputs_all', 'indices_all' and
    'classes_all'.

    Args:
      params: a parameter dictionary that includes `min_level`, `max_level`,
        `batch_size`, `num_classes` and `data_format`.
      cls_outputs: an OrderDict with keys representing levels and values
        representing logits in [batch_size, height, width, num_anchors].
      box_outputs: an OrderDict with keys representing levels and values
        representing box regression targets in [batch_size, height, width,
        num_anchors * 4].
      metric_fn_inputs: a dictionary that will hold the top-k selections.
      max_detection_points: an integer specifying the maximum detection
        points to keep before NMS. Keep all anchors if
        max_detection_points <= 0.
    """
    num_classes = params['num_classes']
    batch_size = params['batch_size']
    channels_first = params['data_format'] == 'channels_first'
    cls_outputs_all = []
    box_outputs_all = []
    # Concatenates class and box of all levels into one tensor.
    for level in range(params['min_level'], params['max_level'] + 1):
        cls_level = cls_outputs[level]
        box_level = box_outputs[level]
        if channels_first:
            # Transpose into local tensors: writing the result back into the
            # caller's dicts would leave them transposed after the call.
            cls_level = tf.transpose(cls_level, [0, 2, 3, 1])
            box_level = tf.transpose(box_level, [0, 2, 3, 1])

        cls_outputs_all.append(
            tf.reshape(cls_level, [batch_size, -1, num_classes]))
        box_outputs_all.append(tf.reshape(box_level, [batch_size, -1, 4]))
    cls_outputs_all = tf.concat(cls_outputs_all, 1)
    box_outputs_all = tf.concat(box_outputs_all, 1)

    if max_detection_points > 0:
        # Prune anchors and detections to only keep max_detection_points.
        # Due to some issues, top_k is currently slow in graph model.
        cls_outputs_all_reshape = tf.reshape(cls_outputs_all,
                                             [batch_size, -1])
        _, cls_topk_indices = tf.math.top_k(cls_outputs_all_reshape,
                                            k=max_detection_points,
                                            sorted=False)
        # A flat index encodes (anchor, class) as anchor * num_classes + class.
        indices = cls_topk_indices // num_classes
        classes = cls_topk_indices % num_classes
        cls_indices = tf.stack([indices, classes], axis=2)

        cls_outputs_all_after_topk = tf.gather_nd(cls_outputs_all,
                                                  cls_indices,
                                                  batch_dims=1)
        box_outputs_all_after_topk = tf.gather_nd(box_outputs_all,
                                                  tf.expand_dims(indices, 2),
                                                  batch_dims=1)
    else:
        # Keep all anchors, but for each anchor, just keep the max probability
        # for each class.
        cls_outputs_idx = tf.math.argmax(cls_outputs_all, axis=-1)
        num_anchors = cls_outputs_all.shape[1]

        classes = cls_outputs_idx
        # Every batch row gets the full anchor index range.
        indices = tf.reshape(
            tf.tile(tf.range(num_anchors), [batch_size]),
            [-1, num_anchors])
        cls_outputs_all_after_topk = tf.reduce_max(cls_outputs_all, -1)
        box_outputs_all_after_topk = box_outputs_all

    metric_fn_inputs['cls_outputs_all'] = cls_outputs_all_after_topk
    metric_fn_inputs['box_outputs_all'] = box_outputs_all_after_topk
    metric_fn_inputs['indices_all'] = indices
    metric_fn_inputs['classes_all'] = classes