def attention_based_dropout(input_, option):
    def _get_importance_map(attention):
        return tf.sigmoid(attention)

    def _get_drop_mask(attention, drop_thr):
        max_val = tf.reduce_max(attention, axis=[1, 2, 3], keepdims=True)
        thr_val = max_val * drop_thr
        return tf.cast(attention < thr_val, dtype=tf.float32, name='drop_mask')

    def _select_component(importance_map, drop_mask, drop_prob):
        random_tensor = tf.random_uniform([], drop_prob, 1. + drop_prob)
        binary_tensor = tf.cast(tf.floor(random_tensor), dtype=tf.float32)
        return (1. - binary_tensor) * importance_map + binary_tensor * drop_mask

    ctx = get_current_tower_context()
    is_training = ctx.is_training
    drop_prob = 1 - option.adl_keep_prob
    drop_thr = option.adl_threshold

    if is_training:
        attention_map = tf.reduce_mean(input_, axis=1, keepdims=True)
        importance_map = _get_importance_map(attention_map)
        drop_mask = _get_drop_mask(attention_map, drop_thr)
        selected_map = _select_component(importance_map, drop_mask, drop_prob)
        output = input_ * selected_map
        return output
    else:
        return input_
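# A minimal usage sketch for attention_based_dropout (hedged: `option` is a
# hypothetical namespace object; only the `adl_keep_prob` / `adl_threshold`
# fields read by the function are assumed, and an active tensorpack tower
# context is required for get_current_tower_context()).
import types
import tensorflow as tf

option = types.SimpleNamespace(adl_keep_prob=0.75, adl_threshold=0.80)
feature_map = tf.placeholder(tf.float32, [None, 512, 14, 14])  # NCHW feature map
dropped = attention_based_dropout(feature_map, option)         # same shape as input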
def Dropout(x, *args, **kwargs):
    """
    Same as `tf.layers.dropout`.
    However, for historical reasons, the first positional argument is interpreted
    as keep_prob rather than drop_prob.
    Explicitly use the `rate=` keyword argument to ensure things are consistent.
    """
    if 'is_training' in kwargs:
        kwargs['training'] = kwargs.pop('is_training')
    if len(args) > 0:
        if args[0] != 0.5:
            logger.warn(
                "The first positional argument to tensorpack.Dropout is the probability to keep, rather than to drop. "
                "This is different from the rate argument in tf.layers.Dropout due to historical reasons. "
                "To mimic tf.layers.Dropout, explicitly use keyword argument 'rate' instead")
        rate = 1 - args[0]
    elif 'keep_prob' in kwargs:
        assert 'rate' not in kwargs, "Cannot set both keep_prob and rate!"
        rate = 1 - kwargs.pop('keep_prob')
    elif 'rate' in kwargs:
        rate = kwargs.pop('rate')
    else:
        rate = 0.5

    if kwargs.get('training', None) is None:
        kwargs['training'] = get_current_tower_context().is_training

    if get_tf_version_tuple() <= (1, 12):
        return tf.layers.dropout(x, rate=rate, **kwargs)
    else:
        return tf.nn.dropout(x, rate=rate if kwargs['training'] else 0.)
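# Usage sketch for the Dropout wrapper above (hedged: `l` is an arbitrary
# tensor; a tower context supplies `training` when it is not passed).
l = Dropout(l, rate=0.5)        # preferred: `rate` is the probability to drop
l = Dropout(l, keep_prob=0.5)   # equivalent, specified as the probability to keep
l = Dropout(l, 0.5)             # positional arg is interpreted as keep_prob (warns if != 0.5)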
def _build_graph(self, inputs):
    state, action, futurereward = inputs
    policy, self.value = self._get_NN_prediction(state)
    self.value = tf.squeeze(self.value, [1], name='pred_value')  # (B,)
    self.logits = tf.nn.softmax(policy, name='logits')

    expf = tf.get_variable('explore_factor', shape=[],
                           initializer=tf.constant_initializer(1), trainable=False)
    logitsT = tf.nn.softmax(policy * expf, name='logitsT')  # the larger expf, the less exploration
    is_training = get_current_tower_context().is_training
    if not is_training:
        return
    log_probs = tf.log(self.logits + 1e-6)

    log_pi_a_given_s = tf.reduce_sum(
        log_probs * tf.one_hot(action, NUM_ACTIONS), 1)
    advantage = tf.subtract(tf.stop_gradient(self.value), futurereward, name='advantage')
    policy_loss = tf.reduce_sum(log_pi_a_given_s * advantage, name='policy_loss')
    xentropy_loss = tf.reduce_sum(self.logits * log_probs, name='xentropy_loss')
    value_loss = tf.nn.l2_loss(self.value - futurereward, name='value_loss')

    pred_reward = tf.reduce_mean(self.value, name='predict_reward')
    advantage = symbf.rms(advantage, name='rms_advantage')
    summary.add_moving_summary(policy_loss, xentropy_loss, value_loss, pred_reward, advantage)

    entropy_beta = tf.get_variable('entropy_beta', shape=[],
                                   initializer=tf.constant_initializer(0.01), trainable=False)
    self.cost = tf.add_n([policy_loss, xentropy_loss * entropy_beta, value_loss])
    self.cost = tf.truediv(self.cost,
                           tf.cast(tf.shape(futurereward)[0], tf.float32),
                           name='cost')
def generate_fpn_proposals(multilevel_pred_boxes, multilevel_label_logits, image_shape2d): """ Args: multilevel_pred_boxes: #lvl HxWxAx4 boxes multilevel_label_logits: #lvl tensors of shape HxWxA Returns: boxes: kx4 float scores: k logits """ num_lvl = len(cfg.FPN.ANCHOR_STRIDES) assert len(multilevel_pred_boxes) == num_lvl assert len(multilevel_label_logits) == num_lvl training = get_current_tower_context().is_training all_boxes = [] all_scores = [] if cfg.FPN.PROPOSAL_MODE == 'Level': fpn_nms_topk = cfg.RPN.TRAIN_PER_LEVEL_NMS_TOPK if training else cfg.RPN.TEST_PER_LEVEL_NMS_TOPK for lvl in range(num_lvl): with tf.name_scope('Lvl{}'.format(lvl + 2)): pred_boxes_decoded = multilevel_pred_boxes[lvl] proposal_boxes, proposal_scores = generate_rpn_proposals( tf.reshape(pred_boxes_decoded, [-1, 4]), tf.reshape(multilevel_label_logits[lvl], [-1]), image_shape2d, fpn_nms_topk) all_boxes.append(proposal_boxes) all_scores.append(proposal_scores) proposal_boxes = tf.concat(all_boxes, axis=0) # nx4 proposal_scores = tf.concat(all_scores, axis=0) # n # Here we are different from Detectron. # Detectron picks top-k within the batch, rather than within an image. However we do not have a batch. proposal_topk = tf.minimum(tf.size(proposal_scores), fpn_nms_topk) proposal_scores, topk_indices = tf.nn.top_k(proposal_scores, k=proposal_topk, sorted=False) proposal_boxes = tf.gather(proposal_boxes, topk_indices, name="all_proposals") else: for lvl in range(num_lvl): with tf.name_scope('Lvl{}'.format(lvl + 2)): pred_boxes_decoded = multilevel_pred_boxes[lvl] all_boxes.append(tf.reshape(pred_boxes_decoded, [-1, 4])) all_scores.append( tf.reshape(multilevel_label_logits[lvl], [-1])) all_boxes = tf.concat(all_boxes, axis=0) all_scores = tf.concat(all_scores, axis=0) proposal_boxes, proposal_scores = generate_rpn_proposals( all_boxes, all_scores, image_shape2d, cfg.RPN.TRAIN_PRE_NMS_TOPK if training else cfg.RPN.TEST_PRE_NMS_TOPK, cfg.RPN.TRAIN_POST_NMS_TOPK if training else cfg.RPN.TEST_POST_NMS_TOPK) tf.sigmoid(proposal_scores, name='probs') # for visualization return tf.stop_gradient(proposal_boxes, name='boxes'), \ tf.stop_gradient(proposal_scores, name='scores')
def build_graph(self, image, label, bbox):
    ctx = get_current_tower_context()
    is_training = ctx.is_training

    image = image_preprocess(image, args, bgr=True)
    image = tf.transpose(image, [0, 3, 1, 2])  # NCHW
    label_onehot = tf.one_hot(label, args.classnum)
    image_summaries('input-images', image)

    logits, convmaps = vgg_gap(image, args)
    _, indices = tf.nn.top_k(logits, 5)
    indices = tf.identity(indices, name='top5')

    # Grad-CAM
    activation_map = tf.identity(tf.cast(convmaps, tf.float32), name='actmap')
    y_c = tf.reduce_sum(tf.multiply(logits, label_onehot), axis=1)
    target_conv_layer_grad = tf.identity(
        tf.cast(tf.gradients(y_c, convmaps)[0], tf.float32), name='grad')

    # Compute loss
    loss = compute_loss_and_error(logits, label)
    wd_cost = regularize_cost('.*/W', l2_regularizer(5e-4), name='l2_regularize_loss')
    add_moving_summary(loss, wd_cost)
    return tf.add_n([loss, wd_cost], name='cost')
def generate_fpn_proposals(multilevel_anchors, multilevel_label_logits, multilevel_box_logits, image_shape2d): """ Args: multilevel_anchors: #lvl RPNAnchors multilevel_label_logits: #lvl tensors of shape HxWxA multilevel_box_logits: #lvl tensors of shape HxWxAx4 Returns: boxes: kx4 float scores: k logits """ num_lvl = len(cfg.FPN.ANCHOR_STRIDES) assert len(multilevel_anchors) == num_lvl assert len(multilevel_label_logits) == num_lvl assert len(multilevel_box_logits) == num_lvl ctx = get_current_tower_context() all_boxes = [] all_scores = [] if cfg.FPN.PROPOSAL_MODE == 'Level': fpn_nms_topk = cfg.RPN.TRAIN_PER_LEVEL_NMS_TOPK if ctx.is_training else cfg.RPN.TEST_PER_LEVEL_NMS_TOPK for lvl in range(num_lvl): with tf.name_scope('FPNProposal_Lvl{}'.format(lvl + 2)): anchors = multilevel_anchors[lvl] pred_boxes_decoded = anchors.decode_logits( multilevel_box_logits[lvl]) proposal_boxes, proposal_scores = generate_rpn_proposals( tf.reshape(pred_boxes_decoded, [-1, 4]), tf.reshape(multilevel_label_logits[lvl], [-1]), image_shape2d, fpn_nms_topk) all_boxes.append(proposal_boxes) all_scores.append(proposal_scores) proposal_boxes = tf.concat(all_boxes, axis=0) # nx4 proposal_scores = tf.concat(all_scores, axis=0) # n proposal_topk = tf.minimum(tf.size(proposal_scores), fpn_nms_topk) proposal_scores, topk_indices = tf.nn.top_k(proposal_scores, k=proposal_topk, sorted=False) proposal_boxes = tf.gather(proposal_boxes, topk_indices) else: for lvl in range(num_lvl): with tf.name_scope('FPNProposal_Lvl{}'.format(lvl + 2)): anchors = multilevel_anchors[lvl] pred_boxes_decoded = anchors.decode_logits( multilevel_box_logits[lvl]) all_boxes.append(tf.reshape(pred_boxes_decoded, [-1, 4])) all_scores.append( tf.reshape(multilevel_label_logits[lvl], [-1])) all_boxes = tf.concat(all_boxes, axis=0) all_scores = tf.concat(all_scores, axis=0) proposal_boxes, proposal_scores = generate_rpn_proposals( all_boxes, all_scores, image_shape2d, cfg.RPN.TRAIN_PRE_NMS_TOPK if ctx.is_training else cfg.RPN.TEST_PRE_NMS_TOPK, cfg.RPN.TRAIN_POST_NMS_TOPK if ctx.is_training else cfg.RPN.TEST_POST_NMS_TOPK) return proposal_boxes, proposal_scores
def build_graph(self, image, label):
    image = tf.expand_dims(image, 3) * 2 - 1

    ctx = get_current_tower_context()
    M = get_keras_model()
    logits = M(image)
    if ctx.is_main_training_tower:
        for op in M.updates:
            tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, op)

    # build cost function by tensorflow
    cost = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=label)
    cost = tf.reduce_mean(cost, name='cross_entropy_loss')  # the average cross-entropy loss

    # for tensorpack validation
    acc = tf.cast(tf.nn.in_top_k(logits, label, 1), tf.float32)
    acc = tf.reduce_mean(acc, name='accuracy')
    summary.add_moving_summary(acc)

    wd_cost = tf.add_n(M.losses, name='regularize_loss')  # this is how Keras manages regularizers
    cost = tf.add_n([wd_cost, cost], name='total_cost')
    summary.add_moving_summary(cost, wd_cost)
    return cost
def addParamSummary(*summary_lists):
    """
    Add summary Ops for all trainable variables matching the regex.

    Args:
        summary_lists (list): each is (regex, [list of summary type to perform]).
            Summary type can be 'mean', 'scalar', 'histogram', 'sparsity', 'rms', 'absmax'
    """
    from tensorpack.tfutils.tower import get_current_tower_context
    from tensorpack.utils.develop import log_deprecated
    from tensorpack.tfutils.symbolic_functions import rms
    import re
    import tensorflow as tf

    ctx = get_current_tower_context()
    if ctx is not None and not ctx.is_main_training_tower:
        return

    if len(summary_lists) == 1 and isinstance(summary_lists[0], list):
        log_deprecated(text="Use positional args to call add_param_summary() instead of a list.")
        summary_lists = summary_lists[0]

    def perform(var, action):
        ndim = var.get_shape().ndims
        name = var.name.replace(':0', '')
        if action == 'scalar':
            assert ndim == 0, "Scalar summary on high-dimension data. Maybe you want 'mean'?"
            tf.summary.scalar(name, var)
            return
        assert ndim > 0, "Cannot perform {} summary on scalar data".format(action)
        if action == 'histogram':
            tf.summary.histogram(name, var)
            return
        if action == 'sparsity':
            tf.summary.scalar(name + '-sparsity', tf.nn.zero_fraction(var))
            return
        if action == 'mean':
            tf.summary.scalar(name + '-mean', tf.reduce_mean(var))
            return
        if action == 'rms':
            tf.summary.scalar(name + '-rms', rms(var))
            return
        if action == 'absmax':
            tf.summary.scalar(name + '-absmax', tf.reduce_max(tf.abs(var)))
            return
        raise RuntimeError("Unknown summary type: {}".format(action))

    params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    with tf.name_scope('00/SummaryParam'):
        for p in params:
            name = p.name
            for rgx, actions in summary_lists:
                if not rgx.endswith('$'):
                    rgx = rgx + '(:0)?$'
                if re.match(rgx, name):
                    for act in actions:
                        perform(p, act)
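# Usage sketch for addParamSummary (hedged: the regexes and summary types are
# illustrative; per the docstring, each positional argument is a
# (regex, [summary types]) pair matched against trainable-variable names).
addParamSummary(('.*/W', ['histogram', 'rms']),
                ('.*/gamma', ['mean', 'absmax']))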
def vgg_gap(image, option, importance=False):
    ctx = get_current_tower_context()
    is_training = ctx.is_training

    with argscope(Conv2D, kernel_initializer=tf.variance_scaling_initializer(scale=2.)), \
            argscope([Conv2D, MaxPooling, BatchNorm, GlobalAvgPooling],
                     data_format='channels_first'):
        l = convnormrelu(image, 'conv1_1', 64, option)
        if option.attdrop[11]:
            l = ADL(11, l, option)
        l = convnormrelu(l, 'conv1_2', 64, option)
        if option.attdrop[12]:
            l = ADL(12, l, option)
        l = MaxPooling('pool1', l, 2)
        if option.attdrop[1]:
            l = ADL(1, l, option)

        l = convnormrelu(l, 'conv2_1', 128, option)
        if option.attdrop[21]:
            l = ADL(21, l, option)
        l = convnormrelu(l, 'conv2_2', 128, option)
        if option.attdrop[22]:
            l = ADL(22, l, option)
        l = MaxPooling('pool2', l, 2)
        if option.attdrop[2]:
            l = ADL(2, l, option)

        l = convnormrelu(l, 'conv3_1', 256, option)
        if option.attdrop[31]:
            l = ADL(31, l, option)
        l = convnormrelu(l, 'conv3_2', 256, option)
        if option.attdrop[32]:
            l = ADL(32, l, option)
        l = convnormrelu(l, 'conv3_3', 256, option)
        if option.attdrop[33]:
            l = ADL(33, l, option)
        l = MaxPooling('pool3', l, 2)
        if option.attdrop[3]:
            l = ADL(3, l, option)

        l = convnormrelu(l, 'conv4_1', 512, option)
        if option.attdrop[41]:
            l = ADL(41, l, option)
        l = convnormrelu(l, 'conv4_2', 512, option)
        if option.attdrop[42]:
            l = ADL(42, l, option)
        l = convnormrelu(l, 'conv4_3', 512, option)
        if option.attdrop[43]:
            l = ADL(43, l, option)
        l = MaxPooling('pool4', l, 2)
        if option.attdrop[4]:
            l = ADL(4, l, option)

        l = convnormrelu(l, 'conv5_1', 512, option)
        if option.attdrop[51]:
            l = ADL(51, l, option)
        l = convnormrelu(l, 'conv5_2', 512, option)
        if option.attdrop[52]:
            l = ADL(52, l, option)
        l = convnormrelu(l, 'conv5_3', 512, option)
        if option.attdrop[53]:
            l = ADL(53, l, option)

        convmaps = convnormrelu(l, 'new', 1024, option)
        if option.attdrop[6]:
            l = ADL(6, l, option)

        pre_logits = GlobalAvgPooling('gap', convmaps)
        logits = FullyConnected('linear', pre_logits, option.classnum,
                                kernel_initializer=tf.random_normal_initializer(stddev=0.01))
    return logits, convmaps
def generate_fpn_proposals( multilevel_anchors, multilevel_label_logits, multilevel_box_logits, image_shape2d): """ Args: multilevel_anchors: #lvl RPNAnchors multilevel_label_logits: #lvl tensors of shape HxWxA multilevel_box_logits: #lvl tensors of shape HxWxAx4 Returns: boxes: kx4 float scores: k logits """ num_lvl = len(cfg.FPN.ANCHOR_STRIDES) assert len(multilevel_anchors) == num_lvl assert len(multilevel_label_logits) == num_lvl assert len(multilevel_box_logits) == num_lvl ctx = get_current_tower_context() all_boxes = [] all_scores = [] if cfg.FPN.PROPOSAL_MODE == 'Level': fpn_nms_topk = cfg.RPN.TRAIN_PER_LEVEL_NMS_TOPK if ctx.is_training else cfg.RPN.TEST_PER_LEVEL_NMS_TOPK for lvl in range(num_lvl): with tf.name_scope('Lvl{}'.format(lvl + 2)): anchors = multilevel_anchors[lvl] pred_boxes_decoded = anchors.decode_logits(multilevel_box_logits[lvl]) proposal_boxes, proposal_scores = generate_rpn_proposals( tf.reshape(pred_boxes_decoded, [-1, 4]), tf.reshape(multilevel_label_logits[lvl], [-1]), image_shape2d, fpn_nms_topk) all_boxes.append(proposal_boxes) all_scores.append(proposal_scores) proposal_boxes = tf.concat(all_boxes, axis=0) # nx4 proposal_scores = tf.concat(all_scores, axis=0) # n proposal_topk = tf.minimum(tf.size(proposal_scores), fpn_nms_topk) proposal_scores, topk_indices = tf.nn.top_k(proposal_scores, k=proposal_topk, sorted=False) proposal_boxes = tf.gather(proposal_boxes, topk_indices) else: for lvl in range(num_lvl): with tf.name_scope('Lvl{}'.format(lvl + 2)): anchors = multilevel_anchors[lvl] pred_boxes_decoded = anchors.decode_logits(multilevel_box_logits[lvl]) all_boxes.append(tf.reshape(pred_boxes_decoded, [-1, 4])) all_scores.append(tf.reshape(multilevel_label_logits[lvl], [-1])) all_boxes = tf.concat(all_boxes, axis=0) all_scores = tf.concat(all_scores, axis=0) proposal_boxes, proposal_scores = generate_rpn_proposals( all_boxes, all_scores, image_shape2d, cfg.RPN.TRAIN_PRE_NMS_TOPK if ctx.is_training else cfg.RPN.TEST_PRE_NMS_TOPK, cfg.RPN.TRAIN_POST_NMS_TOPK if ctx.is_training else cfg.RPN.TEST_POST_NMS_TOPK) tf.sigmoid(proposal_scores, name='probs') # for visualization return tf.stop_gradient(proposal_boxes, name='boxes'), \ tf.stop_gradient(proposal_scores, name='scores')
def build_graph(self, image, label): is_training = get_current_tower_context().is_main_training_tower image_origin = ImageNetModel.image_preprocess( image, bgr=self.image_bgr) # [N, H, W, C] loss, logit = 0, {} scales = sorted(self.scales, reverse=True) # sorted_scales = sorted(list(set(scales + self.scales)), reverse=True) for scale in scales: image = tf.image.resize_images( image_origin, [scale, scale], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR) if self.data_format == 'NCHW': image = tf.transpose(image, [0, 3, 1, 2]) with tf.variable_scope('imagenet', reuse=tf.AUTO_REUSE): logit[scale] = self.get_logits(image, scale) loss_scale = self.compute_loss_and_error(logit[scale], label, scale, is_training) loss += loss_scale if self.distill: logit_ensemble = 0 alpha = tf.get_variable('alpha', [len(scales)], initializer=tf.constant_initializer(1)) alpha_soft = tf.nn.softmax(alpha) # TODO: remove softmax for i, scale in enumerate(scales): logit_ensemble += alpha_soft[i] * tf.stop_gradient( logit[scale]) tf.summary.scalar('alpha%03d' % scale, alpha_soft[i]) loss_ensemble = self.compute_loss_and_error( logit_ensemble, label, 'ensemble', is_training) loss += loss_ensemble loss_distill = 0 soft_label = tf.stop_gradient(tf.nn.softmax(logit_ensemble)) for scale in scales: loss_distill += self.compute_distill_loss( logit[scale], soft_label) if DISTILL_TYPE == 'top-down': for i in range(len(scales) - 1): soft_label = tf.stop_gradient( tf.nn.softmax(logit[scales[i]])) for j in range(i + 1, len(scales)): loss_distill += self.compute_distill_loss( logit[scales[j]], soft_label) distill_num = len(scales) * (len(scales) + 1) / 2 loss += SOFTMAX_TEM**2 * loss_distill / distill_num * len( scales) else: loss += SOFTMAX_TEM**2 * loss_distill wd_loss = regularize_cost(self.weight_decay_pattern, tf.contrib.layers.l2_regularizer( self.weight_decay), name='l2_regularize_loss') add_moving_summary(loss, wd_loss) self.cost = tf.add_n([loss, wd_loss], name='cost') return self.cost
def tower_func(image, label):
    assert not get_current_tower_context().is_training

    image = self.image_preprocess(image)
    image = tf.transpose(image, [0, 3, 1, 2])

    image, target_label = attacker.attack(image, label, self.get_logits)
    logits = self.get_logits(image)

    ImageNetModel.compute_loss_and_error(logits, label)  # compute top-1 and top-5
    AdvImageNetModel.compute_attack_success(logits, target_label)
def __init__(self, drop_path_keep_prob, max_train_steps, total_depth):
    self.max_train_steps = max_train_steps
    self.total_depth = total_depth
    self.is_training = get_current_tower_context().is_training
    self.drop_path_keep_prob = drop_path_keep_prob
    self.do_drop_path = (
        self.is_training
        and self.drop_path_keep_prob is not None
        and self.drop_path_keep_prob < 1.0
    )
def _get_logits_by_slim_model(self, inputs):
    ctx = get_current_tower_context()
    with tf.contrib.slim.arg_scope(
            resnet_v2.resnet_arg_scope(batch_norm_decay=0.9997)):
        logits, end_points = resnet_v2.resnet_v2_101(
            inputs,
            num_classes=None,
            is_training=ctx.is_training,
            global_pool=False,
            output_stride=16)
    net = end_points['resnet_v2_101/block4']
    return net
def fn():
    tlist = []
    ctx = get_current_tower_context()
    assert ctx is not None
    assert len(self.shapes) == len(self._desc)
    for idx, p in enumerate(self._desc):
        tlist.append(
            tf.constant(0, dtype=p.type,
                        name='dummy-{}-{}'.format(p.name, ctx.index),
                        shape=self.shapes[idx]))
    return tlist
def _get_cost_and_grad(self):
    from tensorpack.tfutils.gradproc import FilterNoneGrad
    ctx = get_current_tower_context()
    assert ctx is not None and ctx.is_training, ctx
    # cost = self.get_cost()  # assume single cost
    loss_policy, loss_value = self._cost
    opt_a, opt_v = self.get_optimizer()
    grads_a = opt_a.compute_gradients(loss_policy,
                                      var_list=self._weights_actor,
                                      colocate_gradients_with_ops=True)
    grads_a = FilterNoneGrad().process(grads_a)
    grads_v = opt_v.compute_gradients(loss_value,
                                      var_list=self._weights_critic,
                                      colocate_gradients_with_ops=True)
    grads_v = FilterNoneGrad().process(grads_v)
    return self._cost, [grads_a, grads_v]
def resnet(input_, DEPTH, option):
    ctx = get_current_tower_context()
    is_training = ctx.is_training
    mode = option.mode

    basicblock = preresnet_basicblock if mode == 'preact' else resnet_basicblock
    bottleneck = {
        'resnet': resnet_bottleneck,
        'preact': preresnet_bottleneck,
        'se': se_resnet_bottleneck
    }[mode]
    cfg = {
        18: ([2, 2, 2, 2], basicblock),
        34: ([3, 4, 6, 3], basicblock),
        50: ([3, 4, 6, 3], bottleneck),
        101: ([3, 4, 23, 3], bottleneck),
        152: ([3, 8, 36, 3], bottleneck)
    }
    defs, block_func = cfg[DEPTH]
    group_func = preresnet_group if mode == 'preact' else resnet_group

    with argscope(Conv2D, use_bias=False,
                  kernel_initializer=tf.variance_scaling_initializer(scale=2.0, mode='fan_out')), \
            argscope([Conv2D, MaxPooling, GlobalAvgPooling, BatchNorm],
                     data_format='channels_first'):
        l = Conv2D('conv0', input_, 64, 7, strides=2, activation=BNReLU)  # 112
        if option.attdrop[0]:
            l = ADL(0, l, option)
        l = MaxPooling('pool0', l, 3, strides=2, padding='SAME')  # 56
        if option.attdrop[1]:
            l = ADL(1, l, option)
        l = group_func('group0', l, block_func, 64, defs[0], 1, option)   # 56
        if option.attdrop[2]:
            l = ADL(2, l, option)
        l = group_func('group1', l, block_func, 128, defs[1], 2, option)  # 28
        if option.attdrop[3]:
            l = ADL(3, l, option)
        l = group_func('group2', l, block_func, 256, defs[2], 2, option)  # 14
        if option.attdrop[4]:
            l = ADL(4, l, option)
        l = group_func('group3', l, block_func, 512, defs[3], option.laststride, option)  # 7
        if option.attdrop[5]:
            l = ADL(5, l, option)

        prelogits = GlobalAvgPooling('gap', l)
        logits = FullyConnected('linearnew', prelogits, option.classnum)
    return logits, l
def locked_dropout(self, x, keep_prob):
    """
    Variational (locked) dropout. We make sure the drop-out mask
    is the same at all time steps.
    """
    is_training = get_current_tower_context().is_training
    do_dropout = keep_prob is not None and keep_prob < 1.0 and is_training
    if not do_dropout:
        return x
    x_shape = x.get_shape().as_list()
    x_shape[self.t_dim] = 1
    mask = tf.random_uniform(x_shape, minval=0, maxval=1, dtype=tf.float32)
    mask = tf.floor(mask + keep_prob) / keep_prob
    return tf.multiply(mask, x)
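# Sketch of the locked-dropout behaviour (hedged: assumes t_dim == 1, i.e.
# inputs shaped [batch, time, hidden], and `model` is an instance of the class
# defining locked_dropout). One mask of shape [batch, 1, hidden] is drawn,
# scaled by 1/keep_prob, and broadcast over every time step, so the same units
# are dropped at all steps of a sequence.
seq = tf.random_normal([32, 35, 650])            # [batch, time, hidden]
out = model.locked_dropout(seq, keep_prob=0.5)   # identical mask for all 35 steps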
def build_graph(self, image, label, xa, ya, xb, yb): image = image_preprocess( image, bgr=True) # image = (image - image_mean) / image_std label_onehot = tf.one_hot(label, 200) ctx = get_current_tower_context() isTrain = ctx.is_training cfg = { 18: ([2, 2, 2, 2]), 34: ([3, 4, 6, 3]), } defs = cfg[DEPTH] convmaps = Spec_Conv2D('conv0', image, 64, 7, stride=1, sn=args.sn) convmaps = batch_norm_resnet(convmaps, isTrain, 'bnfirst') convmaps = tf.nn.relu(convmaps, 'relufirst') #convmaps = MaxPooling('pool0', convmaps, 3, strides=2, padding='SAME') # 32x32 convmaps = preresnet_group('group0', convmaps, 64, defs[0], 1, isTrain, args.sn) # 32x32 convmaps = preresnet_group('group1', convmaps, 128, defs[1], 2, isTrain, args.sn) # 16x16 convmaps = preresnet_group('group2', convmaps, 256, defs[2], 2, isTrain, args.sn) # 8x8 convmaps_target = preresnet_group('group3new', convmaps, 512, defs[3], 1, isTrain, args.sn) convmaps_gap = tf.reduce_mean(convmaps_target, [1, 2], name='gap') logits, w = Spec_FullyConnected('linearnew', convmaps_gap, 200, sn=args.sn) weights = tf.identity(w, name='linearweight') activation_map = tf.identity(convmaps_target, name='actmap') y_c = tf.reduce_sum(tf.multiply(logits, label_onehot), axis=1) target_conv_layer_grad = tf.identity(tf.gradients( y_c, convmaps_target)[0], name='grad') loss = compute_loss_and_error(logits, label) wd_cost = regularize_cost('.*/W', l2_regularizer(1e-4), name='l2_regularize_loss') add_moving_summary(loss, wd_cost) return tf.add_n([loss, wd_cost], name='cost')
def dropout_embedding_w(self, w, keep_prob):
    """
    Dropout for embedding matrix w. The idea is to ignore certain
    words completely at random.
    """
    is_training = get_current_tower_context().is_training
    do_dropout = keep_prob is not None and keep_prob < 1.0 and is_training
    if not do_dropout:
        return w
    # [n_vocab, nhid]
    w_shape = w.get_shape().as_list()
    mask = tf.random_uniform(shape=[w_shape[0], 1], minval=0, maxval=1, dtype=tf.float32)
    mask = tf.floor(mask + keep_prob) / keep_prob
    return tf.multiply(mask, w)
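# Sketch of embedding-word dropout (hedged: names are illustrative and `model`
# is an instance of the class defining dropout_embedding_w). The mask has shape
# [n_vocab, 1], so whole rows of the embedding matrix -- i.e. whole words --
# are zeroed at random while the rest are rescaled by 1/keep_prob.
embedding_w = tf.get_variable('embedding', [10000, 650])        # [n_vocab, nhid]
token_ids = tf.placeholder(tf.int32, [None, None])              # [batch, time]
w_dropped = model.dropout_embedding_w(embedding_w, keep_prob=0.8)
emb = tf.nn.embedding_lookup(w_dropped, token_ids)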
def build_graph(self, state, action, futurereward, action_prob):
    logits, value = self._get_NN_prediction(state)
    value = tf.squeeze(value, [1], name='pred_value')  # (B,)
    policy = tf.nn.softmax(logits, name='policy')
    is_training = get_current_tower_context().is_training
    if not is_training:
        return
    log_probs = tf.log(policy + 1e-6)

    log_pi_a_given_s = tf.reduce_sum(
        log_probs * tf.one_hot(action, self.num_actions), 1)
    advantage = tf.subtract(tf.stop_gradient(value), futurereward, name='advantage')

    pi_a_given_s = tf.reduce_sum(policy * tf.one_hot(action, self.num_actions), 1)  # (B,)
    importance = tf.stop_gradient(
        tf.clip_by_value(pi_a_given_s / (action_prob + 1e-8), 0, 10))

    policy_loss = tf.reduce_sum(log_pi_a_given_s * advantage * importance, name='policy_loss')
    xentropy_loss = tf.reduce_sum(policy * log_probs, name='xentropy_loss')
    value_loss = tf.nn.l2_loss(value - futurereward, name='value_loss')

    pred_reward = tf.reduce_mean(value, name='predict_reward')
    advantage = tf.sqrt(tf.reduce_mean(tf.square(advantage)), name='rms_advantage')
    entropy_beta = tf.get_variable(
        'entropy_beta', shape=[],
        initializer=tf.constant_initializer(0.01), trainable=False)
    cost = tf.add_n([policy_loss, xentropy_loss * entropy_beta, value_loss])
    cost = tf.truediv(cost,
                      tf.cast(tf.shape(futurereward)[0], tf.float32),
                      name='cost')
    summary.add_moving_summary(
        policy_loss, xentropy_loss, value_loss,
        pred_reward, advantage, cost,
        tf.reduce_mean(importance, name='importance'))
    return cost
def get_bn_variables(n_out, use_scale, use_bias, beta_init, gamma_init):
    if use_bias:
        beta = tf.get_variable('beta', [n_out], initializer=beta_init)
    else:
        beta = tf.zeros([n_out], name='beta')
    if use_scale:
        gamma = tf.get_variable('gamma', [n_out], initializer=gamma_init)
    else:
        gamma = tf.ones([n_out], name='gamma')
    # x * gamma + beta

    moving_mean = tf.get_variable('mean/EMA', [n_out],
                                  initializer=tf.constant_initializer(),
                                  trainable=False)
    moving_var = tf.get_variable('variance/EMA', [n_out],
                                 initializer=tf.constant_initializer(1.0),
                                 trainable=False)

    if get_current_tower_context().is_main_training_tower:
        for v in [moving_mean, moving_var]:
            tf.add_to_collection(tf.GraphKeys.MODEL_VARIABLES, v)
    return beta, gamma, moving_mean, moving_var
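# Sketch of how the returned variables enter the batch-norm transform at
# inference time (hedged: this mirrors tf.nn.batch_normalization rather than a
# specific caller of get_bn_variables; shapes and eps are illustrative).
x = tf.placeholder(tf.float32, [None, 32, 32, 64])   # NHWC features, n_out = 64
beta, gamma, moving_mean, moving_var = get_bn_variables(
    64, use_scale=True, use_bias=True,
    beta_init=tf.zeros_initializer(), gamma_init=tf.ones_initializer())
y = tf.nn.batch_normalization(x, moving_mean, moving_var, beta, gamma, 1e-5)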
def _basic_cell(self, initializer=None, hid_to_fs_params=None, l_hallu_costs=None):
    is_training = get_current_tower_context().is_training
    if is_training:
        h_mask = self.cell_mask(self.keep_prob_h)
        x_mask = self.cell_mask(self.keep_prob_x)
    else:
        h_mask = x_mask = None
    cell = PetridishRNNCell(
        num_units=self.num_units,
        layer_info_list=self.layer_info_list,
        num_proj=self.num_proj,
        hid_to_fs_params=hid_to_fs_params,
        l_hallu_costs=l_hallu_costs,
        initializer=initializer,
        data_format=self.data_format,
        compute_hallu_stats=self.compute_hallu_stats,
        h_mask=h_mask,
        x_mask=x_mask)
    self.cells.append(cell)
    return cell
def build_graph(self, image, label):
    """
    The default tower function.
    """
    image = self.image_preprocess(image)
    assert self.data_format == 'NCHW'
    image = tf.transpose(image, [0, 3, 1, 2])

    ctx = get_current_tower_context()
    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        # BatchNorm always comes with trouble. We use the testing mode of it during attack.
        with freeze_collection([tf.GraphKeys.UPDATE_OPS]), argscope(BatchNorm, training=False):
            image, target_label = self.attacker.attack(image, label, self.get_logits)
            image = tf.stop_gradient(image, name='adv_training_sample')

        logits = self.get_logits(image)

    loss = ImageNetModel.compute_loss_and_error(
        logits, label, label_smoothing=self.label_smoothing)
    AdvImageNetModel.compute_attack_success(logits, target_label)
    if not ctx.is_training:
        return

    wd_loss = regularize_cost(self.weight_decay_pattern,
                              tf.contrib.layers.l2_regularizer(self.weight_decay),
                              name='l2_regularize_loss')
    add_moving_summary(loss, wd_loss)
    total_cost = tf.add_n([loss, wd_loss], name='cost')

    if self.loss_scale != 1.:
        logger.info("Scaling the total loss by {} ...".format(self.loss_scale))
        return total_cost * self.loss_scale
    else:
        return total_cost
def regularize_cost_from_collection(name='regularize_cost'):
    """
    Get the cost from the regularizers in ``tf.GraphKeys.REGULARIZATION_LOSSES``.
    If in replicated mode, will only regularize variables created within the current tower.

    Args:
        name (str): the name of the returned tensor

    Returns:
        tf.Tensor: a scalar, the total regularization cost.
    """
    ctx = get_current_tower_context()
    if not ctx.is_training:
        # TODO Currently cannot build the wd_cost correctly at inference,
        # because the vs_name used in inference can be '', therefore the
        # variable filter will fail
        return tf.constant(0, dtype=tf.float32, name='empty_' + name)

    # NOTE: this collection doesn't always grow with towers.
    # It only grows with actual variable creation, but not get_variable call.
    if ctx.has_own_variables:  # be careful of the first tower (name='')
        losses = ctx.get_collection_in_tower(tfv1.GraphKeys.REGULARIZATION_LOSSES)
    else:
        losses = tfv1.get_collection(tfv1.GraphKeys.REGULARIZATION_LOSSES)
    if len(losses) > 0:
        logger.info("regularize_cost_from_collection() found {} regularizers "
                    "in REGULARIZATION_LOSSES collection.".format(len(losses)))

        def maploss(l):
            assert l.dtype.is_floating, l
            if l.dtype != tf.float32:
                l = tf.cast(l, tf.float32)
            return l

        losses = [maploss(l) for l in losses]
        reg_loss = tf.add_n(losses, name=name)
        return reg_loss
    else:
        return tf.constant(0, dtype=tf.float32, name='empty_' + name)
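# Usage sketch: add the collected regularization losses to the task loss inside
# a tower function (hedged: `loss` stands for the model's data loss computed
# earlier in the same graph; the 'total_cost' name is illustrative).
reg_cost = regularize_cost_from_collection()
total_cost = tf.add_n([loss, reg_cost], name='total_cost')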
def _get_NN_prediction(self, state): from tensorpack.tfutils import symbolic_functions ctx = get_current_tower_context() is_training = ctx.is_training l = state # l = tf.Print(l, [state], 'State = ') with tf.variable_scope('critic') as vs: from autodrive.model.selu import fc_selu for lidx in range(8): l = fc_selu(l, 200, keep_prob=1., # 由于我们只使用传感器训练,关键信息不能丢 is_training=is_training, name='fc-{}'.format(lidx)) # l = tf.layers.dense(l, 512, activation=tf.nn.relu, name='fc-dense') # for lidx, hidden_size in enumerate([300, 600]): # l = tf.layers.dense(l, hidden_size, activation=tf.nn.relu, name='fc-%d'%lidx) value = tf.layers.dense(l, 1, name='fc-value',\ kernel_initializer=tf.truncated_normal_initializer(stddev=0.1)) if not hasattr(self, '_weights_critic'): self._weights_critic = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name) with tf.variable_scope('actor') as vs: l = tf.stop_gradient(l) mu_steering = 0.5 * tf.layers.dense(l, 1, activation=tf.nn.tanh, name='fc-mu-steering',\ kernel_initializer=tf.truncated_normal_initializer(stddev=0.01)) mu_accel = tf.layers.dense(l, 1, activation=tf.nn.tanh, name='fc-mu-accel',\ kernel_initializer=tf.truncated_normal_initializer(stddev=0.01)) mus = tf.concat([mu_steering, mu_accel], axis=-1) # mus = tf.layers.dense(l, 2, activation=tf.nn.tanh, name='fc-mus') # sigmas = tf.layers.dense(l, 2, activation=tf.nn.softplus, name='fc-sigmas') # sigmas = tf.clip_by_value(sigmas, -0.001, 0.5) sigma_steering_ = 0.5 * tf.layers.dense(l, 1, activation=tf.nn.sigmoid, name='fc-sigma-steering',\ kernel_initializer=tf.truncated_normal_initializer(stddev=0.01)) sigma_accel_ = 1. * tf.layers.dense(l, 1, activation=tf.nn.sigmoid, name='fc-sigma-accel',\ kernel_initializer=tf.truncated_normal_initializer(stddev=0.01)) # sigma_beta_steering = symbolic_functions.get_scalar_var('sigma_beta_steering', 0.3, summary=True, trainable=False) # sigma_beta_accel = symbolic_functions.get_scalar_var('sigma_beta_accel', 0.3, summary=True, trainable=False) from tensorpack.tfutils.common import get_global_step_var sigma_beta_steering_exp = tf.train.exponential_decay(0.001, get_global_step_var(), 1000, 0.5, name='sigma/beta/steering/exp') sigma_beta_accel_exp = tf.train.exponential_decay(0.5, get_global_step_var(), 5000, 0.5, name='sigma/beta/accel/exp') # sigma_steering = tf.minimum(sigma_steering_ + sigma_beta_steering, 0.5) # sigma_accel = tf.minimum(sigma_accel_ + sigma_beta_accel, 0.2) # sigma_steering = sigma_steering_ sigma_steering = (sigma_steering_ + sigma_beta_steering_exp) sigma_accel = (sigma_accel_ + sigma_beta_accel_exp) #* 0.1 # sigma_steering = sigma_steering_ # sigma_accel = sigma_accel_ sigmas = tf.concat([sigma_steering, sigma_accel], axis=-1) # sigma_steering = tf.clip_by_value(sigma_steering, 0.1, 0.5) # sigma_accel = tf.clip_by_value(sigma_accel, 0.1, 0.5) # sigmas = sigmas_orig + 0.001 # sigmas = tf.clip_by_value(sigmas, 0.1, 0.5) # sigma_beta = tf.get_variable('sigma_beta', shape=[], dtype=tf.float32, # initializer=tf.constant_initializer(.5), trainable=False) # if is_training: # pass # # 如果不加sigma_beta,收敛会很慢,并且不稳定,猜测可能是以下原因: # # 1、训练前期尽量大的探索可以避免网络陷入局部最优 # # 2、前期过小的sigma会使normal_dist的log_prob过大,导致梯度更新过大,网络一开始就畸形了,很难恢复回来 # # if is_training: # sigmas += sigma_beta_steering # sigma_steering = tf.clip_by_value(sigma_steering, sigma_beta_steering, 0.5) # sigma_accel = tf.clip_by_value(sigma_accel, sigma_beta_accel, 0.5) # sigmas = tf.clip_by_value(sigmas, 0.1, 0.5) # sigmas_orig = sigmas # sigmas = sigmas + sigma_beta_steering # sigmas = tf.minimum(sigmas 
+ 0.1, 100) # sigmas = tf.clip_by_value(sigmas, sigma_beta_steering, 1) # sigma_steering += sigma_beta_steering # sigma_accel += sigma_beta_accel # mus = tf.concat([mu_steering, mu_accel], axis=-1) from tensorflow.contrib.distributions import Normal dists = Normal(mus, sigmas+1e-3) actions = tf.squeeze(dists.sample([1]), [0]) # 裁剪到一倍方差之内 # actions = tf.clip_by_value(actions, -1., 1.) if is_training: summary.add_moving_summary(tf.reduce_mean(mu_steering, name='mu/steering/mean'), tf.reduce_mean(mu_accel, name='mu/accel/mean'), tf.reduce_mean(sigma_steering, name='sigma/steering/mean'), tf.reduce_max(sigma_steering, name='sigma/steering/max'), tf.reduce_mean(sigma_accel, name='sigma/accel/mean'), tf.reduce_max(sigma_accel, name='sigma/accel/max'), sigma_beta_accel_exp, sigma_beta_steering_exp, ) # actions = tf.Print(actions, [mus, sigmas, tf.concat([sigma_steering_, sigma_accel_], -1), actions], # 'mu/sigma/sigma.orig/act=', summarize=4) if not hasattr(self, '_weights_actor'): self._weights_actor = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name) return actions, value, dists
def training(self):
    return get_current_tower_context().is_training
def _build_graph(self, inputs): from tensorpack.tfutils.common import get_global_step_var state, action, futurereward, advantage = inputs is_training = get_current_tower_context().is_training policy, value, dists = self._get_NN_prediction(state) if not hasattr(self, '_weights_train'): self._weights_train = self._weights_critic + self._weights_actor self.value = tf.squeeze(value, [1], name='value') # (B,) self.policy = tf.identity(policy, name='policy') with tf.variable_scope("Pred") as vs: __p, __v, _ = self._get_NN_prediction(state) __v = tf.squeeze(__v, [1], name='value') # (B,) __p = tf.identity(__p, name='policy') if not hasattr(self, '_weights_pred'): self._weights_pred = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=vs.name) assert (len(self._weights_train) == len(self._weights_pred)) assert (not hasattr(self, '_sync_op')) self._sync_op = tf.group(*[d.assign(s + tf.truncated_normal(tf.shape(s), stddev=0.02)) for d, s in zip(self._weights_pred, self._weights_train)]) with tf.variable_scope('pre') as vs: pre_p,pre_v,pre_dists=self._get_NN_prediction(state) if not hasattr(self,'pre_weights'): self.pre_weights=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,scope=vs.name) self._td_sync_op = tf.group(*[d.assign(s) for d, s in zip(self.pre_weights, self._weights_train)]) if not is_training: return # advantage = tf.subtract(tf.stop_gradient(self.value), futurereward, name='advantage') # advantage = tf.Print(advantage, [self.value, futurereward, action, advantage], 'value/reward/act/advantage=', summarize=4) log_probs = dists.log_prob(action) #add ppo policy clip loss #add ratio ,surr1, surr2 pre_probs=pre_dists.log_prob(action) ratio=tf.exp(log_probs-pre_probs) prob_ratio = tf.reduce_mean(input_tensor=tf.concat(values=ratio, axis=1), axis=1) clip_param=tf.train.exponential_decay(CLIP_PARAMETER, get_global_step_var(), 10000, 0.98, name='clip_param') # surr1=prob_ratio*advantage surr1=ratio*tf.expand_dims(advantage, -1) surr2=tf.clip_by_value(ratio,1.0-clip_param,1.0+clip_param)*tf.expand_dims(advantage, -1) # surr2=tf.clip_by_value(prob_ratio,1.0-clip_param,1.0+clip_param)*advantage loss_policy=-tf.reduce_mean(tf.minimum(surr1,surr2)) #add critic clip loss v_loss1=tf.square(value-futurereward) pre_value=pre_v+tf.clip_by_value(value-pre_v,-clip_param,clip_param) v_loss2=tf.square(pre_v-futurereward) # loss_value=0.5*tf.reduce_mean(tf.maximum(v_loss1,v_loss2)) loss_value=0.5*tf.reduce_mean(v_loss1) entropy = dists.entropy() entropy_beta = tf.get_variable('entropy_beta', shape=[], initializer=tf.constant_initializer(0.01), trainable=False) exp_v = entropy_beta * entropy loss_entropy = tf.reduce_mean(-tf.reduce_sum(exp_v, axis=-1), name='loss/policy') loss_policy=loss_policy+loss_entropy # exp_v = tf.transpose( # tf.multiply(tf.transpose(log_probs), advantage)) # exp_v = tf.multiply(log_probs, advantage) # exp_v = log_probs * tf.expand_dims(advantage, -1) # entropy = dists.entropy() # entropy_beta = tf.get_variable('entropy_beta', shape=[], # initializer=tf.constant_initializer(0.01), trainable=False) # exp_v = entropy_beta * entropy + exp_v # loss_value = tf.reduce_mean(0.5 * tf.square(self.value - futurereward)) # loss_entropy = tf.reduce_mean(tf.reduce_sum(entropy, axis=-1), name='xentropy_loss') from tensorflow.contrib.layers.python.layers.regularizers import apply_regularization, l2_regularizer loss_l2_regularizer = apply_regularization(l2_regularizer(1e-4), self._weights_critic) loss_l2_regularizer = tf.identity(loss_l2_regularizer, 'loss/l2reg') loss_value += 
loss_l2_regularizer loss_value = tf.identity(loss_value, name='loss/value') # self.cost = tf.add_n([loss_policy, loss_value * 0.1, loss_l2_regularizer]) self._cost = [loss_policy, loss_value ] from autodrive.trainer.summary import addParamSummary addParamSummary([('.*', ['rms', 'absmax'])]) pred_reward = tf.reduce_mean(self.value, name='predict_reward') advantage = symbf.rms(advantage, name='rms_advantage') summary.add_moving_summary(loss_policy, loss_value, loss_entropy, pred_reward, advantage, loss_l2_regularizer, tf.reduce_mean(self.policy[:, 0], name='action/steering/mean'), tf.reduce_mean(self.policy[:, 1], name='action/accel/mean'), )
def _build_graph(self, input_vars): image, label = input_vars image = tf.image.convert_image_dtype(image, dtype=tf.float32) ctx = get_current_tower_context() print("train or test tower context?", ctx.is_training) tf.summary.image("Input Image", image[0:20], max_outputs=20) #.astype("uint8")) def conv(name, l, channel, stride, kernel_size): #rand_seed = np.random.randint(2**32-1) #np.random.seed(None) conv2d_he = Conv2D( name, l, channel, kernel_size, stride=stride, nl=tf.identity, use_bias=False, W_init=tf.variance_scaling_initializer(dtype=tf.float32)) #tf.random_normal_initializer(stddev=np.sqrt(2.0/9/channel)))#tf.contrib.layers.variance_scaling_initializer(factor=2.0, mode='FAN_AVG', uniform=False)) #np.random.seed(rand_seed) return conv2d_he def batch_norm(scope, name, layer, decay, layer_num, norm_pattern, training): with tf.variable_scope(scope) as s: if training: layer = BatchNorm( name, layer) if layer_num % norm_pattern != 0 else layer else: if decay is not None: layer = BatchNorm( name, layer, decay=decay, use_local_stat=False ) if layer_num % norm_pattern != 0 else layer else: layer = BatchNorm( name, layer, use_local_stat=False ) if layer_num % norm_pattern != 0 else layer return layer def add_layer(name, l, kernel_size, growth_rate, drop_rate, training, layer_num, drop_pattern, bn_momentum, skip_norm): shape = l.get_shape().as_list() in_channel = shape[3] with tf.variable_scope(name) as scope: # layer num mod 1 for bnorm every layer c = batch_norm(name, 'bn.{}'.format(layer_num), l, bn_momentum, layer_num, skip_norm, training) #epsilon=0.001 c = tf.nn.relu(c) c = conv('conv1', c, growth_rate, 1, kernel_size) l = tf.concat([c, l], 3) if drop_pattern != 0 and layer_num % drop_pattern == 0: spatial_drop = tf.shape(l) # drop every layer mod drop_pattern, drop_pattern == 0 if no drop wanted l = tf.cond( tf.equal(tf.constant(training), tf.constant(True)), lambda: tf.nn. 
dropout(l, keep_prob=tf.constant(drop_rate), noise_shape= [spatial_drop[0], 1, 1, spatial_drop[3]], name='dropblock'), lambda: l) return l def add_transition(name, l, drop_rate, training, drop_pattern, transition_number, bn_momentum): shape = l.get_shape().as_list() in_channel = shape[3] with tf.variable_scope(name) as scope: l = batch_norm(name, 'bntransit.{}'.format(transition_number), l, bn_momentum, 42, 43, training) l = tf.nn.relu(l) l = Conv2D('conv1', l, in_channel, 1, stride=1, use_bias=False, nl=tf.nn.relu) if drop_pattern != 0: l = tf.cond( tf.equal(tf.constant(training), tf.constant(True)), lambda: tf.nn.dropout(l, keep_prob=tf.constant(drop_rate), name='droptransition'), lambda: l) l = AvgPooling('pool', l, 2) return l def dense_net(name): l = conv('conv0', image, self.filters_init, 1, self.kernel_size) with tf.variable_scope('block1') as scope: for i in range(self.N): #(name, l, kernel_size, growth_rate, drop_rate, training, layer_num, drop_pattern): l = add_layer(name='dense_layer.{}'.format(i), l=l, kernel_size=self.kernel_size, growth_rate=self.growthRate, drop_rate=self.drop_rate, training=self.train_or_test, layer_num=i, drop_pattern=0, bn_momentum=self.bn_momentum, skip_norm=self.skip_norm) l = add_transition(name='transition1', l=l, drop_rate=self.drop_rate, training=self.train_or_test, drop_pattern=self.drop_pattern, transition_number=1, bn_momentum=self.bn_momentum) with tf.variable_scope('block2') as scope: for i in range(self.N): l = add_layer('dense_layer.{}'.format(i), l, self.kernel_size, self.growthRate, self.drop_rate, self.train_or_test, i, self.drop_pattern, self.bn_momentum, self.skip_norm) l = add_transition('transition2', l, self.drop_rate, self.train_or_test, self.drop_pattern, 2, self.bn_momentum) with tf.variable_scope('block3') as scope: for i in range(self.N): l = add_layer('dense_layer.{}'.format(i), l, self.kernel_size, self.growthRate, self.drop_rate, self.train_or_test, i, self.drop_pattern, self.bn_momentum, self.skip_norm) l = batch_norm(name, 'bnlast', l, self.bn_momentum, 42, 42 + 1, self.train_or_test) l = tf.nn.relu(l) l = GlobalAvgPooling('gap', l) logits = FullyConnected('linear', l, out_dim=2, nl=tf.identity) return logits def prediction_incorrect(logits, label, topk=1, name='incorrect_vector'): #with tf.name_scope('prediction_incorrect'): x = tf.logical_not(tf.nn.in_top_k(logits, label, topk)) return tf.cast(x, tf.float32, name=name) logits = dense_net("dense_net") #map probabilities to real domain prob = tf.nn.softmax( logits, name='output' ) #a generalization of the logistic function that "squashes" a K-dim vector z of arbitrary real values to a K-dim vector sigma( z ) of real values in the range [0, 1] that add up to 1. 
factorbl = (self.class_0 + self.class_1) / ( 2 * self.class_0 ) #tf.divide(tf.add(self.class_0, self.class_1), tf.multiply(tf.constant(2.0,dtype=tf.float32), self.class_0)) factordl = (self.class_0 + self.class_1) / ( 2 * self.class_1 ) #tf.divide(tf.add(self.class_0, self.class_1), tf.multiply(tf.constant(2.0,dtype=tf.float32), self.class_1)) class_weights = tf.constant([factorbl, factordl]) weights = tf.gather(class_weights, label) cost = tf.losses.sparse_softmax_cross_entropy( label, logits, weights=weights ) #False positive 3* False negatives so adjust weight by factor cost = tf.reduce_mean(cost, name='cross_entropy_loss') #normalize wrong = prediction_incorrect(logits, label) # monitor training error add_moving_summary(tf.reduce_mean(wrong, name='train_error')) # weight decay on all W wd_reg = tf.constant(self.weight_decay_rate, dtype=tf.float32) wd_cost = tf.multiply(wd_reg, regularize_cost('.*/W', tf.nn.l2_loss), name='wd_cost') add_moving_summary(cost, wd_cost) add_param_summary(('.*/W', ['histogram'])) # monitor W self.cost = tf.add_n([cost, wd_cost], name='cost') return self.cost
def build_graph(self, x, image_target): with tf.name_scope("preprocess"): image_target = image_target / 255. def viz(name, images): with tf.name_scope(name): im = tf.concat(images, axis=2) #im = tf.transpose(im, [0, 2, 3, 1]) if self._act_input == tf.tanh: im = (im + 1.0) * 127.5 else: im = im * 255 im = tf.clip_by_value(im, 0, 255) im = tf.round(im) im = tf.cast(im, tf.uint8, name="viz") return im # calculate gram_target _, gram_target = self._build_extractor(image_target, name="ext_target") # inference pre_image_output from pre_image_input and gram_target self.image_outputs = list() self.loss_per_stage = list() x_output = x with tf.variable_scope("syn"): # use data stats in both train and test phases with argscope(BatchNorm, training=True): for s in range(self._n_stage): # get the first (s+1) coefs coefs = OrderedDict() for k in list(SynTexModelDesc.DEFAULT_COEFS.keys())[:s + 1]: coefs[k] = SynTexModelDesc.DEFAULT_COEFS[k] x_image, loss_input, _, x_output = \ self.build_stage(x_output, gram_target, coefs, name="stage%d" % s) self.image_outputs.append(x_image) self.loss_per_stage.append( tf.reduce_mean(loss_input, name="loss%d" % s)) self.collect_variables("syn") # image_output = self._act_input(x_output, name="output") loss_output, loss_per_layer_output, _ = \ self._build_loss(image_output, gram_target, calc_grad=False) self.image_outputs.append(image_output) self.loss_per_stage.append( tf.reduce_mean(loss_output, name="loss_output")) self.loss_per_layer_output = OrderedDict() with tf.name_scope("loss_per_layer_output"): for layer in loss_per_layer_output: self.loss_per_layer_output[layer] = tf.reduce_mean( loss_per_layer_output[layer], name=layer) # average losses from all stages weights = [1.] for _ in range(len(self.loss_per_stage) - 1): weights.append(weights[-1] * self._loss_scale) # skip the first loss as it is computed from noise self.loss = tf.add_n([weights[i] * loss \ for i, loss in enumerate(reversed(self.loss_per_stage[1:]))], name="loss") # summary #with tf.device("/cpu:0"): stages_target = viz("stages-target", self.image_outputs + [image_target]) ctx = get_current_tower_context() if ctx is not None and ctx.is_main_training_tower: tf.summary.image("stages-target", stages_target, max_outputs=10, collections=["image_summaries"]) add_moving_summary(self.loss, *self.loss_per_stage, *self.loss_per_layer_output.values())
def build_graph(self, seq, tseq): batch_size = self.bs_per_gpu dynamic_seq_len = tf.shape(seq)[1] labels = tf.reshape(tseq, [-1]) DROPOUT = 0.5 with argscope( [ Conv2D, Deconv2D, GroupedConv2D, AvgPooling, MaxPooling, BatchNorm, GlobalAvgPooling, ResizeImages, SeparableConv2D ], data_format=self.data_format ), \ argscope( [Conv2D, Deconv2D, GroupedConv2D, SeparableConv2D], activation=tf.identity, use_bias=self.options.use_bias ), \ argscope( [BatchNorm], center=False, scale=False, decay=self.options.batch_norm_decay, epsilon=self.options.batch_norm_epsilon ), \ argscope( [candidate_gated_layer], eps=self.options.candidate_gate_eps ): is_training = get_current_tower_context().is_training initializer = tf.random_uniform_initializer( -self.init_range, self.init_range) # B x seqlen x hidden seq, embedding_w = self._embed_input_if_int( seq, initializer=initializer) seq = self.locked_dropout(seq, self.keep_prob_i) hid_to_fs_params = _init_feature_select( self.layer_info_list, 'master', self.options.feat_sel_lambda) l_hallu_costs = [] self.basic_cells = basic_cells = [ self._basic_cell(initializer=initializer, hid_to_fs_params=hid_to_fs_params, l_hallu_costs=l_hallu_costs) for _ in range(self.num_lstms) ] cells = rnn.MultiRNNCell(basic_cells) self.state = tuple([ basic_cells[k].get_state_var( self.state_var_names[k], batch_size) \ for k in range(self.num_lstms) ]) self.last_state = tuple([ basic_cells[k].get_state_var( self.last_state_var_names[k] + '_last', batch_size) \ for k in range(self.num_lstms) ]) self._update_init_state_op = self.update_init_state() with tf.control_dependencies([self._update_init_state_op]): with tf.variable_scope('RNN', initializer=initializer): outputs, last_state = tf.nn.dynamic_rnn( cells, seq, initial_state=self.state, parallel_iterations=self.max_len) # for the update op self._update_last_state_op = self.update_last_state( tf.stop_gradient(last_state)) with tf.control_dependencies([self._update_last_state_op]): seqout, sum_hallu_costs = basic_cells[-1].split_outputs( outputs) seqout = self.locked_dropout(seqout, self.keep_prob) flat_seqout = tf.reshape(seqout, [-1, self.num_units]) # compute logits and prediction log loss if self.lock_embedding: logits = self.linear_with_embedding_w(flat_seqout, embedding_w) else: logits = FullyConnected('linear', flat_seqout, self.vocab_size, activation=tf.identity) logloss = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=logits, labels=labels) per_seq_logloss = tf.reduce_sum(tf.reshape(logloss, [self.bs_per_gpu, -1]), axis=1, name="per_seq_sum_logloss") cost = tf.truediv(tf.reduce_sum(logloss), tf.cast(self.bs_per_gpu, tf.float32), name='avg_batch_cost') float_seq_len = tf.cast(dynamic_seq_len, tf.float32, name='seq_len') # # tensorpack bullshits. Inferencer must use tensors # # so we have to create a tensor .... 
# test_time_udpate = self.update_state( # [per_seq_logloss], name='test_time_update') # with tf.control_dependencies([test_time_udpate]): # self._inference_update_tensor = tf.multiply( # cost, 1.0001, name=self._inference_update_tensor_name) perpl = tf.identity(tf.exp(cost / float_seq_len), name='perplexity') add_moving_summary(perpl, cost, float_seq_len) # regularization if self.rnn_l2_reg: cost += (self.rnn_l2_reg * tf.reduce_sum(seqout**2) / tf.to_float(self.bs_per_gpu)) if self.rnn_slowness_reg: assert self.t_dim == 1 all_h_diff = tf.reduce_sum( (seqout[:, 1:, :] - seqout[:, :-1, :])**2) cost += (self.rnn_slowness_reg * all_h_diff / tf.to_float(self.bs_per_gpu)) wd_w = self.options.regularize_const if self.params_to_regularize is not None and wd_w: wd_cost = wd_w * regularize_cost(self.params_to_regularize, tf.nn.l2_loss) wd_cost = tf.identity(wd_cost, name='wd_cost') add_moving_summary(wd_cost) cost += wd_cost cost = tf.identity(cost, name='rnn_reg_cost') add_moving_summary(cost) # hallucination costs if l_hallu_costs: sum_hallu_costs = tf.identity(sum_hallu_costs, name='hallu_cost') add_moving_summary(sum_hallu_costs) cost += sum_hallu_costs # this computes some gradient norms self._build_hallu_stats_graph(cost) # scale the loss according the to sequence length self.cost = tf.identity(cost * float_seq_len / np.float32(self.max_len), name='cost') add_moving_summary(self.cost) return self.cost
def QuantizedWeight(name, x, n, nbit=2): """ Quantize weight. Args: x (tf.Tensor): a 4D tensor. Must have known number of channels, but can have other unknown dimensions. name (str): operator's name. n (int or double): variance of weight initialization. nbit (int): number of bits of quantized weight. Defaults to 2. Returns: tf.Tensor with attribute `variables`. Variable Names: * ``basis``: basis of quantized weight. Note: About multi-GPU training: moving averages across GPUs are not aggregated. Batch statistics are computed by main training tower. This is consistent with most frameworks. """ num_filters = x.get_shape().as_list()[-1] init_basis = [] base = NORM_PPF_0_75 * ((2. / n)**0.5) / (2**(nbit - 1)) for j in range(nbit): init_basis.append([(2**j) * base for i in range(num_filters)]) init_basis = tf.constant_initializer(init_basis) bit_dims = [nbit, num_filters] num_levels = 2**nbit delta = EPS # initialize level multiplier init_level_multiplier = [] for i in range(num_levels): level_multiplier_i = [0. for j in range(nbit)] level_number = i for j in range(nbit): binary_code = level_number % 2 if binary_code == 0: binary_code = -1 level_multiplier_i[j] = float(binary_code) level_number = level_number // 2 init_level_multiplier.append(level_multiplier_i) # initialize threshold multiplier init_thrs_multiplier = [] for i in range(1, num_levels): thrs_multiplier_i = [0. for j in range(num_levels)] thrs_multiplier_i[i - 1] = 0.5 thrs_multiplier_i[i] = 0.5 init_thrs_multiplier.append(thrs_multiplier_i) with tf.variable_scope(name): basis = tf.get_variable('basis', bit_dims, tf.float32, initializer=init_basis, trainable=False) level_codes = tf.constant(init_level_multiplier) thrs_multiplier = tf.constant( init_thrs_multiplier ) # ValueError: Cannot create a tensor proto whose content is larger than 2GB. sum_multiplier = tf.constant( 1., shape=[1, tf.reshape(x, [-1, num_filters]).get_shape()[0]]) sum_multiplier_basis = tf.constant(1., shape=[1, nbit]) # calculate levels and sort levels = tf.matmul(level_codes, basis) levels, sort_id = tf.nn.top_k(tf.transpose(levels, [1, 0]), num_levels) levels = tf.reverse(levels, [-1]) sort_id = tf.reverse(sort_id, [-1]) levels = tf.transpose(levels, [1, 0]) sort_id = tf.transpose(sort_id, [1, 0]) # calculate threshold thrs = tf.matmul(thrs_multiplier, levels) # calculate level codes per channel reshape_x = tf.reshape(x, [-1, num_filters]) level_codes_channelwise_dims = tf.stack( [num_levels * num_filters, nbit]) level_codes_channelwise = tf.fill(level_codes_channelwise_dims, 0.) for i in range(num_levels): eq = tf.equal(sort_id, i) level_codes_channelwise = tf.where( tf.reshape(eq, [-1]), level_codes_channelwise + level_codes[i], level_codes_channelwise) level_codes_channelwise = tf.reshape(level_codes_channelwise, [num_levels, num_filters, nbit]) # calculate output y and its binary code y = tf.zeros_like(x) + levels[0] # output zero_dims = tf.stack([tf.shape(reshape_x)[0] * num_filters, nbit]) bits_y = tf.fill(zero_dims, -1.) zero_y = tf.zeros_like(x) zero_bits_y = tf.fill(zero_dims, 0.) 
zero_bits_y = tf.reshape(zero_bits_y, [-1, num_filters, nbit]) for i in range(num_levels - 1): g = tf.greater(x, thrs[i]) y = tf.where(g, zero_y + levels[i + 1], y) bits_y = tf.where( tf.reshape(g, [-1]), tf.reshape(zero_bits_y + level_codes_channelwise[i + 1], [-1, nbit]), bits_y) bits_y = tf.reshape(bits_y, [-1, num_filters, nbit]) ctx = get_current_tower_context() # current tower context # training if ctx.is_main_training_tower: BT = tf.transpose(bits_y, [2, 0, 1]) # calculate BTxB BTxB = [] for i in range(nbit): for j in range(nbit): BTxBij = tf.multiply(BT[i], BT[j]) BTxBij = tf.matmul(sum_multiplier, BTxBij) if i == j: mat_one = tf.ones([1, num_filters]) BTxBij = BTxBij + (delta * mat_one) # + E BTxB.append(BTxBij) BTxB = tf.reshape(tf.stack(values=BTxB), [nbit, nbit, num_filters]) # calculate inverse of BTxB if nbit > 2: BTxB_transpose = tf.transpose(BTxB, [2, 0, 1]) # 1) naive # BTxB_inv = tf.matrix_inverse(BTxB_transpose) # 2) try, except try: BTxB_inv = tf.matrix_inverse(BTxB_transpose, adjoint=None, name=None) except: BTxB_ttt = tf.add( BTxB_transpose, tf.math.scalar_mul(tf.identity((BTxB_transpose.shape)), 1e-6)) BTxB_inv = tf.matrix_inverse(BTxB_ttt, adjoint=None, name=None) BTxB_inv = tf.transpose(BTxB_inv, [1, 2, 0]) elif nbit == 2: det = tf.multiply(BTxB[0][0], BTxB[1][1]) - tf.multiply( BTxB[0][1], BTxB[1][0]) inv = [] inv.append(BTxB[1][1] / det) inv.append(-BTxB[0][1] / det) inv.append(-BTxB[1][0] / det) inv.append(BTxB[0][0] / det) BTxB_inv = tf.reshape(tf.stack(values=inv), [nbit, nbit, num_filters]) elif nbit == 1: BTxB_inv = tf.reciprocal(BTxB) # calculate BTxX BTxX = [] for i in range(nbit): BTxXi0 = tf.multiply(BT[i], reshape_x) BTxXi0 = tf.matmul(sum_multiplier, BTxXi0) BTxX.append(BTxXi0) BTxX = tf.reshape(tf.stack(values=BTxX), [nbit, num_filters]) BTxX = BTxX + (delta * basis) # + basis # calculate new basis new_basis = [] for i in range(nbit): new_basis_i = tf.multiply(BTxB_inv[i], BTxX) new_basis_i = tf.matmul(sum_multiplier_basis, new_basis_i) add_moving_summary( tf.reduce_mean(new_basis_i, name='new_basis_bit' + str(i))) new_basis.append(new_basis_i) new_basis = tf.reshape(tf.stack(values=new_basis), [nbit, num_filters]) # create moving averages op updata_moving_basis = moving_averages.assign_moving_average( basis, new_basis, MOVING_AVERAGES_FACTOR) add_model_variable(basis) tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, updata_moving_basis) # add_moving_summary(tf.identity(basis, name='basis'), tf.identity(new_basis, name='basis_new')) # add_moving_summary(tf.identity(basis, name='basis')) y = x + tf.stop_gradient(-x) + tf.stop_gradient(y) # gradient: y=x y.variables = VariableHolder(basis=basis) return y
def QuantizedActiv(x, nbit=2): """ Quantize activation. Args: x (tf.Tensor): a 4D tensor. nbit (int): number of bits of quantized activation. Defaults to 2. Returns: tf.Tensor with attribute `variables`. Variable Names: * ``basis``: basis of quantized activation. Note: About multi-GPU training: moving averages across GPUs are not aggregated. Batch statistics are computed by main training tower. This is consistent with most frameworks. """ init_basis = [(NORM_PPF_0_75 * 2 / (2**nbit - 1)) * (2.**i) for i in range(nbit)] init_basis = tf.constant_initializer(init_basis) bit_dims = [nbit, 1] num_levels = 2**nbit # initialize level multiplier init_level_multiplier = [] for i in range(0, num_levels): level_multiplier_i = [0. for j in range(nbit)] level_number = i for j in range(nbit): level_multiplier_i[j] = float(level_number % 2) level_number = level_number // 2 init_level_multiplier.append(level_multiplier_i) # initialize threshold multiplier init_thrs_multiplier = [] for i in range(1, num_levels): thrs_multiplier_i = [0. for j in range(num_levels)] thrs_multiplier_i[i - 1] = 0.5 thrs_multiplier_i[i] = 0.5 init_thrs_multiplier.append(thrs_multiplier_i) with tf.variable_scope('ActivationQuantization'): basis = tf.get_variable('basis', bit_dims, tf.float32, initializer=init_basis, trainable=False) ctx = get_current_tower_context() # current tower context # calculate levels and sort level_codes = tf.constant(init_level_multiplier) levels = tf.matmul(level_codes, basis) levels, sort_id = tf.nn.top_k(tf.transpose(levels, [1, 0]), num_levels) levels = tf.reverse(levels, [-1]) sort_id = tf.reverse(sort_id, [-1]) levels = tf.transpose(levels, [1, 0]) sort_id = tf.transpose(sort_id, [1, 0]) # calculate threshold thrs_multiplier = tf.constant(init_thrs_multiplier) thrs = tf.matmul(thrs_multiplier, levels) # calculate output y and its binary code y = tf.zeros_like(x) # output reshape_x = tf.reshape(x, [-1]) zero_dims = tf.stack([tf.shape(reshape_x)[0], nbit]) bits_y = tf.fill(zero_dims, 0.) zero_y = tf.zeros_like(x) zero_bits_y = tf.fill(zero_dims, 0.) for i in range(num_levels - 1): g = tf.greater(x, thrs[i]) y = tf.where(g, zero_y + levels[i + 1], y) bits_y = tf.where(tf.reshape(g, [-1]), zero_bits_y + level_codes[sort_id[i + 1][0]], bits_y) # training if ctx.is_main_training_tower: BT = tf.matrix_transpose(bits_y) # calculate BTxB BTxB = [] for i in range(nbit): for j in range(nbit): BTxBij = tf.multiply(BT[i], BT[j]) BTxBij = tf.reduce_sum(BTxBij) # all dimensions are reduced, and a tensor with a single element is returned. i.e. 
6 BTxB.append(BTxBij) BTxB = tf.reshape(tf.stack(values=BTxB), [nbit, nbit]) # 1) naive # BTxB_inv = tf.matrix_inverse(BTxB) # 2) try excpet ->doesn't work well due to poor tf.matrix_inverse # try: # BTxB_inv = tf.matrix_inverse(BTxB, adjoint=None, name=None) # except: # BTxB_ttt = tf.add(BTxB, tf.math.scalar_mul(tf.identity((BTxB.shape)), 1e-4)) # BTxB_inv = tf.matrix_inverse(BTxB_ttt, adjoint=None, name=None) # calculate BTxX BTxX = [] for i in range(nbit): BTxXi0 = tf.multiply(BT[i], reshape_x) BTxXi0 = tf.reduce_sum(BTxXi0) BTxX.append(BTxXi0) BTxX = tf.reshape(tf.stack(values=BTxX), [nbit, 1]) # new_basis = tf.matmul(BTxB_inv, BTxX) # calculate new basis # 3) gaussian elimination new_basis = tf.linalg.lstsq(BTxB, BTxX, fast=False, l2_regularizer=1e-5) # create moving averages op updata_moving_basis = moving_averages.assign_moving_average( basis, new_basis, MOVING_AVERAGES_FACTOR) add_model_variable(basis) tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, updata_moving_basis) for i in range(nbit): tf.summary.scalar('basis%d' % i, new_basis[i][0]) x_clip = tf.minimum(x, levels[num_levels - 1]) # gradient clip y = x_clip + tf.stop_gradient(-x_clip) + tf.stop_gradient( y) # gradient: y=clip(x) y.variables = VariableHolder(basis=basis) return y
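# Usage sketch for QuantizedActiv above (hedged: `l` is an arbitrary activation
# tensor and a tensorpack tower context is assumed so the basis update only runs
# on the main training tower). The forward pass emits the quantized levels; the
# backward pass behaves like a clipped identity.
l = tf.placeholder(tf.float32, [None, 56, 56, 64])
l = QuantizedActiv(l, nbit=2)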