def __init__(self, config: GroverConfig, is_training, input_ids, cache=None,
             do_cache=False, pad_token_id=0, chop_off_last_token=True,
             scope=None, reuse=False):
    """
    :param config: GroverConfig with the model hyperparameters.
    :param is_training: Whether the model is being trained (enables dropout).
    :param input_ids: Tensor of shape [batch_size, seq_length].
    :param cache: Optionally, a tensor of cached key/value information with shape
        [batch_size, num_layers, 2, num_heads, cache_length, features].
    :param do_cache: Whether to cache again.
    :param pad_token_id: Which token will be used for padding (probably 0).
    :param chop_off_last_token: True if we will end up using this for TRAINING only,
        False if we want to generate. If True, the last token in input_ids will not
        be processed by the model as input.
    :param scope: scope to run this on.
    """
    self.config = copy.deepcopy(config)
    self.is_training = is_training
    self.pad_token_id = pad_token_id

    if not is_training:
        self.config.hidden_dropout_prob = 0.0
        self.config.attention_probs_dropout_prob = 0.0

    if chop_off_last_token:
        self.target_ids = input_ids[:, 1:]
        self.input_ids = input_ids[:, :-1]
    else:
        self.input_ids = input_ids
        self.target_ids = tf.concat(
            (input_ids[:, 1:],
             tf.constant(self.pad_token_id, dtype=self.input_ids.dtype,
                         shape=[get_shape_list(self.input_ids, 2)[0], 1])), 1)

    self.batch_size, self.seq_length = get_shape_list(self.input_ids, 2)

    if cache is None:
        caches = [None] * config.num_hidden_layers
        self.cache_length = 0
    else:
        (batch_size_, num_layers_, two_, num_heads_,
         self.cache_length, features_) = get_shape_list(cache, expected_rank=6)
        assert batch_size_ == self.batch_size
        assert num_layers_ == config.num_hidden_layers
        assert two_ == 2
        assert num_heads_ == config.num_attention_heads
        assert features_ == (config.hidden_size // config.num_attention_heads)
        caches = tf.unstack(cache, axis=1)

    with tf.variable_scope(scope, default_name='newslm', reuse=reuse):
        with tf.variable_scope("embeddings"):
            embeddings, self.embedding_table = embed(
                self.input_ids, config.vocab_size, config.hidden_size,
                position_offset=self.cache_length,
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                use_one_hot_embeddings=True)

        mask = get_attention_mask(self.seq_length,
                                  self.seq_length + self.cache_length,
                                  dtype=embeddings.dtype)

        # We keep the representation as a 2D tensor to avoid re-shaping it back and
        # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
        # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
        # help the optimizer.
        hidden_state = tf.reshape(embeddings,
                                  [self.batch_size * self.seq_length,
                                   self.config.hidden_size])
        new_kvs = []
        for layer_idx, layer_cache in enumerate(caches):
            with tf.variable_scope('layer{:02d}'.format(layer_idx)):
                # [batch_size * seq_length, hidden_size]
                attention_output, new_kv = attention_layer(
                    hidden_state,
                    mask,
                    batch_size=self.batch_size,
                    seq_length=self.seq_length,
                    size_per_head=config.hidden_size // config.num_attention_heads,
                    num_attention_heads=config.num_attention_heads,
                    initializer_range=config.initializer_range,
                    hidden_dropout_prob=self.config.hidden_dropout_prob,
                    attention_probs_dropout_prob=self.config.attention_probs_dropout_prob,
                    do_cache=do_cache,
                    cache=layer_cache,
                )
                new_kvs.append(new_kv)
                # [batch_size * seq_length, hidden_size]
                hidden_state = residual_mlp_layer(
                    hidden_state + attention_output,
                    intermediate_size=config.intermediate_size,
                    hidden_dropout_prob=self.config.hidden_dropout_prob)

        self.hidden_state = hidden_state

    self.new_kvs = tf.stack(new_kvs, axis=1) if do_cache else None

    # Note that the hidden state is still flat: [batch_size * seq_length, hidden_size].
    self.logits_flat = tf.matmul(self.hidden_state, self.embedding_table,
                                 transpose_b=True)
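# The following is a hypothetical usage sketch, not part of the original model
# file: it assumes the enclosing class is named GroverModel, that GroverConfig
# accepts these keyword arguments, and it ignores pad-token masking in the
# loss for brevity.
def _grover_usage_sketch():
    """Minimal sketch: build a training loss around the constructor above."""
    config = GroverConfig(vocab_size=50270, hidden_size=768,
                          num_hidden_layers=12, num_attention_heads=12)
    input_ids = tf.placeholder(tf.int32, [None, 1025], name='input_ids')
    # chop_off_last_token=True: the model consumes tokens [0..n-2] and predicts
    # targets [1..n-1], so logits_flat is [batch * (n - 1), vocab_size].
    model = GroverModel(config, is_training=True, input_ids=input_ids,
                        chop_off_last_token=True)
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=tf.reshape(model.target_ids, [-1]),
            logits=model.logits_flat))
    return loss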
def inception_v3(inputs,
                 dropout_keep_prob=0.8,
                 num_classes=1000,
                 is_training=True,
                 restore_logits=True,
                 scope=''):
    """Latest Inception from http://arxiv.org/abs/1512.00567.

    "Rethinking the Inception Architecture for Computer Vision"
    Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens,
    Zbigniew Wojna

    Args:
        inputs: a tensor of size [batch_size, height, width, channels].
        dropout_keep_prob: dropout keep_prob.
        num_classes: number of predicted classes.
        is_training: whether is training or not.
        restore_logits: whether or not the logits layers should be restored.
            Useful for fine-tuning a model with different num_classes.
        scope: Optional scope for name_scope.

    Returns:
        logits: the pre-softmax activations of the final layer.
        end_points: a dictionary of relevant activations, including
            'aux_logits' and 'predictions'.
    """
    # end_points will collect relevant activations for external use, for
    # example summaries or losses.
    end_points = {}
    with tf.name_scope(scope, 'inception_v3', [inputs]):
        with scopes.arg_scope([ops.conv2d, ops.fc, ops.batch_norm, ops.dropout],
                              is_training=is_training):
            with scopes.arg_scope([ops.conv2d, ops.max_pool, ops.avg_pool],
                                  stride=1, padding='VALID'):
                # 299 x 299 x 3
                end_points['conv0'] = ops.conv2d(inputs, 32, [3, 3], stride=2,
                                                 scope='conv0')
                # 149 x 149 x 32
                end_points['conv1'] = ops.conv2d(end_points['conv0'], 32, [3, 3],
                                                 scope='conv1')
                # 147 x 147 x 32
                end_points['conv2'] = ops.conv2d(end_points['conv1'], 64, [3, 3],
                                                 padding='SAME', scope='conv2')
                # 147 x 147 x 64
                end_points['pool1'] = ops.max_pool(end_points['conv2'], [3, 3],
                                                   stride=2, scope='pool1')
                # 73 x 73 x 64
                end_points['conv3'] = ops.conv2d(end_points['pool1'], 80, [1, 1],
                                                 scope='conv3')
                # 73 x 73 x 80.
                end_points['conv4'] = ops.conv2d(end_points['conv3'], 192, [3, 3],
                                                 scope='conv4')
                # 71 x 71 x 192.
                end_points['pool2'] = ops.max_pool(end_points['conv4'], [3, 3],
                                                   stride=2, scope='pool2')
                # 35 x 35 x 192.
                net = end_points['pool2']
            # Inception blocks
            with scopes.arg_scope([ops.conv2d, ops.max_pool, ops.avg_pool],
                                  stride=1, padding='SAME'):
                # mixed: 35 x 35 x 256.
                with tf.variable_scope('mixed_35x35x256a'):
                    with tf.variable_scope('branch1x1'):
                        branch1x1 = ops.conv2d(net, 64, [1, 1])
                    with tf.variable_scope('branch5x5'):
                        branch5x5 = ops.conv2d(net, 48, [1, 1])
                        branch5x5 = ops.conv2d(branch5x5, 64, [5, 5])
                    with tf.variable_scope('branch3x3dbl'):
                        branch3x3dbl = ops.conv2d(net, 64, [1, 1])
                        branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3])
                        branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3])
                    with tf.variable_scope('branch_pool'):
                        branch_pool = ops.avg_pool(net, [3, 3])
                        branch_pool = ops.conv2d(branch_pool, 32, [1, 1])
                    net = tf.concat([branch1x1, branch5x5, branch3x3dbl, branch_pool], 3)
                    end_points['mixed_35x35x256a'] = net
                # mixed_1: 35 x 35 x 288.
                with tf.variable_scope('mixed_35x35x288a'):
                    with tf.variable_scope('branch1x1'):
                        branch1x1 = ops.conv2d(net, 64, [1, 1])
                    with tf.variable_scope('branch5x5'):
                        branch5x5 = ops.conv2d(net, 48, [1, 1])
                        branch5x5 = ops.conv2d(branch5x5, 64, [5, 5])
                    with tf.variable_scope('branch3x3dbl'):
                        branch3x3dbl = ops.conv2d(net, 64, [1, 1])
                        branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3])
                        branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3])
                    with tf.variable_scope('branch_pool'):
                        branch_pool = ops.avg_pool(net, [3, 3])
                        branch_pool = ops.conv2d(branch_pool, 64, [1, 1])
                    net = tf.concat([branch1x1, branch5x5, branch3x3dbl, branch_pool], 3)
                    end_points['mixed_35x35x288a'] = net
                # mixed_2: 35 x 35 x 288.
                with tf.variable_scope('mixed_35x35x288b'):
                    with tf.variable_scope('branch1x1'):
                        branch1x1 = ops.conv2d(net, 64, [1, 1])
                    with tf.variable_scope('branch5x5'):
                        branch5x5 = ops.conv2d(net, 48, [1, 1])
                        branch5x5 = ops.conv2d(branch5x5, 64, [5, 5])
                    with tf.variable_scope('branch3x3dbl'):
                        branch3x3dbl = ops.conv2d(net, 64, [1, 1])
                        branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3])
                        branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3])
                    with tf.variable_scope('branch_pool'):
                        branch_pool = ops.avg_pool(net, [3, 3])
                        branch_pool = ops.conv2d(branch_pool, 64, [1, 1])
                    net = tf.concat([branch1x1, branch5x5, branch3x3dbl, branch_pool], 3)
                    end_points['mixed_35x35x288b'] = net
                # mixed_3: 17 x 17 x 768.
                with tf.variable_scope('mixed_17x17x768a'):
                    with tf.variable_scope('branch3x3'):
                        branch3x3 = ops.conv2d(net, 384, [3, 3], stride=2,
                                               padding='VALID')
                    with tf.variable_scope('branch3x3dbl'):
                        branch3x3dbl = ops.conv2d(net, 64, [1, 1])
                        branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3])
                        branch3x3dbl = ops.conv2d(branch3x3dbl, 96, [3, 3],
                                                  stride=2, padding='VALID')
                    with tf.variable_scope('branch_pool'):
                        branch_pool = ops.max_pool(net, [3, 3], stride=2,
                                                   padding='VALID')
                    net = tf.concat([branch3x3, branch3x3dbl, branch_pool], 3)
                    end_points['mixed_17x17x768a'] = net
                # mixed4: 17 x 17 x 768.
                with tf.variable_scope('mixed_17x17x768b'):
                    with tf.variable_scope('branch1x1'):
                        branch1x1 = ops.conv2d(net, 192, [1, 1])
                    with tf.variable_scope('branch7x7'):
                        branch7x7 = ops.conv2d(net, 128, [1, 1])
                        branch7x7 = ops.conv2d(branch7x7, 128, [1, 7])
                        branch7x7 = ops.conv2d(branch7x7, 192, [7, 1])
                    with tf.variable_scope('branch7x7dbl'):
                        branch7x7dbl = ops.conv2d(net, 128, [1, 1])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 128, [7, 1])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 128, [1, 7])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 128, [7, 1])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [1, 7])
                    with tf.variable_scope('branch_pool'):
                        branch_pool = ops.avg_pool(net, [3, 3])
                        branch_pool = ops.conv2d(branch_pool, 192, [1, 1])
                    net = tf.concat([branch1x1, branch7x7, branch7x7dbl, branch_pool], 3)
                    end_points['mixed_17x17x768b'] = net
                # mixed_5: 17 x 17 x 768.
                with tf.variable_scope('mixed_17x17x768c'):
                    with tf.variable_scope('branch1x1'):
                        branch1x1 = ops.conv2d(net, 192, [1, 1])
                    with tf.variable_scope('branch7x7'):
                        branch7x7 = ops.conv2d(net, 160, [1, 1])
                        branch7x7 = ops.conv2d(branch7x7, 160, [1, 7])
                        branch7x7 = ops.conv2d(branch7x7, 192, [7, 1])
                    with tf.variable_scope('branch7x7dbl'):
                        branch7x7dbl = ops.conv2d(net, 160, [1, 1])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 160, [7, 1])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 160, [1, 7])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 160, [7, 1])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [1, 7])
                    with tf.variable_scope('branch_pool'):
                        branch_pool = ops.avg_pool(net, [3, 3])
                        branch_pool = ops.conv2d(branch_pool, 192, [1, 1])
                    net = tf.concat([branch1x1, branch7x7, branch7x7dbl, branch_pool], 3)
                    end_points['mixed_17x17x768c'] = net
                # mixed_6: 17 x 17 x 768.
                with tf.variable_scope('mixed_17x17x768d'):
                    with tf.variable_scope('branch1x1'):
                        branch1x1 = ops.conv2d(net, 192, [1, 1])
                    with tf.variable_scope('branch7x7'):
                        branch7x7 = ops.conv2d(net, 160, [1, 1])
                        branch7x7 = ops.conv2d(branch7x7, 160, [1, 7])
                        branch7x7 = ops.conv2d(branch7x7, 192, [7, 1])
                    with tf.variable_scope('branch7x7dbl'):
                        branch7x7dbl = ops.conv2d(net, 160, [1, 1])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 160, [7, 1])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 160, [1, 7])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 160, [7, 1])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [1, 7])
                    with tf.variable_scope('branch_pool'):
                        branch_pool = ops.avg_pool(net, [3, 3])
                        branch_pool = ops.conv2d(branch_pool, 192, [1, 1])
                    net = tf.concat([branch1x1, branch7x7, branch7x7dbl, branch_pool], 3)
                    end_points['mixed_17x17x768d'] = net
                # mixed_7: 17 x 17 x 768.
                with tf.variable_scope('mixed_17x17x768e'):
                    with tf.variable_scope('branch1x1'):
                        branch1x1 = ops.conv2d(net, 192, [1, 1])
                    with tf.variable_scope('branch7x7'):
                        branch7x7 = ops.conv2d(net, 192, [1, 1])
                        branch7x7 = ops.conv2d(branch7x7, 192, [1, 7])
                        branch7x7 = ops.conv2d(branch7x7, 192, [7, 1])
                    with tf.variable_scope('branch7x7dbl'):
                        branch7x7dbl = ops.conv2d(net, 192, [1, 1])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [7, 1])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [1, 7])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [7, 1])
                        branch7x7dbl = ops.conv2d(branch7x7dbl, 192, [1, 7])
                    with tf.variable_scope('branch_pool'):
                        branch_pool = ops.avg_pool(net, [3, 3])
                        branch_pool = ops.conv2d(branch_pool, 192, [1, 1])
                    net = tf.concat([branch1x1, branch7x7, branch7x7dbl, branch_pool], 3)
                    end_points['mixed_17x17x768e'] = net
                # Auxiliary Head logits
                aux_logits = tf.identity(end_points['mixed_17x17x768e'])
                with tf.variable_scope('aux_logits'):
                    aux_logits = ops.avg_pool(aux_logits, [5, 5], stride=3,
                                              padding='VALID')
                    aux_logits = ops.conv2d(aux_logits, 128, [1, 1], scope='proj')
                    # Shape of feature map before the final layer.
                    shape = aux_logits.get_shape()
                    aux_logits = ops.conv2d(aux_logits, 768, shape[1:3],
                                            stddev=0.01, padding='VALID')
                    aux_logits = ops.flatten(aux_logits)
                    aux_logits = ops.fc(aux_logits, num_classes, activation=None,
                                        stddev=0.001, restore=restore_logits)
                    end_points['aux_logits'] = aux_logits
                # mixed_8: 8 x 8 x 1280.
                # The scope below is left unchanged so as not to invalidate
                # previously trained checkpoints.
                # (TODO) Fix the scope when appropriate.
                with tf.variable_scope('mixed_17x17x1280a'):
                    with tf.variable_scope('branch3x3'):
                        branch3x3 = ops.conv2d(net, 192, [1, 1])
                        branch3x3 = ops.conv2d(branch3x3, 320, [3, 3], stride=2,
                                               padding='VALID')
                    with tf.variable_scope('branch7x7x3'):
                        branch7x7x3 = ops.conv2d(net, 192, [1, 1])
                        branch7x7x3 = ops.conv2d(branch7x7x3, 192, [1, 7])
                        branch7x7x3 = ops.conv2d(branch7x7x3, 192, [7, 1])
                        branch7x7x3 = ops.conv2d(branch7x7x3, 192, [3, 3],
                                                 stride=2, padding='VALID')
                    with tf.variable_scope('branch_pool'):
                        branch_pool = ops.max_pool(net, [3, 3], stride=2,
                                                   padding='VALID')
                    net = tf.concat([branch3x3, branch7x7x3, branch_pool], 3)
                    end_points['mixed_17x17x1280a'] = net
                # mixed_9: 8 x 8 x 2048.
                with tf.variable_scope('mixed_8x8x2048a'):
                    with tf.variable_scope('branch1x1'):
                        branch1x1 = ops.conv2d(net, 320, [1, 1])
                    with tf.variable_scope('branch3x3'):
                        branch3x3 = ops.conv2d(net, 384, [1, 1])
                        branch3x3 = tf.concat([ops.conv2d(branch3x3, 384, [1, 3]),
                                               ops.conv2d(branch3x3, 384, [3, 1])], 3)
                    with tf.variable_scope('branch3x3dbl'):
                        branch3x3dbl = ops.conv2d(net, 448, [1, 1])
                        branch3x3dbl = ops.conv2d(branch3x3dbl, 384, [3, 3])
                        branch3x3dbl = tf.concat(
                            [ops.conv2d(branch3x3dbl, 384, [1, 3]),
                             ops.conv2d(branch3x3dbl, 384, [3, 1])], 3)
                    with tf.variable_scope('branch_pool'):
                        branch_pool = ops.avg_pool(net, [3, 3])
                        branch_pool = ops.conv2d(branch_pool, 192, [1, 1])
                    net = tf.concat([branch1x1, branch3x3, branch3x3dbl, branch_pool], 3)
                    end_points['mixed_8x8x2048a'] = net
                # mixed_10: 8 x 8 x 2048.
                with tf.variable_scope('mixed_8x8x2048b'):
                    with tf.variable_scope('branch1x1'):
                        branch1x1 = ops.conv2d(net, 320, [1, 1])
                    with tf.variable_scope('branch3x3'):
                        branch3x3 = ops.conv2d(net, 384, [1, 1])
                        branch3x3 = tf.concat([ops.conv2d(branch3x3, 384, [1, 3]),
                                               ops.conv2d(branch3x3, 384, [3, 1])], 3)
                    with tf.variable_scope('branch3x3dbl'):
                        branch3x3dbl = ops.conv2d(net, 448, [1, 1])
                        branch3x3dbl = ops.conv2d(branch3x3dbl, 384, [3, 3])
                        branch3x3dbl = tf.concat(
                            [ops.conv2d(branch3x3dbl, 384, [1, 3]),
                             ops.conv2d(branch3x3dbl, 384, [3, 1])], 3)
                    with tf.variable_scope('branch_pool'):
                        branch_pool = ops.avg_pool(net, [3, 3])
                        branch_pool = ops.conv2d(branch_pool, 192, [1, 1])
                    net = tf.concat([branch1x1, branch3x3, branch3x3dbl, branch_pool], 3)
                    end_points['mixed_8x8x2048b'] = net
                # Final pooling and prediction
                with tf.variable_scope('logits'):
                    shape = net.get_shape()
                    net = ops.avg_pool(net, shape[1:3], padding='VALID',
                                       scope='pool')
                    # 1 x 1 x 2048
                    net = ops.dropout(net, dropout_keep_prob, scope='dropout')
                    net = ops.flatten(net, scope='flatten')
                    # 2048
                    logits = ops.fc(net, num_classes, activation=None,
                                    scope='logits', restore=restore_logits)
                    # 1000
                    end_points['logits'] = logits
                    end_points['predictions'] = tf.nn.softmax(logits,
                                                              name='predictions')
    return logits, end_points
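# Hypothetical sketch (not in the original file): wiring the two heads of the
# inception_v3 above into a single training loss. `images`/`labels` are
# assumed placeholders, and the 0.4 auxiliary weight follows the weighting
# described in the Inception-v3 paper.
def _inception_v3_loss_sketch():
    images = tf.placeholder(tf.float32, [32, 299, 299, 3])
    labels = tf.placeholder(tf.int32, [32])
    logits, end_points = inception_v3(images, num_classes=1000,
                                      is_training=True)
    main_loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=labels, logits=logits))
    aux_loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=labels, logits=end_points['aux_logits']))
    # The auxiliary classifier is down-weighted relative to the main head.
    return main_loss + 0.4 * aux_loss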
def expanded_conv(input_tensor,
                  num_outputs,
                  expansion_size=expand_input_by_factor(6),
                  stride=1,
                  rate=1,
                  kernel_size=(3, 3),
                  residual=True,
                  normalizer_fn=None,
                  split_projection=1,
                  split_expansion=1,
                  expansion_transform=None,
                  depthwise_location='expansion',
                  depthwise_channel_multiplier=1,
                  endpoints=None,
                  use_explicit_padding=False,
                  padding='SAME',
                  scope=None):
    """Depthwise Convolution Block with expansion.

    Builds a composite convolution that has the following structure:
    expansion (1x1) -> depthwise (kernel_size) -> projection (1x1)

    Args:
        input_tensor: input
        num_outputs: number of outputs in the final layer.
        expansion_size: the size of expansion; can be a constant or a callable.
            If the latter, it will be provided 'num_inputs' as an input. For
            forward compatibility it should accept arbitrary keyword arguments.
            The default expands the input by a factor of 6.
        stride: depthwise stride
        rate: depthwise rate
        kernel_size: depthwise kernel
        residual: whether to include a residual connection between input and
            output.
        normalizer_fn: batchnorm or otherwise
        split_projection: how many ways to split the projection operator (that
            is, the conv expansion->bottleneck)
        split_expansion: how many ways to split the expansion op (that is, the
            conv bottleneck->expansion); ops will keep depth divisible by this
            value.
        expansion_transform: Optional function that takes expansion as a single
            input and returns output.
        depthwise_location: where to put the depthwise convolutions; supported
            values are None, 'input', 'output', 'expansion'
        depthwise_channel_multiplier: depthwise channel multiplier: each input
            channel will be replicated (with different filters) that many
            times, so an input with c channels yields
            c x depthwise_channel_multiplier output channels.
        endpoints: An optional dictionary into which intermediate endpoints are
            placed. The keys "expansion_output", "depthwise_output",
            "projection_output" and "expansion_transform" are always populated,
            even if the corresponding functions are not invoked.
        use_explicit_padding: Use 'VALID' padding for convolutions, but prepad
            inputs so that the output dimensions are the same as if 'SAME'
            padding were used.
        padding: Padding type to use if `use_explicit_padding` is not set.
        scope: optional scope.

    Returns:
        Tensor of depth num_outputs

    Raises:
        TypeError: on invalid arguments.
    """
    with tf.variable_scope(scope, default_name='expanded_conv') as s, \
            tf.name_scope(s.original_name_scope):
        prev_depth = input_tensor.get_shape().as_list()[3]
        if depthwise_location not in [None, 'input', 'output', 'expansion']:
            raise TypeError('%r is unknown value for depthwise_location' %
                            depthwise_location)
        if use_explicit_padding:
            if padding != 'SAME':
                raise TypeError('`use_explicit_padding` should only be used with '
                                '"SAME" padding.')
            padding = 'VALID'
        depthwise_func = functools.partial(
            slim.separable_conv2d,
            num_outputs=None,
            kernel_size=kernel_size,
            depth_multiplier=depthwise_channel_multiplier,
            stride=stride,
            rate=rate,
            normalizer_fn=normalizer_fn,
            padding=padding,
            scope='depthwise')
        # b1 -> b2 * r -> b2
        #   i -> (o * r) (bottleneck) -> o
        input_tensor = tf.identity(input_tensor, 'input')
        net = input_tensor

        if depthwise_location == 'input':
            if use_explicit_padding:
                net = _fixed_padding(net, kernel_size, rate)
            net = depthwise_func(net, activation_fn=None)

        if callable(expansion_size):
            inner_size = expansion_size(num_inputs=prev_depth)
        else:
            inner_size = expansion_size

        if inner_size > net.shape[3]:
            net = split_conv(
                net,
                inner_size,
                num_ways=split_expansion,
                scope='expand',
                stride=1,
                normalizer_fn=normalizer_fn)
            net = tf.identity(net, 'expansion_output')
        if endpoints is not None:
            endpoints['expansion_output'] = net

        if depthwise_location == 'expansion':
            if use_explicit_padding:
                net = _fixed_padding(net, kernel_size, rate)
            net = depthwise_func(net)

        net = tf.identity(net, name='depthwise_output')
        if endpoints is not None:
            endpoints['depthwise_output'] = net

        if expansion_transform:
            net = expansion_transform(expansion_tensor=net,
                                      input_tensor=input_tensor)
        # Note that, in contrast with expansion, we always have a projection
        # to produce the desired output size.
        net = split_conv(
            net,
            num_outputs,
            num_ways=split_projection,
            stride=1,
            scope='project',
            normalizer_fn=normalizer_fn,
            activation_fn=tf.identity)
        if endpoints is not None:
            endpoints['projection_output'] = net

        if depthwise_location == 'output':
            if use_explicit_padding:
                net = _fixed_padding(net, kernel_size, rate)
            net = depthwise_func(net, activation_fn=None)

        if callable(residual):  # custom residual
            net = residual(input_tensor=input_tensor, output_tensor=net)
        elif (residual and
              # stride check enforces that we don't add residuals when spatial
              # dimensions are None
              stride == 1 and
              # Depth matches
              net.get_shape().as_list()[3] ==
              input_tensor.get_shape().as_list()[3]):
            net += input_tensor
        return tf.identity(net, name='output')
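# Hypothetical sketch (not in the original file): stacking expanded_conv
# blocks MobileNetV2-style. It assumes slim and the expand_input_by_factor
# helper referenced in the signature above are importable in this module.
def _expanded_conv_stack_sketch(images):
    net = slim.conv2d(images, 32, [3, 3], stride=2, scope='stem')
    # The first bottleneck traditionally uses expansion factor 1 ...
    net = expanded_conv(net, num_outputs=16,
                        expansion_size=expand_input_by_factor(1, divisible_by=1),
                        scope='expanded_conv_0')
    # ... and later blocks use the default factor of 6, with stride 2 when
    # downsampling (the residual is skipped automatically for stride > 1).
    net = expanded_conv(net, num_outputs=24, stride=2, scope='expanded_conv_1')
    net = expanded_conv(net, num_outputs=24, stride=1, scope='expanded_conv_2')
    return net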
def __init__(self, sess, ob_space, action_space, nbatch, nsteps, reuse=False):
    # This will be used to initialize our kernels
    gain = np.sqrt(2)

    # Based on the action space, select which probability distribution type to
    # use to distribute actions in our stochastic policy (in our case
    # DiagGaussianPdType, aka diagonal Gaussian, a 3D normal distribution).
    self.pdtype = make_pdtype(action_space)

    height, width, channel = ob_space.shape
    ob_shape = (height, width, channel)

    # Create the input placeholder
    inputs_ = tf.placeholder(tf.float32, [None, *ob_shape], name="input")

    # Normalize the images
    scaled_images = tf.cast(inputs_, tf.float32) / 255.

    """
    Build the model:
    3 CNNs for spatial dependencies.
    Temporal dependencies are handled by stacking frames
    (funnily, nobody used an LSTM in the OpenAI Retro contest).
    1 common FC.
    1 FC for policy.
    1 FC for value.
    """
    with tf.variable_scope("model", reuse=reuse):
        conv1 = conv_layer(scaled_images, 32, 8, 4, gain)
        conv2 = conv_layer(conv1, 64, 4, 2, gain)
        conv3 = conv_layer(conv2, 64, 3, 1, gain)
        flatten1 = tf.layers.flatten(conv3)
        fc_common = fc_layer(flatten1, 512, gain=gain)

        # This builds a fully connected layer that returns a probability
        # distribution over actions (self.pd) and our pi logits (self.pi).
        self.pd, self.pi = self.pdtype.pdfromlatent(fc_common, init_scale=0.01)

        # Calculate v(s)
        vf = fc_layer(fc_common, 1, activation_fn=None)[:, 0]

    self.initial_state = None

    # Sample an action from the action distribution (remember we are in a
    # stochastic-policy setting, so we don't always take the action with the
    # highest probability: with two actions at 0.7 and 0.3, we have a 30%
    # chance of taking the second one).
    a0 = self.pd.sample()

    # Function used to take a step; returns the action to take and V(s)
    def step(state_in, *_args, **_kwargs):
        action, value = sess.run([a0, vf], {inputs_: state_in})
        # print("step", action)
        return action, value

    # Function that calculates only V(s)
    def value(state_in, *_args, **_kwargs):
        return sess.run(vf, {inputs_: state_in})

    # Function that outputs only the action to take
    def select_action(state_in, *_args, **_kwargs):
        return sess.run(a0, {inputs_: state_in})

    self.inputs_ = inputs_
    self.vf = vf
    self.step = step
    self.value = value
    self.select_action = select_action
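# Hypothetical rollout sketch (not in the original file): how the step/value
# closures above are typically driven. `Policy` stands in for the enclosing
# class, and `env` is assumed to be any Gym-style environment with image
# observations.
def _policy_rollout_sketch(env, nsteps=128):
    sess = tf.Session()
    policy = Policy(sess, env.observation_space, env.action_space,
                    nbatch=1, nsteps=nsteps)
    sess.run(tf.global_variables_initializer())
    obs = env.reset()
    for _ in range(nsteps):
        # step() expects a batch of observations, hence the added batch axis.
        action, value = policy.step(obs[None, ...])
        obs, reward, done, _ = env.step(action[0])
        if done:
            obs = env.reset()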
def conv2d(x,
           kernel_size,
           stride,
           channels,
           is_training,
           scope="conv2d",
           batch_norm=False,
           residual=False,
           gated=False,
           activation_fn=tf.nn.relu,
           resize=False,
           transpose=False,
           stacked_layers=1):
    """2D-Conv with optional batch_norm, gating, residual.

    Args:
        x: Tensor input [MB, H, W, CH].
        kernel_size: List [H, W].
        stride: List [H, W].
        channels: Int, output channels.
        is_training: Whether to collect stats for BatchNorm.
        scope: Enclosing scope name.
        batch_norm: Apply batch normalization.
        residual: Residual connections; requires stacked_layers >= 2.
        gated: Gating a la WaveNet.
        activation_fn: Nonlinearity function.
        resize: On transposed convolution, do ImageResize instead of
            conv_transpose.
        transpose: Use conv_transpose instead of conv.
        stacked_layers: Number of layers before a residual connection.

    Returns:
        x: Tensor output.
    """
    # For residual
    x0 = x
    # Choose convolution function
    conv_fn = slim.conv2d_transpose if transpose else slim.conv2d
    # Double output channels for gates
    num_outputs = channels * 2 if gated else channels
    normalizer_fn = slim.batch_norm if batch_norm else None

    with tf.variable_scope(scope + "_Layer"):
        # Apply a stack of convolutions before adding the residual
        for layer_idx in range(stacked_layers):
            with slim.arg_scope(
                    slim_batchnorm_arg_scope(is_training, activation_fn=None)):
                # Use interpolation to upsample instead of conv_transpose
                if transpose and resize:
                    unused_mb, h, w, unused_ch = x.get_shape().as_list()
                    x = tf.image.resize_images(
                        x, size=[h * stride[0], w * stride[1]], method=0)
                    stride_conv = [1, 1]
                else:
                    stride_conv = stride

                x = conv_fn(inputs=x,
                            stride=stride_conv,
                            kernel_size=kernel_size,
                            num_outputs=num_outputs,
                            normalizer_fn=normalizer_fn,
                            biases_initializer=tf.zeros_initializer(),
                            scope=scope)

            if gated:
                with tf.variable_scope("Gated"):
                    x1, x2 = x[:, :, :, :channels], x[:, :, :, channels:]
                    if activation_fn:
                        x1, x2 = activation_fn(x1), tf.sigmoid(x2)
                    else:
                        x2 = tf.sigmoid(x2)
                    x = x1 * x2

            # Apply residual to last layer before the last nonlinearity
            if residual and (layer_idx == stacked_layers - 1):
                with tf.variable_scope("Residual"):
                    # Don't upsample residual in time
                    if stride[0] == 1 and stride[1] == 1:
                        channels_in = x0.get_shape().as_list()[-1]
                        # Make n_channels match for residual
                        if channels != channels_in:
                            x0 = slim.conv2d(
                                inputs=x0,
                                stride=[1, 1],
                                kernel_size=[1, 1],
                                num_outputs=channels,
                                normalizer_fn=None,
                                activation_fn=None,
                                biases_initializer=tf.zeros_initializer(),
                                scope=scope + "_residual")
                            x += x0
                        else:
                            x += x0

            if activation_fn and not gated:
                x = activation_fn(x)
    return x
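# Hypothetical sketch (not in the original file): the gated variant computes
# activation_fn(x1) * sigmoid(x2) from a doubled-width convolution, as in
# WaveNet-style gating.
def _gated_conv_sketch(x, is_training):
    return conv2d(x, kernel_size=[3, 3], stride=[1, 1], channels=32,
                  is_training=is_training, scope='gated_conv',
                  batch_norm=True, gated=True, activation_fn=tf.tanh)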
def make_model(self):
    hparams = transformer.transformer_small()
    hparams.hidden_size = 8
    hparams.filter_size = 32
    hparams.num_heads = 1
    hparams.layer_prepostprocess_dropout = 0.0
    if hparams.get("problem_hparams", None) is None:
        p_hparams = problem_hparams.test_problem_hparams(
            VOCAB_SIZE, VOCAB_SIZE, hparams)
        hparams.problem_hparams = p_hparams
    else:
        p_hparams = hparams.problem_hparams
    self.model = model_cls(hparams, tf.estimator.ModeKeys.TRAIN, p_hparams)

    self.placeholders['inputs'] = tf.placeholder(
        tf.int32, [None, self.params['maxLength']], name='inputs')
    self.placeholders['targets'] = tf.placeholder(
        tf.int32, [None, self.params['maxLength']], name='targets')
    features = {
        "inputs": self.placeholders['inputs'],
        "targets": self.placeholders['targets'],
        "target_space_id": 0
    }
    self.logits = self.model(features)
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=tf.reshape(self.logits, [-1, VOCAB_SIZE]),
        labels=tf.reshape(features["targets"], [-1]))
    self.ops['loss'] = tf.reduce_mean(loss)
    apply_grad = tf.train.AdamOptimizer(0.001).minimize(self.ops['loss'])

    self.ops['losses'] = []
    for (internal_id, task_id) in enumerate(self.params['task_ids']):
        with tf.variable_scope("out_layer_task%i" % task_id):
            with tf.variable_scope("regression_gate"):
                self.weights['regression_gate_task%i' % task_id] = MLP(
                    2 * self.params['hidden_size'], 1, [],
                    self.placeholders['out_layer_dropout_keep_prob'])
            with tf.variable_scope("regression"):
                self.weights['regression_transform_task%i' % task_id] = MLP(
                    self.params['hidden_size'], 1, [],
                    self.placeholders['out_layer_dropout_keep_prob'])
            computed_values = self.gated_regression(
                self.ops['final_node_representations'],
                self.weights['regression_gate_task%i' % task_id],
                self.weights['regression_transform_task%i' % task_id])
            diff = computed_values - self.placeholders['target_values'][internal_id, :]
            task_target_mask = self.placeholders['target_mask'][internal_id, :]
            task_target_num = tf.reduce_sum(task_target_mask) + SMALL_NUMBER
            diff = diff * task_target_mask  # Mask out unused values
            self.ops['accuracy_task%i' % task_id] = tf.reduce_sum(
                tf.abs(diff)) / task_target_num
            task_loss = tf.reduce_sum(0.5 * tf.square(diff)) / task_target_num
            # Normalise loss to account for fewer task-specific examples in batch:
            task_loss = task_loss * (
                1.0 / (self.params['task_sample_ratios'].get(task_id) or 1.0))
            self.ops['losses'].append(task_loss)
    self.ops['loss'] = tf.reduce_sum(self.ops['losses'])
def quantizable_concat(inputs,
                       axis,
                       is_training,
                       is_quantized=True,
                       default_min=0,
                       default_max=6,
                       ema_decay=0.999,
                       scope='quantized_concat'):
    """Concat replacement with quantization option.

    Allows concat inputs to share the same min max ranges, from
    experimental/gazelle/synthetic/model/tpu/utils.py.

    Args:
        inputs: list of tensors to concatenate.
        axis: dimension along which to concatenate.
        is_training: true if the graph is a training graph.
        is_quantized: flag to enable/disable quantization.
        default_min: default min value for fake quant op.
        default_max: default max value for fake quant op.
        ema_decay: the moving average decay for the quantization variables.
        scope: Optional scope for variable_scope.

    Returns:
        Tensor resulting from concatenation of input tensors.
    """
    if is_quantized:
        with tf.variable_scope(scope):
            tf.logging.info('inputs: {}'.format(inputs))
            for t in inputs:
                tf.logging.info(t)

            min_var = _quant_var('min', default_min)
            max_var = _quant_var('max', default_max)
            if not is_training:
                # If we are building an eval graph, just use the values in the
                # variables.
                quant_inputs = [
                    tf.fake_quant_with_min_max_vars(t, min_var, max_var)
                    for t in inputs
                ]
                tf.logging.info('min_val: {}'.format(min_var))
                tf.logging.info('max_val: {}'.format(max_var))
            else:
                concat_tensors = tf.concat(inputs, axis=axis)
                tf.logging.info('concat_tensors: {}'.format(concat_tensors))
                # TFLite requires that 0.0 is always in the [min; max] range.
                range_min = tf.minimum(tf.reduce_min(concat_tensors), 0.0,
                                       name='SafeQuantRangeMin')
                range_max = tf.maximum(tf.reduce_max(concat_tensors), 0.0,
                                       name='SafeQuantRangeMax')
                # During training, keep track of the moving averages of the min
                # and max of the elements of the input tensors.
                min_val = moving_averages.assign_moving_average(
                    min_var, range_min, ema_decay, name='AssignMinEma')
                max_val = moving_averages.assign_moving_average(
                    max_var, range_max, ema_decay, name='AssignMaxEma')
                tf.logging.info('min_val: {}'.format(min_val))
                tf.logging.info('max_val: {}'.format(max_val))
                quant_inputs = [
                    tf.fake_quant_with_min_max_vars(t, min_val, max_val)
                    for t in inputs
                ]
                tf.logging.info('quant_inputs: {}'.format(quant_inputs))
            outputs = tf.concat(quant_inputs, axis=axis)
            tf.logging.info('outputs: {}'.format(outputs))
    else:
        outputs = tf.concat(inputs, axis=axis)
    return outputs
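# Hypothetical sketch (not in the original file): two branches share one
# min/max range when joined through quantizable_concat, which is what lets
# TFLite fuse the concat without a per-input rescale.
def _quantizable_concat_sketch(branch_a, branch_b, is_training):
    # Training graph: the min/max moving averages are updated from the data;
    # an eval graph (is_training=False) reads the stored variables as-is.
    return quantizable_concat([branch_a, branch_b], axis=3,
                              is_training=is_training,
                              scope='shared_range_concat')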
def model(inputs, is_training):
    """Creation of the model graph."""
    with tf.variable_scope(name, 'resnet_model'):
        inputs = resnet_model.fixed_padding(inputs, kernel_size=3,
                                            data_format=data_format)
        padding = 'VALID'
        kernel_initializer = tf.variance_scaling_initializer()
        kernel_regularizer = contrib_layers.l2_regularizer(weight_decay)
        inputs = tf.layers.conv2d(inputs=inputs,
                                  filters=_make_divisible(32 * width),
                                  kernel_size=3,
                                  strides=2,
                                  padding=padding,
                                  use_bias=False,
                                  kernel_initializer=kernel_initializer,
                                  kernel_regularizer=kernel_regularizer,
                                  data_format=data_format,
                                  name='initial_conv')
        inputs = tf.identity(inputs, 'initial_conv')
        inputs = resnet_model.batch_norm_relu(inputs, is_training,
                                              data_format=data_format)

        inverted_res_block = functools.partial(
            inverted_res_block_,
            is_training=is_training,
            width=width,
            expansion_factor=expansion_factor,
            pruning_method=pruning_method,
            data_format=data_format,
            weight_decay=weight_decay)

        inputs = inverted_res_block(inputs, filters=16, stride=1, block_id=0)
        inputs = inverted_res_block(inputs, filters=24, stride=2, block_id=1)
        inputs = inverted_res_block(inputs, filters=24, stride=1, block_id=2)
        inputs = inverted_res_block(inputs, filters=32, stride=2, block_id=3)
        inputs = inverted_res_block(inputs, filters=32, stride=1, block_id=4)
        inputs = inverted_res_block(inputs, filters=32, stride=1, block_id=5)
        inputs = inverted_res_block(inputs, filters=64, stride=2, block_id=6)
        inputs = inverted_res_block(inputs, filters=64, stride=1, block_id=7)
        inputs = inverted_res_block(inputs, filters=64, stride=1, block_id=8)
        inputs = inverted_res_block(inputs, filters=64, stride=1, block_id=9)
        inputs = inverted_res_block(inputs, filters=96, stride=1, block_id=10)
        inputs = inverted_res_block(inputs, filters=96, stride=1, block_id=11)
        inputs = inverted_res_block(inputs, filters=96, stride=1, block_id=12)
        inputs = inverted_res_block(inputs, filters=160, stride=2, block_id=13)
        inputs = inverted_res_block(inputs, filters=160, stride=1, block_id=14)
        inputs = inverted_res_block(inputs, filters=160, stride=1, block_id=15)
        inputs = inverted_res_block(inputs, filters=320, stride=1, block_id=16)

        last_block_filters = max(1280, _make_divisible(1280 * width, 8))
        inputs = conv2d_fixed_padding(inputs=inputs,
                                      filters=last_block_filters,
                                      kernel_size=1,
                                      strides=1,
                                      pruning_method=pruning_method,
                                      data_format=data_format,
                                      weight_decay=weight_decay,
                                      name='final_1x1_conv')
        inputs = resnet_model.batch_norm_relu(inputs, is_training,
                                              data_format=data_format)

        if data_format == 'channels_last':
            pool_size = (inputs.shape[1], inputs.shape[2])
        elif data_format == 'channels_first':
            pool_size = (inputs.shape[2], inputs.shape[3])
        inputs = tf.layers.average_pooling2d(inputs=inputs,
                                             pool_size=pool_size,
                                             strides=1,
                                             padding='VALID',
                                             data_format=data_format,
                                             name='final_avg_pool')
        inputs = tf.identity(inputs, 'final_avg_pool')
        inputs = tf.reshape(inputs, [-1, last_block_filters])

        kernel_initializer = tf.variance_scaling_initializer()
        kernel_regularizer = contrib_layers.l2_regularizer(weight_decay)
        if prune_last_layer:
            inputs = sparse_fully_connected(
                x=inputs,
                units=num_classes,
                sparsity_technique=pruning_method if prune_last_layer else 'baseline',
                kernel_initializer=kernel_initializer,
                kernel_regularizer=kernel_regularizer,
                name='final_dense')
        else:
            inputs = tf.layers.dense(inputs=inputs,
                                     units=num_classes,
                                     activation=None,
                                     use_bias=True,
                                     kernel_initializer=kernel_initializer,
                                     kernel_regularizer=kernel_regularizer,
                                     name='final_dense')
        inputs = tf.identity(inputs, 'final_dense')
    return inputs
def call(self, inputs, training=True, survival_prob=None):
    """Implementation of call().

    Args:
        inputs: the inputs tensor.
        training: boolean, whether the model is constructed for training.
        survival_prob: float, between 0 to 1, drop connect rate.

    Returns:
        An output tensor.
    """
    logging.info('Block input: %s shape: %s', inputs.name, inputs.shape)
    logging.info('Block input depth: %s output depth: %s',
                 self._block_args.input_filters,
                 self._block_args.output_filters)

    x = inputs

    fused_conv_fn = self._fused_conv
    expand_conv_fn = self._expand_conv
    depthwise_conv_fn = self._depthwise_conv
    project_conv_fn = self._project_conv

    if self._block_args.condconv:
        pooled_inputs = self._avg_pooling(inputs)
        routing_weights = self._routing_fn(pooled_inputs)
        # Capture routing weights as additional input to CondConv layers
        fused_conv_fn = functools.partial(
            self._fused_conv, routing_weights=routing_weights)
        expand_conv_fn = functools.partial(
            self._expand_conv, routing_weights=routing_weights)
        depthwise_conv_fn = functools.partial(
            self._depthwise_conv, routing_weights=routing_weights)
        project_conv_fn = functools.partial(
            self._project_conv, routing_weights=routing_weights)

    # creates conv 2x2 kernel
    if self._block_args.space2depth == 1:
        with tf.variable_scope('space2depth'):
            x = self._relu_fn(
                self._bnsp(self._space2depth(x), training=training))
        logging.info('Block start with space2depth: %s shape: %s',
                     x.name, x.shape)

    if self._block_args.fused_conv:
        # If using fused mbconv, skip expansion and use a regular conv.
        x = self._relu_fn(self._bn1(fused_conv_fn(x), training=training))
        logging.info('Conv2D: %s shape: %s', x.name, x.shape)
    else:
        # Otherwise, first apply expansion and then apply depthwise conv.
        if self._block_args.expand_ratio != 1:
            x = self._relu_fn(self._bn0(expand_conv_fn(x), training=training))
            logging.info('Expand: %s shape: %s', x.name, x.shape)

        x = self._relu_fn(self._bn1(depthwise_conv_fn(x), training=training))
        logging.info('DWConv: %s shape: %s', x.name, x.shape)

    if self._has_se:
        with tf.variable_scope('se'):
            x = self._call_se(x)

    self.endpoints = {'expansion_output': x}

    x = self._bn2(project_conv_fn(x), training=training)
    # Add identity so that quantization-aware training can insert quantization
    # ops correctly.
    x = tf.identity(x)
    if self._clip_projection_output:
        x = tf.clip_by_value(x, -6, 6)

    if self._block_args.id_skip:
        if all(s == 1 for s in self._block_args.strides) and (
                inputs.get_shape().as_list()[-1]
                == x.get_shape().as_list()[-1]):
            # Apply drop_connect only when the skip connection is present.
            if survival_prob:
                x = utils.drop_connect(x, training, survival_prob)
            x = tf.add(x, inputs)
    logging.info('Project: %s shape: %s', x.name, x.shape)
    return x
def inception_v3_base(inputs,
                      final_endpoint='Mixed_7c',
                      min_depth=16,
                      depth_multiplier=1.0,
                      scope=None):
    """Inception model from http://arxiv.org/abs/1512.00567.

    Constructs an Inception v3 network from inputs to the given final endpoint.
    This method can construct the network up to the final inception block
    Mixed_7c.

    Note that the names of the layers in the paper do not correspond to the
    names of the endpoints registered by this function although they build the
    same network.

    Here is a mapping from the old_names to the new names:

        Old name          | New name
        =======================================
        conv0             | Conv2d_1a_3x3
        conv1             | Conv2d_2a_3x3
        conv2             | Conv2d_2b_3x3
        pool1             | MaxPool_3a_3x3
        conv3             | Conv2d_3b_1x1
        conv4             | Conv2d_4a_3x3
        pool2             | MaxPool_5a_3x3
        mixed_35x35x256a  | Mixed_5b
        mixed_35x35x288a  | Mixed_5c
        mixed_35x35x288b  | Mixed_5d
        mixed_17x17x768a  | Mixed_6a
        mixed_17x17x768b  | Mixed_6b
        mixed_17x17x768c  | Mixed_6c
        mixed_17x17x768d  | Mixed_6d
        mixed_17x17x768e  | Mixed_6e
        mixed_8x8x1280a   | Mixed_7a
        mixed_8x8x2048a   | Mixed_7b
        mixed_8x8x2048b   | Mixed_7c

    Args:
        inputs: a tensor of size [batch_size, height, width, channels].
        final_endpoint: specifies the endpoint to construct the network up to.
            It can be one of ['Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3',
            'MaxPool_3a_3x3', 'Conv2d_3b_1x1', 'Conv2d_4a_3x3', 'MaxPool_5a_3x3',
            'Mixed_5b', 'Mixed_5c', 'Mixed_5d', 'Mixed_6a', 'Mixed_6b',
            'Mixed_6c', 'Mixed_6d', 'Mixed_6e', 'Mixed_7a', 'Mixed_7b',
            'Mixed_7c'].
        min_depth: Minimum depth value (number of channels) for all convolution
            ops. Enforced when depth_multiplier < 1, and not an active
            constraint when depth_multiplier >= 1.
        depth_multiplier: Float multiplier for the depth (number of channels)
            for all convolution ops. The value must be greater than zero.
            Typical usage will be to set this value in (0, 1) to reduce the
            number of parameters or computation cost of the model.
        scope: Optional variable_scope.

    Returns:
        tensor_out: output tensor corresponding to the final_endpoint.
        end_points: a set of activations for external use, for example
            summaries or losses.

    Raises:
        ValueError: if final_endpoint is not set to one of the predefined
            values, or depth_multiplier <= 0
    """
    # end_points will collect relevant activations for external use, for
    # example summaries or losses.
    end_points = {}
    if depth_multiplier <= 0:
        raise ValueError('depth_multiplier is not greater than zero.')
    depth = lambda d: max(int(d * depth_multiplier), min_depth)

    with tf.variable_scope(scope, 'InceptionV3', [inputs]):
        with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
                            stride=1, padding='VALID'):
            # 299 x 299 x 3
            end_point = 'Conv2d_1a_3x3'
            net = slim.conv2d(inputs, depth(32), [3, 3], stride=2, scope=end_point)
            end_points[end_point] = net
            if end_point == final_endpoint:
                return net, end_points
            # 149 x 149 x 32
            end_point = 'Conv2d_2a_3x3'
            net = slim.conv2d(net, depth(32), [3, 3], scope=end_point)
            end_points[end_point] = net
            if end_point == final_endpoint:
                return net, end_points
            # 147 x 147 x 32
            end_point = 'Conv2d_2b_3x3'
            net = slim.conv2d(net, depth(64), [3, 3], padding='SAME', scope=end_point)
            end_points[end_point] = net
            if end_point == final_endpoint:
                return net, end_points
            # 147 x 147 x 64
            end_point = 'MaxPool_3a_3x3'
            net = slim.max_pool2d(net, [3, 3], stride=2, scope=end_point)
            end_points[end_point] = net
            if end_point == final_endpoint:
                return net, end_points
            # 73 x 73 x 64
            end_point = 'Conv2d_3b_1x1'
            net = slim.conv2d(net, depth(80), [1, 1], scope=end_point)
            end_points[end_point] = net
            if end_point == final_endpoint:
                return net, end_points
            # 73 x 73 x 80.
            end_point = 'Conv2d_4a_3x3'
            net = slim.conv2d(net, depth(192), [3, 3], scope=end_point)
            end_points[end_point] = net
            if end_point == final_endpoint:
                return net, end_points
            # 71 x 71 x 192.
            end_point = 'MaxPool_5a_3x3'
            net = slim.max_pool2d(net, [3, 3], stride=2, scope=end_point)
            end_points[end_point] = net
            if end_point == final_endpoint:
                return net, end_points
            # 35 x 35 x 192.

        # Inception blocks
        with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
                            stride=1, padding='SAME'):
            # mixed: 35 x 35 x 256.
            end_point = 'Mixed_5b'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net, depth(48), [1, 1], scope='Conv2d_0a_1x1')
                    branch_1 = slim.conv2d(branch_1, depth(64), [5, 5], scope='Conv2d_0b_5x5')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1')
                    branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], scope='Conv2d_0b_3x3')
                    branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], scope='Conv2d_0c_3x3')
                with tf.variable_scope('Branch_3'):
                    branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
                    branch_3 = slim.conv2d(branch_3, depth(32), [1, 1], scope='Conv2d_0b_1x1')
                net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
            end_points[end_point] = net
            if end_point == final_endpoint:
                return net, end_points
            # mixed_1: 35 x 35 x 288.
            end_point = 'Mixed_5c'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net, depth(48), [1, 1], scope='Conv2d_0b_1x1')
                    branch_1 = slim.conv2d(branch_1, depth(64), [5, 5], scope='Conv_1_0c_5x5')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1')
                    branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], scope='Conv2d_0b_3x3')
                    branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], scope='Conv2d_0c_3x3')
                with tf.variable_scope('Branch_3'):
                    branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
                    branch_3 = slim.conv2d(branch_3, depth(64), [1, 1], scope='Conv2d_0b_1x1')
                net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
            end_points[end_point] = net
            if end_point == final_endpoint:
                return net, end_points
            # mixed_2: 35 x 35 x 288.
            end_point = 'Mixed_5d'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net, depth(48), [1, 1], scope='Conv2d_0a_1x1')
                    branch_1 = slim.conv2d(branch_1, depth(64), [5, 5], scope='Conv2d_0b_5x5')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1')
                    branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], scope='Conv2d_0b_3x3')
                    branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], scope='Conv2d_0c_3x3')
                with tf.variable_scope('Branch_3'):
                    branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
                    branch_3 = slim.conv2d(branch_3, depth(64), [1, 1], scope='Conv2d_0b_1x1')
                net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
            end_points[end_point] = net
            if end_point == final_endpoint:
                return net, end_points
            # mixed_3: 17 x 17 x 768.
            end_point = 'Mixed_6a'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net, depth(384), [3, 3], stride=2,
                                           padding='VALID', scope='Conv2d_1a_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1')
                    branch_1 = slim.conv2d(branch_1, depth(96), [3, 3], scope='Conv2d_0b_3x3')
                    branch_1 = slim.conv2d(branch_1, depth(96), [3, 3], stride=2,
                                           padding='VALID', scope='Conv2d_1a_1x1')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.max_pool2d(net, [3, 3], stride=2,
                                               padding='VALID', scope='MaxPool_1a_3x3')
                net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2])
            end_points[end_point] = net
            if end_point == final_endpoint:
                return net, end_points
            # mixed4: 17 x 17 x 768.
            end_point = 'Mixed_6b'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net, depth(128), [1, 1], scope='Conv2d_0a_1x1')
                    branch_1 = slim.conv2d(branch_1, depth(128), [1, 7], scope='Conv2d_0b_1x7')
                    branch_1 = slim.conv2d(branch_1, depth(192), [7, 1], scope='Conv2d_0c_7x1')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.conv2d(net, depth(128), [1, 1], scope='Conv2d_0a_1x1')
                    branch_2 = slim.conv2d(branch_2, depth(128), [7, 1], scope='Conv2d_0b_7x1')
                    branch_2 = slim.conv2d(branch_2, depth(128), [1, 7], scope='Conv2d_0c_1x7')
                    branch_2 = slim.conv2d(branch_2, depth(128), [7, 1], scope='Conv2d_0d_7x1')
                    branch_2 = slim.conv2d(branch_2, depth(192), [1, 7], scope='Conv2d_0e_1x7')
                with tf.variable_scope('Branch_3'):
                    branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
                    branch_3 = slim.conv2d(branch_3, depth(192), [1, 1], scope='Conv2d_0b_1x1')
                net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
            end_points[end_point] = net
            if end_point == final_endpoint:
                return net, end_points
            # mixed_5: 17 x 17 x 768.
            end_point = 'Mixed_6c'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1')
                    branch_1 = slim.conv2d(branch_1, depth(160), [1, 7], scope='Conv2d_0b_1x7')
                    branch_1 = slim.conv2d(branch_1, depth(192), [7, 1], scope='Conv2d_0c_7x1')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1')
                    branch_2 = slim.conv2d(branch_2, depth(160), [7, 1], scope='Conv2d_0b_7x1')
                    branch_2 = slim.conv2d(branch_2, depth(160), [1, 7], scope='Conv2d_0c_1x7')
                    branch_2 = slim.conv2d(branch_2, depth(160), [7, 1], scope='Conv2d_0d_7x1')
                    branch_2 = slim.conv2d(branch_2, depth(192), [1, 7], scope='Conv2d_0e_1x7')
                with tf.variable_scope('Branch_3'):
                    branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
                    branch_3 = slim.conv2d(branch_3, depth(192), [1, 1], scope='Conv2d_0b_1x1')
                net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
            end_points[end_point] = net
            if end_point == final_endpoint:
                return net, end_points
            # mixed_6: 17 x 17 x 768.
            end_point = 'Mixed_6d'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1')
                    branch_1 = slim.conv2d(branch_1, depth(160), [1, 7], scope='Conv2d_0b_1x7')
                    branch_1 = slim.conv2d(branch_1, depth(192), [7, 1], scope='Conv2d_0c_7x1')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1')
                    branch_2 = slim.conv2d(branch_2, depth(160), [7, 1], scope='Conv2d_0b_7x1')
                    branch_2 = slim.conv2d(branch_2, depth(160), [1, 7], scope='Conv2d_0c_1x7')
                    branch_2 = slim.conv2d(branch_2, depth(160), [7, 1], scope='Conv2d_0d_7x1')
                    branch_2 = slim.conv2d(branch_2, depth(192), [1, 7], scope='Conv2d_0e_1x7')
                with tf.variable_scope('Branch_3'):
                    branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
                    branch_3 = slim.conv2d(branch_3, depth(192), [1, 1], scope='Conv2d_0b_1x1')
                net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
            end_points[end_point] = net
            if end_point == final_endpoint:
                return net, end_points
            # mixed_7: 17 x 17 x 768.
            end_point = 'Mixed_6e'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1')
                    branch_1 = slim.conv2d(branch_1, depth(192), [1, 7], scope='Conv2d_0b_1x7')
                    branch_1 = slim.conv2d(branch_1, depth(192), [7, 1], scope='Conv2d_0c_7x1')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1')
                    branch_2 = slim.conv2d(branch_2, depth(192), [7, 1], scope='Conv2d_0b_7x1')
                    branch_2 = slim.conv2d(branch_2, depth(192), [1, 7], scope='Conv2d_0c_1x7')
                    branch_2 = slim.conv2d(branch_2, depth(192), [7, 1], scope='Conv2d_0d_7x1')
                    branch_2 = slim.conv2d(branch_2, depth(192), [1, 7], scope='Conv2d_0e_1x7')
                with tf.variable_scope('Branch_3'):
                    branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
                    branch_3 = slim.conv2d(branch_3, depth(192), [1, 1], scope='Conv2d_0b_1x1')
                net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
            end_points[end_point] = net
            if end_point == final_endpoint:
                return net, end_points
            # mixed_8: 8 x 8 x 1280.
            end_point = 'Mixed_7a'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1')
                    branch_0 = slim.conv2d(branch_0, depth(320), [3, 3], stride=2,
                                           padding='VALID', scope='Conv2d_1a_3x3')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1')
                    branch_1 = slim.conv2d(branch_1, depth(192), [1, 7], scope='Conv2d_0b_1x7')
                    branch_1 = slim.conv2d(branch_1, depth(192), [7, 1], scope='Conv2d_0c_7x1')
                    branch_1 = slim.conv2d(branch_1, depth(192), [3, 3], stride=2,
                                           padding='VALID', scope='Conv2d_1a_3x3')
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.max_pool2d(net, [3, 3], stride=2,
                                               padding='VALID', scope='MaxPool_1a_3x3')
                net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2])
            end_points[end_point] = net
            if end_point == final_endpoint:
                return net, end_points
            # mixed_9: 8 x 8 x 2048.
            end_point = 'Mixed_7b'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net, depth(320), [1, 1], scope='Conv2d_0a_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net, depth(384), [1, 1], scope='Conv2d_0a_1x1')
                    branch_1 = tf.concat(axis=3, values=[
                        slim.conv2d(branch_1, depth(384), [1, 3], scope='Conv2d_0b_1x3'),
                        slim.conv2d(branch_1, depth(384), [3, 1], scope='Conv2d_0b_3x1')
                    ])
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.conv2d(net, depth(448), [1, 1], scope='Conv2d_0a_1x1')
                    branch_2 = slim.conv2d(branch_2, depth(384), [3, 3], scope='Conv2d_0b_3x3')
                    branch_2 = tf.concat(axis=3, values=[
                        slim.conv2d(branch_2, depth(384), [1, 3], scope='Conv2d_0c_1x3'),
                        slim.conv2d(branch_2, depth(384), [3, 1], scope='Conv2d_0d_3x1')
                    ])
                with tf.variable_scope('Branch_3'):
                    branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
                    branch_3 = slim.conv2d(branch_3, depth(192), [1, 1], scope='Conv2d_0b_1x1')
                net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
            end_points[end_point] = net
            if end_point == final_endpoint:
                return net, end_points
            # mixed_10: 8 x 8 x 2048.
            end_point = 'Mixed_7c'
            with tf.variable_scope(end_point):
                with tf.variable_scope('Branch_0'):
                    branch_0 = slim.conv2d(net, depth(320), [1, 1], scope='Conv2d_0a_1x1')
                with tf.variable_scope('Branch_1'):
                    branch_1 = slim.conv2d(net, depth(384), [1, 1], scope='Conv2d_0a_1x1')
                    branch_1 = tf.concat(axis=3, values=[
                        slim.conv2d(branch_1, depth(384), [1, 3], scope='Conv2d_0b_1x3'),
                        slim.conv2d(branch_1, depth(384), [3, 1], scope='Conv2d_0c_3x1')
                    ])
                with tf.variable_scope('Branch_2'):
                    branch_2 = slim.conv2d(net, depth(448), [1, 1], scope='Conv2d_0a_1x1')
                    branch_2 = slim.conv2d(branch_2, depth(384), [3, 3], scope='Conv2d_0b_3x3')
                    branch_2 = tf.concat(axis=3, values=[
                        slim.conv2d(branch_2, depth(384), [1, 3], scope='Conv2d_0c_1x3'),
                        slim.conv2d(branch_2, depth(384), [3, 1], scope='Conv2d_0d_3x1')
                    ])
                with tf.variable_scope('Branch_3'):
                    branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3')
                    branch_3 = slim.conv2d(branch_3, depth(192), [1, 1], scope='Conv2d_0b_1x1')
                net = tf.concat(axis=3, values=[branch_0, branch_1, branch_2, branch_3])
            end_points[end_point] = net
            if end_point == final_endpoint:
                return net, end_points
        raise ValueError('Unknown final endpoint %s' % final_endpoint)
def inception_v3(inputs,
                 num_classes=1000,
                 is_training=True,
                 dropout_keep_prob=0.8,
                 min_depth=16,
                 depth_multiplier=1.0,
                 prediction_fn=slim.softmax,
                 spatial_squeeze=True,
                 reuse=None,
                 create_aux_logits=True,
                 scope='InceptionV3',
                 global_pool=False):
    """Inception model from http://arxiv.org/abs/1512.00567.

    "Rethinking the Inception Architecture for Computer Vision"
    Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens,
    Zbigniew Wojna.

    With the default arguments this method constructs the exact model defined
    in the paper. However, one can experiment with variations of the
    inception_v3 network by changing arguments dropout_keep_prob, min_depth and
    depth_multiplier.

    The default image size used to train this network is 299x299.

    Args:
        inputs: a tensor of size [batch_size, height, width, channels].
        num_classes: number of predicted classes. If 0 or None, the logits
            layer is omitted and the input features to the logits layer
            (before dropout) are returned instead.
        is_training: whether is training or not.
        dropout_keep_prob: the percentage of activation values that are
            retained.
        min_depth: Minimum depth value (number of channels) for all convolution
            ops. Enforced when depth_multiplier < 1, and not an active
            constraint when depth_multiplier >= 1.
        depth_multiplier: Float multiplier for the depth (number of channels)
            for all convolution ops. The value must be greater than zero.
            Typical usage will be to set this value in (0, 1) to reduce the
            number of parameters or computation cost of the model.
        prediction_fn: a function to get predictions out of logits.
        spatial_squeeze: if True, logits is of shape [B, C], if false logits is
            of shape [B, 1, 1, C], where B is batch_size and C is number of
            classes.
        reuse: whether or not the network and its variables should be reused.
            To be able to reuse, 'scope' must be given.
        create_aux_logits: Whether to create the auxiliary logits.
        scope: Optional variable_scope.
        global_pool: Optional boolean flag to control the avgpooling before the
            logits layer. If false or unset, pooling is done with a fixed
            window that reduces default-sized inputs to 1x1, while larger
            inputs lead to larger outputs. If true, any input size is pooled
            down to 1x1.

    Returns:
        net: a Tensor with the logits (pre-softmax activations) if num_classes
            is a non-zero integer, or the non-dropped-out input to the logits
            layer if num_classes is 0 or None.
        end_points: a dictionary from components of the network to the
            corresponding activation.

    Raises:
        ValueError: if 'depth_multiplier' is less than or equal to zero.
    """
    if depth_multiplier <= 0:
        raise ValueError('depth_multiplier is not greater than zero.')
    depth = lambda d: max(int(d * depth_multiplier), min_depth)

    with tf.variable_scope(scope, 'InceptionV3', [inputs], reuse=reuse) as scope:
        with slim.arg_scope([slim.batch_norm, slim.dropout],
                            is_training=is_training):
            net, end_points = inception_v3_base(
                inputs,
                scope=scope,
                min_depth=min_depth,
                depth_multiplier=depth_multiplier)

            # Auxiliary Head logits
            if create_aux_logits and num_classes:
                with slim.arg_scope(
                        [slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
                        stride=1, padding='SAME'):
                    aux_logits = end_points['Mixed_6e']
                    with tf.variable_scope('AuxLogits'):
                        aux_logits = slim.avg_pool2d(
                            aux_logits, [5, 5], stride=3, padding='VALID',
                            scope='AvgPool_1a_5x5')
                        aux_logits = slim.conv2d(aux_logits, depth(128), [1, 1],
                                                 scope='Conv2d_1b_1x1')
                        # Shape of feature map before the final layer.
                        kernel_size = _reduced_kernel_size_for_small_input(
                            aux_logits, [5, 5])
                        aux_logits = slim.conv2d(
                            aux_logits,
                            depth(768),
                            kernel_size,
                            weights_initializer=trunc_normal(0.01),
                            padding='VALID',
                            scope='Conv2d_2a_{}x{}'.format(*kernel_size))
                        aux_logits = slim.conv2d(
                            aux_logits,
                            num_classes, [1, 1],
                            activation_fn=None,
                            normalizer_fn=None,
                            weights_initializer=trunc_normal(0.001),
                            scope='Conv2d_2b_1x1')
                        if spatial_squeeze:
                            aux_logits = tf.squeeze(aux_logits, [1, 2],
                                                    name='SpatialSqueeze')
                        end_points['AuxLogits'] = aux_logits

            # Final pooling and prediction
            with tf.variable_scope('Logits'):
                if global_pool:
                    # Global average pooling.
                    net = tf.reduce_mean(input_tensor=net, axis=[1, 2],
                                         keepdims=True, name='GlobalPool')
                    end_points['global_pool'] = net
                else:
                    # Pooling with a fixed kernel size.
                    kernel_size = _reduced_kernel_size_for_small_input(net, [8, 8])
                    net = slim.avg_pool2d(
                        net, kernel_size, padding='VALID',
                        scope='AvgPool_1a_{}x{}'.format(*kernel_size))
                    end_points['AvgPool_1a'] = net
                if not num_classes:
                    return net, end_points
                # 1 x 1 x 2048
                net = slim.dropout(net, keep_prob=dropout_keep_prob,
                                   scope='Dropout_1b')
                end_points['PreLogits'] = net
                # 2048
                logits = slim.conv2d(net, num_classes, [1, 1],
                                     activation_fn=None, normalizer_fn=None,
                                     scope='Conv2d_1c_1x1')
                if spatial_squeeze:
                    logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze')
                # 1000
            end_points['Logits'] = logits
            end_points['Predictions'] = prediction_fn(logits, scope='Predictions')
    return logits, end_points
def transformer_model( input_tensor, attention_mask=None, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, intermediate_act_fn=gelu, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, initializer_range=0.02, do_return_all_layers=False, ): """Multi-headed, multi-layer Transformer from "Attention is All You Need". This is almost an exact implementation of the original Transformer encoder. See the original paper: https://arxiv.org/abs/1706.03762 Also see: https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py Args: input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, seq_length], with 1 for positions that can be attended to and 0 in positions that should not be. hidden_size: int. Hidden size of the Transformer. num_hidden_layers: int. Number of layers (blocks) in the Transformer. num_attention_heads: int. Number of attention heads in the Transformer. intermediate_size: int. The size of the "intermediate" (a.k.a., feed forward) layer. intermediate_act_fn: function. The non-linear activation function to apply to the output of the intermediate/feed-forward layer. hidden_dropout_prob: float. Dropout probability for the hidden layers. attention_probs_dropout_prob: float. Dropout probability of the attention probabilities. initializer_range: float. Range of the initializer (stddev of truncated normal). do_return_all_layers: Whether to also return all layers or just the final layer. Returns: float Tensor of shape [batch_size, seq_length, hidden_size], the final hidden layer of the Transformer. Raises: ValueError: A Tensor shape or parameter is invalid. """ if hidden_size % num_attention_heads != 0: raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (hidden_size, num_attention_heads) ) attention_head_size = int(hidden_size / num_attention_heads) input_shape = get_shape_list(input_tensor, expected_rank=3) batch_size = input_shape[0] seq_length = input_shape[1] input_width = input_shape[2] # The Transformer performs sum residuals on all layers so the input needs # to be the same as the hidden size. if input_width != hidden_size: raise ValueError( "The width of the input tensor (%d) != hidden size (%d)" % (input_width, hidden_size) ) # We keep the representation as a 2D tensor to avoid re-shaping it back and # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on # the GPU/CPU but may not be free on the TPU, so we want to minimize them to # help the optimizer. 
prev_output = reshape_to_matrix(input_tensor) attn_maps = [] all_layer_outputs = [] for layer_idx in range(num_hidden_layers): with tf.variable_scope("layer_%d" % layer_idx): with tf.variable_scope("attention"): attention_heads = [] with tf.variable_scope("self"): attention_head, probs = attention_layer( from_tensor=prev_output, to_tensor=prev_output, attention_mask=attention_mask, num_attention_heads=num_attention_heads, size_per_head=attention_head_size, attention_probs_dropout_prob=attention_probs_dropout_prob, initializer_range=initializer_range, do_return_2d_tensor=True, batch_size=batch_size, from_seq_length=seq_length, to_seq_length=seq_length, ) attention_heads.append(attention_head) attn_maps.append(probs) attention_output = None if len(attention_heads) == 1: attention_output = attention_heads[0] else: # In the case where we have other sequences, we just concatenate # them to the self-attention head before the projection. attention_output = tf.concat(attention_heads, axis=-1) # Run a linear projection of `hidden_size` then add a residual # with `layer_input`. with tf.variable_scope("output"): attention_output = tf.layers.dense( attention_output, hidden_size, kernel_initializer=create_initializer(initializer_range), ) attention_output = dropout(attention_output, hidden_dropout_prob) attention_output = layer_norm(attention_output + prev_output) # The activation is only applied to the "intermediate" hidden layer. with tf.variable_scope("intermediate"): intermediate_output = tf.layers.dense( attention_output, intermediate_size, activation=intermediate_act_fn, kernel_initializer=create_initializer(initializer_range), ) # Down-project back to `hidden_size` then add the residual. with tf.variable_scope("output"): prev_output = tf.layers.dense( intermediate_output, hidden_size, kernel_initializer=create_initializer(initializer_range), ) prev_output = dropout(prev_output, hidden_dropout_prob) prev_output = layer_norm(prev_output + attention_output) all_layer_outputs.append(prev_output) attn_maps = tf.stack(attn_maps, 0) if do_return_all_layers: return ( tf.stack( [ reshape_from_matrix(layer, input_shape) for layer in all_layer_outputs ], 0, ), attn_maps, ) else: return reshape_from_matrix(prev_output, input_shape), attn_maps
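# Minimal usage sketch for transformer_model with toy sizes (the placeholder
# names here are illustrative, not part of this codebase):
toy_input = tf.placeholder(tf.float32, [8, 128, 768])    # [batch, seq, hidden]
toy_mask = tf.ones([8, 128, 128], dtype=tf.int32)        # attend everywhere
final_layer, toy_attn_maps = transformer_model(
    input_tensor=toy_input,
    attention_mask=toy_mask,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072)
# final_layer: [8, 128, 768]; toy_attn_maps: [12, 8, 12, 128, 128]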
def __init__( self, bert_config, is_training, input_ids, input_mask=None, token_type_ids=None, use_one_hot_embeddings=True, scope=None, embedding_size=None, input_embeddings=None, input_reprs=None, update_embeddings=True, untied_embeddings=False, ): """Constructor for BertModel. Args: bert_config: `BertConfig` instance. is_training: bool. true for training model, false for eval model. Controls whether dropout will be applied. input_ids: int32 Tensor of shape [batch_size, seq_length]. input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]. token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. use_one_hot_embeddings: (optional) bool. Whether to use one-hot word embeddings or tf.embedding_lookup() for the word embeddings. On the TPU, it is much faster if this is True, on the CPU or GPU, it is faster if this is False. scope: (optional) variable scope. Defaults to "electra". Raises: ValueError: The config is invalid or one of the input tensor shapes is invalid. """ bert_config = copy.deepcopy(bert_config) if not is_training: bert_config.hidden_dropout_prob = 0.0 bert_config.attention_probs_dropout_prob = 0.0 input_shape = get_shape_list(token_type_ids, expected_rank=2) batch_size = input_shape[0] seq_length = input_shape[1] if input_mask is None: input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) assert token_type_ids is not None if input_reprs is None: if input_embeddings is None: with tf.variable_scope( (scope if untied_embeddings else "electra") + "/embeddings", reuse=tf.AUTO_REUSE, ): # Perform embedding lookup on the word ids if embedding_size is None: embedding_size = bert_config.hidden_size (self.token_embeddings, self.embedding_table) = embedding_lookup( input_ids=input_ids, vocab_size=bert_config.vocab_size, embedding_size=embedding_size, initializer_range=bert_config.initializer_range, word_embedding_name="word_embeddings", use_one_hot_embeddings=use_one_hot_embeddings, ) else: self.token_embeddings = input_embeddings with tf.variable_scope( (scope if untied_embeddings else "electra") + "/embeddings", reuse=tf.AUTO_REUSE, ): # Add positional embeddings and token type embeddings, then layer # normalize and perform dropout. self.embedding_output = embedding_postprocessor( input_tensor=self.token_embeddings, use_token_type=True, token_type_ids=token_type_ids, token_type_vocab_size=bert_config.type_vocab_size, token_type_embedding_name="token_type_embeddings", use_position_embeddings=True, position_embedding_name="position_embeddings", initializer_range=bert_config.initializer_range, max_position_embeddings=bert_config.max_position_embeddings, dropout_prob=bert_config.hidden_dropout_prob, ) else: self.embedding_output = input_reprs if not update_embeddings: self.embedding_output = tf.stop_gradient(self.embedding_output) with tf.variable_scope(scope, default_name="electra"): if self.embedding_output.shape[-1] != bert_config.hidden_size: self.embedding_output = tf.layers.dense( self.embedding_output, bert_config.hidden_size, name="embeddings_project", ) with tf.variable_scope("encoder"): # This converts a 2D mask of shape [batch_size, seq_length] to a 3D # mask of shape [batch_size, seq_length, seq_length] which is used # for the attention scores. attention_mask = create_attention_mask_from_input_mask( token_type_ids, input_mask ) # Run the stacked transformer. 
        # Output shapes:
        #   sequence_output: [batch_size, seq_length, hidden_size]
        #   pooled_output: [batch_size, hidden_size]
        #   all_encoder_layers: [n_layers, batch_size, seq_length, hidden_size]
        #   attn_maps: [n_layers, batch_size, n_heads, seq_length, seq_length]
        (self.all_layer_outputs, self.attn_maps) = transformer_model(
            input_tensor=self.embedding_output,
            attention_mask=attention_mask,
            hidden_size=bert_config.hidden_size,
            num_hidden_layers=bert_config.num_hidden_layers,
            num_attention_heads=bert_config.num_attention_heads,
            intermediate_size=bert_config.intermediate_size,
            intermediate_act_fn=get_activation(bert_config.hidden_act),
            hidden_dropout_prob=bert_config.hidden_dropout_prob,
            attention_probs_dropout_prob=bert_config.attention_probs_dropout_prob,
            initializer_range=bert_config.initializer_range,
            do_return_all_layers=True,
        )
        self.sequence_output = self.all_layer_outputs[-1]
        self.pooled_output = self.sequence_output[:, 0]
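# The encoder above relies on create_attention_mask_from_input_mask; the
# standard BERT helper looks roughly like this (reproduced from memory, so an
# assumption rather than this repo's exact code):
def create_attention_mask_from_input_mask(from_tensor, to_mask):
    """Broadcast a [batch, to_seq] padding mask to [batch, from_seq, to_seq]."""
    from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
    batch_size, from_seq_length = from_shape[0], from_shape[1]
    to_seq_length = get_shape_list(to_mask, expected_rank=2)[1]
    to_mask = tf.cast(
        tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)
    # Ones in the from_seq dimension broadcast the key-side mask to every query.
    broadcast_ones = tf.ones([batch_size, from_seq_length, 1], tf.float32)
    return broadcast_ones * to_mask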
def model_fn(features, labels, mode, params): # pylint: disable=unused-argument """The `model_fn` for TPUEstimator.""" tf.logging.info("*** Features ***") for name in sorted(features.keys()): tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) input_ids = features["input_ids"] label_ids = features["label_ids"] if "is_real_example" in features: is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) else: is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) is_training = (mode == tf.estimator.ModeKeys.TRAIN) # Create model with aux loss model = GroverModel( config=config, is_training=is_training, input_ids=input_ids, pad_token_id=config.pad_token_id, chop_off_last_token=False, ) with tf.variable_scope('classification'): hidden_state = model.pooled_output(pool_token_id) if is_training: hidden_state = dropout(hidden_state, dropout_prob=0.1) logits = tf.layers.dense( hidden_state, num_labels, kernel_initializer=create_initializer(config.initializer_range), name='logits' ) log_probs = tf.nn.log_softmax(logits, axis=-1) one_hot_labels = tf.one_hot(label_ids, depth=num_labels, dtype=tf.float32) per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) class_loss = tf.reduce_mean(per_example_loss) total_loss = lm_loss_coef * model.lm_loss() + class_loss if is_training: train_op, train_metrics = optimization_adafactor.create_optimizer( total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) # tvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) tvars = tf.trainable_variables() train_metrics['minibatch_cls_loss'] = class_loss train_metrics['minibatch_acc'] = tf.reduce_mean( tf.cast(tf.equal(tf.argmax(logits, axis=-1, output_type=tf.int32), label_ids), tf.float32)) else: train_op = None train_metrics = {} tvars = tf.trainable_variables() initialized_variable_names = {} scaffold_fn = None if init_checkpoint: (assignment_map, initialized_variable_names ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint) if use_tpu: def tpu_scaffold(): tf.train.init_from_checkpoint(init_checkpoint, assignment_map) return tf.train.Scaffold() scaffold_fn = tpu_scaffold else: tf.train.init_from_checkpoint(init_checkpoint, assignment_map) tf.logging.info("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, init_string) output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: if use_tpu: output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, host_call=construct_scalar_host_call(metric_dict=train_metrics, model_dir=params['model_dir'], prefix='training/'), scaffold_fn=scaffold_fn) else: output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, train_op=train_op, training_hooks=[ tf.train.LoggingTensorHook({'loss': tf.metrics.mean(total_loss)[1]}, every_n_iter=100)], scaffold_fn=scaffold_fn) elif mode == tf.estimator.ModeKeys.EVAL: def metric_fn(per_example_loss, label_ids, logits, is_real_example): predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) accuracy = tf.metrics.accuracy( labels=label_ids, predictions=predictions, weights=is_real_example) loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) return { "eval_accuracy": accuracy, "eval_loss": loss, } eval_metrics = (metric_fn, [per_example_loss, label_ids, logits, is_real_example]) output_spec = tf.contrib.tpu.TPUEstimatorSpec( 
mode=mode, loss=total_loss, eval_metrics=eval_metrics, scaffold_fn=scaffold_fn) else: output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, predictions={'logits': logits, 'probs': tf.nn.softmax(logits, axis=-1)}, scaffold_fn=scaffold_fn) return output_spec
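# A model_fn like the one above is typically handed to TPUEstimator along
# these lines (a sketch; the run-config values and input_fn names are
# placeholders, not values from this repo):
run_config = tf.contrib.tpu.RunConfig(
    model_dir='/tmp/model',
    tpu_config=tf.contrib.tpu.TPUConfig(iterations_per_loop=1000))
estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=use_tpu,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=32,
    eval_batch_size=32,
    params={'model_dir': '/tmp/model'})
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)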
def __init__(self, item_num, args, reuse=None): self.args = args self.is_training = tf.placeholder(tf.bool, shape=()) self.input_seq = tf.placeholder(tf.int32, shape=(None, args.maxlen)) self.pos = tf.placeholder(tf.int32, shape=None) self.exemplar_logits = tf.placeholder(tf.float32, shape=(None, None)) self.exemplar_pos = tf.placeholder(tf.int32, shape=None) self.max_item = tf.placeholder(tf.int32, shape=()) self.lr = tf.placeholder(tf.float32, shape=()) self.dropout_rate = tf.placeholder(tf.float32, shape=()) pos = self.pos mask = tf.expand_dims(tf.to_float(tf.not_equal(self.input_seq, 0)), -1) with tf.variable_scope("SASRec", reuse=reuse): # sequence embedding, item embedding table self.seq, item_emb_table = embedding(self.input_seq, vocab_size=item_num + 1, num_units=args.hidden_units, zero_pad=True, scale=True, l2_reg=args.l2_emb, scope="input_embeddings", with_t=True, reuse=reuse ) # # Positional Encoding t, pos_emb_table = embedding( tf.tile(tf.expand_dims(tf.range(tf.shape(self.input_seq)[1]), 0), [tf.shape(self.input_seq)[0], 1]), vocab_size=args.maxlen, num_units=args.hidden_units, zero_pad=False, scale=False, l2_reg=args.l2_emb, scope="dec_pos", reuse=reuse, with_t=True ) self.seq += t # Dropout self.seq = tf.layers.dropout(self.seq, rate=self.dropout_rate, training=tf.convert_to_tensor(self.is_training), seed=args.random_seed) self.seq *= mask # Build blocks for i in range(args.num_blocks): with tf.variable_scope("num_blocks_%d" % i): # Self-attention self.seq = multihead_attention(queries=normalize(self.seq), keys=self.seq, num_units=args.hidden_units, num_heads=args.num_heads, dropout_rate=self.dropout_rate, seed=args.random_seed, is_training=self.is_training, causality=True, scope="self_attention") # Feed forward self.seq = feedforward(normalize(self.seq), num_units=[args.hidden_units, args.hidden_units], dropout_rate=self.dropout_rate, is_training=self.is_training, seed=args.random_seed) self.seq *= mask self.seq = normalize(self.seq) # find representation self.rep = self.seq[:, -1, :] # define loss seq_emb = tf.reshape(self.rep, [tf.shape(self.input_seq)[0], args.hidden_units]) indices = pos - 1 self.labels = tf.one_hot(indices, self.max_item) item_emb = tf.nn.embedding_lookup(item_emb_table, tf.range(1, self.max_item + 1)) self.logits = tf.matmul(seq_emb, tf.transpose(item_emb)) self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=self.labels, logits=self.logits)) self.global_step = tf.Variable(0, name='global_step', trainable=False) self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr) # prediction self.test_item = tf.placeholder(tf.int32, shape=None) self.test_item_emb = tf.nn.embedding_lookup(item_emb_table, self.test_item) self.test_logits = tf.matmul(seq_emb, tf.transpose(self.test_item_emb)) self.test_logits = tf.reshape(self.test_logits, [tf.shape(self.input_seq)[0], tf.shape(self.test_item)[0]]) self.pred_last = tf.argsort(tf.argsort(-self.test_logits))
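# Note: the constructor above creates self.optimizer but never attaches a
# train op; a subclass or later code presumably defines one. A minimal sketch
# of that step (an assumption, not this repo's code):
#
#     self.train_op = self.optimizer.minimize(
#         self.loss, global_step=self.global_step)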
def call(self, inputs, training=True, features_only=None, pooled_features_only=False): """Implementation of call(). Args: inputs: input tensors. training: boolean, whether the model is constructed for training. features_only: build the base feature network only. pooled_features_only: build the base network for features extraction (after 1x1 conv layer and global pooling, but before dropout and fc head). Returns: output tensors. """ outputs = None self.endpoints = {} reduction_idx = 0 # Calls Stem layers with tf.variable_scope('stem'): outputs = self._relu_fn( self._bn0(self._conv_stem(inputs), training=training)) logging.info('Built stem layers with output shape: %s', outputs.shape) self.endpoints['stem'] = outputs # Calls blocks. for idx, block in enumerate(self._blocks): is_reduction = False # reduction flag for blocks after the stem layer # If the first block has space-to-depth layer, then stem is # the first reduction point. if (block.block_args().space2depth == 1 and idx == 0): reduction_idx += 1 self.endpoints['reduction_%s' % reduction_idx] = outputs elif ((idx == len(self._blocks) - 1) or self._blocks[idx + 1].block_args().strides[0] > 1): is_reduction = True reduction_idx += 1 with tf.variable_scope('blocks_%s' % idx): survival_prob = self._global_params.survival_prob if survival_prob: drop_rate = 1.0 - survival_prob survival_prob = 1.0 - drop_rate * float(idx) / len(self._blocks) logging.info('block_%s survival_prob: %s', idx, survival_prob) outputs = block.call( outputs, training=training, survival_prob=survival_prob) self.endpoints['block_%s' % idx] = outputs if is_reduction: self.endpoints['reduction_%s' % reduction_idx] = outputs if block.endpoints: for k, v in six.iteritems(block.endpoints): self.endpoints['block_%s/%s' % (idx, k)] = v if is_reduction: self.endpoints['reduction_%s/%s' % (reduction_idx, k)] = v self.endpoints['features'] = outputs if not features_only: # Calls final layers and returns logits. with tf.variable_scope('head'): outputs = self._relu_fn( self._bn1(self._conv_head(outputs), training=training)) self.endpoints['head_1x1'] = outputs if self._global_params.local_pooling: shape = outputs.get_shape().as_list() kernel_size = [ 1, shape[self._spatial_dims[0]], shape[self._spatial_dims[1]], 1] outputs = tf.nn.avg_pool( outputs, ksize=kernel_size, strides=[1, 1, 1, 1], padding='VALID') self.endpoints['pooled_features'] = outputs if not pooled_features_only: if self._dropout: outputs = self._dropout(outputs, training=training) self.endpoints['global_pool'] = outputs if self._fc: outputs = tf.squeeze(outputs, self._spatial_dims) outputs = self._fc(outputs) self.endpoints['head'] = outputs else: outputs = self._avg_pooling(outputs) self.endpoints['pooled_features'] = outputs if not pooled_features_only: if self._dropout: outputs = self._dropout(outputs, training=training) self.endpoints['global_pool'] = outputs if self._fc: outputs = self._fc(outputs) self.endpoints['head'] = outputs return outputs
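# The per-block survival_prob computed above feeds stochastic depth; the
# standard EfficientNet drop_connect utility is roughly the following
# (reproduced from memory, so an assumption rather than this repo's code):
def drop_connect(inputs, is_training, survival_prob):
    """Drops the whole residual branch per example with prob 1 - survival_prob."""
    if not is_training:
        return inputs
    batch_size = tf.shape(inputs)[0]
    # One Bernoulli draw per example, broadcast over H, W, C.
    random_tensor = survival_prob
    random_tensor += tf.random_uniform([batch_size, 1, 1, 1], dtype=inputs.dtype)
    binary_tensor = tf.floor(random_tensor)
    # Rescale kept activations so the expected value is unchanged.
    return inputs / survival_prob * binary_tensor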
def __init__( self, config, is_training, input_ids, attention_mask=None, token_type_ids=None, return_pool=True, scope=None, reuse=False, compute_type=tf.float32 ): super().__init__(config, is_training) input_shape = model_utils.get_shape_list(input_ids, expected_rank=2) batch_size = input_shape[0] seq_length = input_shape[1] if attention_mask is None: attention_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int64) if token_type_ids is None: token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int64) with tf.variable_scope( scope, default_name="bert", reuse=tf.AUTO_REUSE if reuse else None, custom_getter=model_utils.get_custom_getter(compute_type)): with tf.variable_scope("embeddings"): self.embedding_output, self.embedding_table = bert_embedding( config=self.config, input_ids=input_ids, token_type_ids=token_type_ids, add_position_embedding=True ) with tf.variable_scope("encoder"): attention_mask = model_utils.create_bert_mask( input_ids, attention_mask) if model_utils.get_shape_list(self.embedding_output)[-1] != self.config.hidden_size: self.embedding_output = layers.dense( self.embedding_output, self.config.hidden_size, 'embedding_hidden_mapping_in', initializer_range=self.config.initializer_range ) encoder_outputs = bert_encoder( input_tensor=tf.saturate_cast(self.embedding_output, compute_type), attention_mask=attention_mask, config=self.config, use_relative_position=False ) if return_pool: with tf.variable_scope("pooler"): pooled_output = layers.pooler_layer( sequence_output=encoder_outputs[0], hidden_size=self.config.hidden_size, initializer_range=self.config.initializer_range ) else: pooled_output = None # (pooled output, sequence output, all layer outputs, all layer att probs) self.outputs = (pooled_output,) + encoder_outputs
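# model_utils.get_custom_getter(compute_type) above presumably returns a
# getter implementing the usual mixed-precision pattern: store variables in
# float32, cast to the compute dtype on read. A sketch of that pattern (an
# assumption, not this repo's code):
def float32_storage_getter(getter, name, shape=None, dtype=None,
                           trainable=True, *args, **kwargs):
    # Always create (and update) trainable variables in float32 for stability.
    storage_dtype = tf.float32 if trainable else dtype
    var = getter(name, shape, dtype=storage_dtype, trainable=trainable,
                 *args, **kwargs)
    # Hand a casted view to the graph when a lower-precision dtype was requested.
    if dtype != var.dtype.base_dtype:
        var = tf.saturate_cast(var, dtype)
    return var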
def __init__(self, *, policy, nbatch_act, nbatch_train, nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size=None, np_mask=None): self.sess = sess = get_session() with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE): # CREATE OUR TWO MODELS # act_model that is used for sampling act_model = policy(nbatch_act, 1, sess, np_mask=np_mask, is_act_model=True) # Train model for training if microbatch_size is None: train_model = policy(nbatch_train, nsteps, sess, np_mask=np_mask, is_act_model=False) else: train_model = policy(microbatch_size, nsteps, sess, np_mask=np_mask, is_act_model=False) # CREATE THE PLACEHOLDERS self.A = A = train_model.pdtype.sample_placeholder([None]) self.ADV = ADV = tf.placeholder(tf.float32, [None]) self.R = R = tf.placeholder(tf.float32, [None]) # Keep track of old actor self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None]) # Keep track of old critic self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None]) self.LR = LR = tf.placeholder(tf.float32, []) # Cliprange self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, []) neglogpac = train_model.pd.neglogp(A) # Calculate the entropy # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy. entropy = tf.reduce_mean(train_model.pd.entropy()) # CALCULATE THE LOSS # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss # Clip the value to reduce variability during Critic training # Get the predicted value vpred = train_model.vf vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE) # Unclipped value vf_losses1 = tf.square(vpred - R) # Clipped value vf_losses2 = tf.square(vpredclipped - R) vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2)) # Calculate ratio (pi current policy / pi old policy) ratio = tf.exp(OLDNEGLOGPAC - neglogpac) # Defining Loss = - J is equivalent to max J pg_losses = -ADV * ratio pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) # Final PG loss pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)) approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC)) clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE))) # Total loss loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef # UPDATE THE PARAMETERS USING LOSS # 1. Get the model parameters params = tf.trainable_variables('ppo2_model') # 2. Build our trainer self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5) # 3. Calculate the gradients grads_and_var = self.trainer.compute_gradients(loss, params) grads, var = zip(*grads_and_var) if max_grad_norm is not None: # Clip the gradients (normalize) grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm) grads_and_var = list(zip(grads, var)) # zip aggregate each gradient with parameters associated # For instance zip(ABCD, xyza) => Ax, By, Cz, Da self.grads = grads self.var = var self._train_op = self.trainer.apply_gradients(grads_and_var) self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac'] self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac] self.train_model = train_model self.act_model = act_model self.step = act_model.step self.value = act_model.value self.initial_state = act_model.initial_state initialize()
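# Sketch of one PPO update feeding the placeholders above. baselines' own
# Model.train does roughly this; the numpy arguments and train_model.X are
# assumptions here, not code from this repo:
def train_step(model, lr, cliprange, obs, returns, actions, values, neglogpacs):
    # Advantages = returns - value baseline, normalized per batch.
    advs = returns - values
    advs = (advs - advs.mean()) / (advs.std() + 1e-8)
    td_map = {
        model.train_model.X: obs, model.A: actions, model.ADV: advs,
        model.R: returns, model.LR: lr, model.CLIPRANGE: cliprange,
        model.OLDNEGLOGPAC: neglogpacs, model.OLDVPRED: values,
    }
    # Returns [policy_loss, value_loss, policy_entropy, approxkl, clipfrac].
    return model.sess.run(model.stats_list + [model._train_op], td_map)[:-1]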
## The construction phase

# Flatten the grid into column vectors of (x, t) collocation points.
zeros = tf.reshape(tf.convert_to_tensor(np.zeros(x.shape)), shape=(-1, 1))
x = tf.reshape(tf.convert_to_tensor(x), shape=(-1, 1))
t = tf.reshape(tf.convert_to_tensor(t), shape=(-1, 1))

points = tf.concat([x, t], 1)

num_iter = 10000
num_hidden_neurons = [20, 20, 20]

X = tf.convert_to_tensor(X)
T = tf.convert_to_tensor(T)

with tf.variable_scope('dnn'):
    num_hidden_layers = np.size(num_hidden_neurons)

    # Fully connected feed-forward network with sigmoid activations.
    previous_layer = points
    for l in range(num_hidden_layers):
        current_layer = tf.layers.dense(previous_layer,
                                        num_hidden_neurons[l],
                                        activation=tf.nn.sigmoid)
        previous_layer = current_layer

    dnn_output = tf.layers.dense(previous_layer, 1)


def initial_conditions(x):
    return tf.sin(np.pi * x)
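# Sketch of how dnn_output is typically turned into a PDE loss for the 1D
# diffusion equation u_t = u_xx with u(x, 0) = sin(pi*x). The trial function
# below enforces the initial and boundary conditions by construction; this is
# an illustrative continuation, not necessarily the loss used later:
g_trial = (1 - t) * initial_conditions(x) + x * (1 - x) * t * dnn_output
g_trial_dt = tf.gradients(g_trial, t)[0]
g_trial_d2x = tf.gradients(tf.gradients(g_trial, x)[0], x)[0]
# The residual u_t - u_xx should vanish at every collocation point.
loss = tf.losses.mean_squared_error(zeros, g_trial_dt - g_trial_d2x)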
def multihead_attention(queries,
                        keys,
                        times=None,
                        num_units=None,
                        num_heads=1,
                        dropout_rate=0,
                        is_training=True,
                        use_prior="none",
                        causality=True,
                        scope="multihead_attention",
                        residual=False,
                        time_exp_base=None,
                        overlapping_chunks=None,
                        reuse=None,
                        with_qk=False):
  """Applies multihead attention.

  Args:
    queries: A 3d tensor with shape of [N, T_q, C_q].
    keys: A 3d tensor with shape of [N, T_k, C_k].
    times: A 3d tensor with shape of [N, T_q, T_k].
    num_units: A scalar. Attention size.
    num_heads: An int. Number of heads.
    dropout_rate: A floating point number.
    is_training: Boolean. Controller of mechanism for dropout.
    use_prior: String. Whether to use a prior for the attention heads.
      Supported values include: none, position, time.
    causality: Boolean. If true, units that reference the future are masked.
    scope: Optional scope for `variable_scope`.
    residual: Boolean. Whether to use a residual connection.
    time_exp_base: A scalar. Base for exponential time intervals. Only used
      for the case where use_prior='time'.
    overlapping_chunks: Boolean. Whether to use (non-)overlapping chunks for
      the case where use_prior='time'.
    reuse: Boolean, whether to reuse the weights of a previous layer by the
      same name.
    with_qk: Boolean. If true, return the query and key projections instead
      of the attention output.

  Returns:
    A 3d tensor with shape of (N, T_q, C), the output of multihead attention,
    or the (Q, K) projections if with_qk is true.
  """
  tf.logging.info(
      "Computing attention with prior: {} and num of heads: {}".format(
          use_prior, num_heads))
  with tf.variable_scope(scope, reuse=reuse):
    # Set the fall-back option for num_units.
    if num_units is None:
      num_units = queries.get_shape().as_list()[-1]

    # pylint: disable=invalid-name
    # Linear projections
    Q = tf.layers.dense(queries, num_units, activation=None)  # (N, T_q, C)
    K = tf.layers.dense(keys, num_units, activation=None)  # (N, T_k, C)
    V = tf.layers.dense(keys, num_units, activation=None)  # (N, T_k, C)

    # Split and concat
    Q_ = tf.concat(tf.split(Q, num_heads, axis=2), axis=0)  # (h*N, T_q, C/h)
    K_ = tf.concat(tf.split(K, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)
    V_ = tf.concat(tf.split(V, num_heads, axis=2), axis=0)  # (h*N, T_k, C/h)
    # pylint: enable=invalid-name

    # Multiplication
    outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))  # (h*N, T_q, T_k)

    # Scale
    outputs = outputs / (K_.get_shape().as_list()[-1]**0.5)

    # Key Masking
    key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1)))  # (N, T_k)
    key_masks = tf.tile(key_masks, [num_heads, 1])  # (h*N, T_k)
    key_masks = tf.tile(tf.expand_dims(key_masks, 1),
                        [1, tf.shape(queries)[1], 1])  # (h*N, T_q, T_k)

    paddings = tf.ones_like(outputs) * (-2**32 + 1)
    outputs = tf.where(tf.equal(key_masks, 0), paddings,
                       outputs)  # (h*N, T_q, T_k)

    # Causality = Future blinding
    if causality:
      diag_vals = tf.ones_like(outputs[0, :, :])  # (T_q, T_k)
      tril = tf.linalg.LinearOperatorLowerTriangular(
          diag_vals).to_dense()  # (T_q, T_k)
      masks = tf.tile(tf.expand_dims(tril, 0),
                      [tf.shape(outputs)[0], 1, 1])  # (h*N, T_q, T_k)

      paddings = tf.ones_like(masks) * (-2**32 + 1)
      outputs = tf.where(tf.equal(masks, 0), paddings,
                         outputs)  # (h*N, T_q, T_k)

    # Position/Time prior is only used in the multi-head case.
    if num_heads > 1:

      # Scaling head weights with position prior.
      if use_prior == "position":
        # Each head focuses on a window of items whose size is computed below.
        attn_size = int(outputs.get_shape().as_list()[-1] / num_heads)
        outputs = tf.concat(
            _compute_head_weights_with_position_prior(
                outputs, masks, paddings, num_heads, attn_size),
            axis=0)  # (H*N, T_q, T_k)
        tf.logging.info("After position-wise sliding window attention.")
        tf.logging.info(outputs.shape)

      # Scaling head weights with time prior.
      elif use_prior == "time":
        # Convert time deltas from seconds to days.
        if times is None:
          raise ValueError("Times tensor is needed.")
        time_deltas = _compute_time_deltas(times) / SECS_TO_DAYS
        outputs = tf.concat(
            _compute_head_weights_with_time_prior(
                outputs, paddings, time_deltas, num_heads, time_exp_base,
                overlapping_chunks),
            axis=0)  # (H*N, T_q, T_k)

    # Activation
    outputs = tf.nn.softmax(outputs)  # (h*N, T_q, T_k)

    # Query Masking
    query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1)))  # (N, T_q)
    query_masks = tf.tile(query_masks, [num_heads, 1])  # (h*N, T_q)
    query_masks = tf.tile(tf.expand_dims(query_masks, -1),
                          [1, 1, tf.shape(keys)[1]])  # (h*N, T_q, T_k)
    outputs *= query_masks  # broadcasting. (h*N, T_q, T_k)

    # Dropouts
    outputs = tf.layers.dropout(outputs, rate=dropout_rate,
                                training=tf.convert_to_tensor(is_training))

    # Weighted sum
    outputs = tf.matmul(outputs, V_)  # (h*N, T_q, C/h)

    # Restore shape
    outputs = tf.concat(tf.split(outputs, num_heads, axis=0),
                        axis=2)  # (N, T_q, C)

    # Residual connection
    if residual:
      outputs += queries

  if with_qk:
    return Q, K
  else:
    return outputs
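# Minimal usage sketch with toy shapes (illustrative, not from this repo):
toy_queries = tf.placeholder(tf.float32, [None, 20, 64])   # (N, T_q, C)
toy_out = multihead_attention(queries=toy_queries, keys=toy_queries,
                              num_units=64, num_heads=4,
                              dropout_rate=0.1, is_training=True,
                              causality=True)               # (N, 20, 64)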
def build_fc_densenet(inputs, num_classes, preset_model='FC-DenseNet56', n_filters_first_conv=48, n_pool=5, growth_rate=12, n_layers_per_block=4, dropout_p=0.2, scope=None, is_training=True): """ Builds the FC-DenseNet model Arguments: inputs: the input tensor preset_model: The model you want to use n_classes: number of classes n_filters_first_conv: number of filters for the first convolution applied n_pool: number of pooling layers = number of transition down = number of transition up growth_rate: number of new feature maps created by each layer in a dense block n_layers_per_block: number of layers per block. Can be an int or a list of size 2 * n_pool + 1 dropout_p: dropout rate applied after each convolution (0. for not using) Returns: Fc-DenseNet model """ if not is_training: #No dropout when predicting dropout_p = 0 if preset_model == 'FC-DenseNet56': n_pool = 5 growth_rate = 12 n_layers_per_block = 4 elif preset_model == 'FC-DenseNet67': n_pool = 5 growth_rate = 16 n_layers_per_block = 5 elif preset_model == 'FC-DenseNet103': n_pool = 5 growth_rate = 16 n_layers_per_block = [4, 5, 7, 10, 12, 15, 12, 10, 7, 5, 4] else: raise ValueError( "Unsupported FC-DenseNet model '%s'. This function only supports FC-DenseNet56, FC-DenseNet67, and FC-DenseNet103" % (preset_model)) if type(n_layers_per_block) == list: assert (len(n_layers_per_block) == 2 * n_pool + 1) elif type(n_layers_per_block) == int: n_layers_per_block = [n_layers_per_block] * (2 * n_pool + 1) else: raise ValueError with tf.variable_scope(scope, preset_model, [inputs]) as sc: ##################### # First Convolution # ##################### # We perform a first convolution. stack = slim.conv2d(inputs, n_filters_first_conv, [3, 3], scope='first_conv', activation_fn=None) n_filters = n_filters_first_conv ##################### # Downsampling path # ##################### skip_connection_list = [] for i in range(n_pool): # Dense Block stack, _ = DenseBlock(stack, n_layers_per_block[i], growth_rate, dropout_p, scope='denseblock%d' % (i + 1)) n_filters += growth_rate * n_layers_per_block[i] # At the end of the dense block, the current stack is stored in the skip_connections list skip_connection_list.append(stack) # Transition Down stack = TransitionDown(stack, n_filters, dropout_p, scope='transitiondown%d' % (i + 1)) skip_connection_list = skip_connection_list[::-1] ##################### # Bottleneck # ##################### # Dense Block # We will only upsample the new feature maps stack, block_to_upsample = DenseBlock(stack, n_layers_per_block[n_pool], growth_rate, dropout_p, scope='denseblock%d' % (n_pool + 1)) ####################### # Upsampling path # ####################### for i in range(n_pool): # Transition Up ( Upsampling + concatenation with the skip connection) n_filters_keep = growth_rate * n_layers_per_block[n_pool + i] stack = TransitionUp(block_to_upsample, skip_connection_list[i], n_filters_keep, scope='transitionup%d' % (n_pool + i + 1)) # Dense Block # We will only upsample the new feature maps stack, block_to_upsample = DenseBlock( stack, n_layers_per_block[n_pool + i + 1], growth_rate, dropout_p, scope='denseblock%d' % (n_pool + i + 2)) ##################### # Softmax # ##################### net = slim.conv2d(stack, num_classes, [1, 1], activation_fn=None, scope='logits') return net
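# Sketch of the DenseBlock used above, modeled on common FC-DenseNet
# implementations (treat the exact layer ordering as an assumption):
def DenseBlock(stack, n_layers, growth_rate, dropout_p, scope=None):
    """Returns (stack with new features concatenated, the new features only)."""
    with tf.name_scope(scope):
        new_features = []
        for j in range(n_layers):
            # Pre-activation: BN -> ReLU -> 3x3 conv (-> dropout), growth_rate maps.
            layer = slim.batch_norm(stack, activation_fn=tf.nn.relu)
            layer = slim.conv2d(layer, growth_rate, [3, 3], activation_fn=None)
            if dropout_p != 0.0:
                layer = slim.dropout(layer, keep_prob=1.0 - dropout_p)
            new_features.append(layer)
            stack = tf.concat([stack, layer], axis=-1)  # dense connectivity
        return stack, tf.concat(new_features, axis=-1)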
def embedding(inputs,
              vocab_size,
              num_units,
              zero_pad=True,
              scale=True,
              l2_reg=0.0,
              scope="embedding",
              with_t=False,
              reuse=None):
    """Embeds a given tensor.

    Args:
      inputs: A `Tensor` with type `int32` or `int64` containing the ids to be
        looked up in the `lookup table`.
      vocab_size: An int. Vocabulary size.
      num_units: An int. Number of embedding hidden units.
      zero_pad: A boolean. If True, all the values of the first row (id 0)
        should be constant zeros.
      scale: A boolean. If True, the outputs are multiplied by sqrt(num_units).
      l2_reg: L2 regularization weight.
      scope: Optional scope for `variable_scope`.
      with_t: If True, also return the embedding table.
      reuse: Boolean, whether to reuse the weights of a previous layer by the
        same name.

    Returns:
      A `Tensor` with one more rank than inputs'. The last dimensionality
      should be `num_units`.

    For example,

    ```
    import tensorflow as tf

    inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3)))
    outputs = embedding(inputs, 6, 2, zero_pad=True)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run(outputs))
    >>
    [[[ 0.          0.        ]
      [ 0.09754146  0.67385566]
      [ 0.37864095 -0.35689294]]
     [[-1.01329422 -1.09939694]
      [ 0.7521342   0.38203377]
      [-0.04973143 -0.06210355]]]
    ```

    ```
    import tensorflow as tf

    inputs = tf.to_int32(tf.reshape(tf.range(2*3), (2, 3)))
    outputs = embedding(inputs, 6, 2, zero_pad=False)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print(sess.run(outputs))
    >>
    [[[-0.19172323 -0.39159766]
      [-0.43212751 -0.66207761]
      [ 1.03452027 -0.26704335]]
     [[-0.11634696 -0.35983452]
      [ 0.50208133  0.53509563]
      [ 1.22204471 -0.96587461]]]
    ```
    """
    with tf.variable_scope(scope, reuse=reuse):
        lookup_table = tf.get_variable(
            "lookup_table",
            dtype=tf.float32,
            shape=[vocab_size, num_units],
            # initializer=tf.contrib.layers.xavier_initializer(),
            regularizer=tf.keras.regularizers.l2(l2_reg))
        if zero_pad:
            # Pin the row for id 0 (padding) to zeros so padding never
            # contributes to the lookup output.
            lookup_table = tf.concat((tf.zeros(shape=[1, num_units]),
                                      lookup_table[1:, :]), 0)
        outputs = tf.nn.embedding_lookup(lookup_table, inputs)

        if scale:
            outputs = outputs * (num_units**0.5)

    if with_t:
        return outputs, lookup_table
    else:
        return outputs
def find_trainable_variables(key):
    # Entering a variable_scope does not filter tf.trainable_variables();
    # pass the scope name so only variables created under `key` are returned.
    return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=key)
def build_feature_network(features, config): """Build FPN input features. Args: features: input tensor. config: a dict-like config, including all parameters. Returns: A dict from levels to the feature maps processed after feature network. """ feats = [] if config.min_level not in features.keys(): raise ValueError( 'features.keys ({}) should include min_level ({})'.format( features.keys(), config.min_level)) # Build additional input features that are not from backbone. for level in range(config.min_level, config.max_level + 1): if level in features.keys(): feats.append(features[level]) else: # Adds a coarser level by downsampling the last feature map. feats.append( resample_feature_map( feats[-1], name='p%d' % level, target_width=feats[-1].shape[1] // 2, target_num_channels=config.fpn_num_filters, apply_bn=config.apply_bn_for_resampling, is_training=config.is_training_bn, conv_after_downsample=config.conv_after_downsample, use_native_resize_op=config.use_native_resize_op, pooling_type=config.pooling_type)) _verify_feats_size(feats, input_size=config.image_size, min_level=config.min_level, max_level=config.max_level) with tf.variable_scope('fpn_cells'): for rep in range(config.fpn_cell_repeats): with tf.variable_scope('cell_{}'.format(rep)): logging.info('building cell %d', rep) new_feats = build_bifpn_layer( feats=feats, fpn_name=config.fpn_name, fpn_config=config.fpn_config, input_size=config.image_size, fpn_num_filters=config.fpn_num_filters, min_level=config.min_level, max_level=config.max_level, separable_conv=config.separable_conv, is_training=config.is_training_bn, apply_bn_for_resampling=config.apply_bn_for_resampling, conv_after_downsample=config.conv_after_downsample, use_native_resize_op=config.use_native_resize_op, conv_bn_relu_pattern=config.conv_bn_relu_pattern, pooling_type=config.pooling_type) feats = [ new_feats[level] for level in range(config.min_level, config.max_level + 1) ] _verify_feats_size(feats, input_size=config.image_size, min_level=config.min_level, max_level=config.max_level) return new_feats
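# Sketch of the _verify_feats_size check called above (reconstructed from the
# expected per-level widths; the exact error text is an assumption):
def _verify_feats_size(feats, input_size, min_level, max_level):
    expected_widths = [int(input_size / 2**l)
                       for l in range(min_level, max_level + 1)]
    for cnt, width in enumerate(expected_widths):
        if feats[cnt].shape[1] != width:
            raise ValueError(
                'feats[{}] has shape {} but its width should be {} '
                '(input_size: {}, min_level: {}, max_level: {}).'.format(
                    cnt, feats[cnt].shape, width, input_size, min_level,
                    max_level))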
def construct_model(self, input_tensors=None, prefix='metatrain_', test_num_updates=0): """a: training data for inner gradient, b: test data for meta gradient.""" self.inputa = input_tensors['inputa'] self.inputb = input_tensors['inputb'] self.labela = input_tensors['labela'] self.labelb = input_tensors['labelb'] with tf.variable_scope('model', reuse=None) as training_scope: if 'weights' in dir(self): training_scope.reuse_variables() weights = self.weights else: # Define the weights self.weights = weights = self.construct_weights() # outputbs[i] and lossesb[i] is the output and loss after i+1 gradient # updates num_updates = max(test_num_updates, FLAGS.num_updates) def task_metalearn(inp, reuse=True): """Run meta learning.""" TRAIN = 'train' in prefix # pylint: disable=invalid-name # Perform gradient descent for one task in the meta-batch. inputa, inputb, labela, labelb = inp task_outputbs, task_lossesb = [], [] task_msesb = [] # support_pred and loss, (n_data_per_task, out_dim) task_outputa = self.forward( inputa, weights, reuse=reuse) # only not reuse on the first iter # labela is (n_data_per_task, out_dim) task_lossa = self.loss_func(task_outputa, labela) # INNER LOOP (no change with ib) grads = tf.gradients(task_lossa, list(weights.values())) if FLAGS.stop_grad: grads = [tf.stop_gradient(grad) for grad in grads] gradients = dict(zip(weights.keys(), grads)) # theta_pi = theta - alpha * grads fast_weights = dict( zip(weights.keys(), [ weights[key] - self.update_lr * gradients[key] for key in weights.keys() ])) # use theta_pi to forward meta-test output = self.forward(inputb, weights, reuse=True) task_outputbs.append(output) # meta-test loss task_kl_loss = sum(self.encoder_w.losses) task_msesb.append(self.loss_func(output, labelb)) task_lossesb.append( self.loss_func(output, labelb) + self.beta * task_kl_loss) def while_body(fast_weights_values): """Update params.""" loss = self.loss_func( self.forward( inputa, dict(zip(fast_weights.keys(), fast_weights_values)), reuse=True), labela) grads = tf.gradients(loss, fast_weights_values) fast_weights_values = [ v - self.update_lr * g for v, g in zip(fast_weights_values, grads) ] return fast_weights_values fast_weights_values = tf.while_loop( lambda _: True, while_body, loop_vars=[fast_weights.values()], maximum_iterations=num_updates - 1, back_prop=TRAIN) fast_weights = dict(zip(fast_weights.keys(), fast_weights_values)) output = self.forward(inputb, fast_weights, reuse=True) task_outputbs.append(output) task_msesb.append(self.loss_func(output, labelb)) task_lossesb.append( self.loss_func(output, labelb) + self.beta * task_kl_loss) task_output = [ task_outputa, task_outputbs, task_lossa, task_lossesb, task_msesb ] return task_output if FLAGS.norm is not None: # to initialize the batch norm vars, might want to combine this, and # not run idx 0 twice. 
_ = task_metalearn( (self.inputa[0], self.inputb[0], self.labela[0], self.labelb[0]), False) out_dtype = [ tf.float32, [tf.float32] * 2, tf.float32, [tf.float32] * 2, [tf.float32] * 2 ] result = tf.map_fn(task_metalearn, elems=(self.inputa, self.inputb, \ self.labela, self.labelb), dtype=out_dtype, \ parallel_iterations=FLAGS.meta_batch_size) outputas, outputbs, lossesa, lossesb, msesb = result ## Performance & Optimization if 'train' in prefix: # lossesa is length(meta_batch_size) self.total_loss1 = tf.reduce_sum(lossesa) / tf.to_float( FLAGS.meta_batch_size) self.total_losses2 = total_losses2 = [ tf.reduce_sum(msesb[j]) / tf.to_float(FLAGS.meta_batch_size) for j in range(len(msesb)) ] self.total_losses3 = total_losses3 = [ tf.reduce_sum(lossesb[j]) / tf.to_float(FLAGS.meta_batch_size) for j in range(len(lossesb)) ] # after the map_fn self.outputas, self.outputbs = outputas, outputbs # OUTER LOOP if FLAGS.metatrain_iterations > 0: optimizer = tf.train.AdamOptimizer(self.meta_lr) THETA = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model') # pylint: disable=invalid-name PHI = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='encoder') # pylint: disable=invalid-name self.gvs_theta = gvs_theta = optimizer.compute_gradients( self.total_losses2[-1], THETA) metatrain_theta_op = optimizer.apply_gradients(gvs_theta) self.gvs_phi = gvs_phi = optimizer.compute_gradients( self.total_losses3[-1], PHI) metatrain_phi_op = optimizer.apply_gradients(gvs_phi) with tf.control_dependencies([metatrain_theta_op, metatrain_phi_op]): self.metatrain_op = tf.no_op() scale_v = [ v for v in self.encoder_w.trainable_variables if 'scale' in v.name ] scale_norm = [tf.reduce_mean(v) for v in scale_v] scale_norm = tf.reduce_mean(scale_norm) tf.summary.scalar(prefix + 'full_loss', total_losses3[-1]) tf.summary.scalar(prefix + 'regularizer', total_losses3[-1] - total_losses2[-1]) tf.summary.scalar(prefix + 'untransformed_scale', scale_norm) else: self.metaval_total_loss1 = tf.reduce_sum( lossesa) / tf.to_float(FLAGS.meta_batch_size) self.metaval_total_losses2 = total_losses2 = [ tf.reduce_sum(msesb[j]) / tf.to_float(FLAGS.meta_batch_size) for j in range(len(msesb)) ] self.metaval_total_losses3 = total_losses3 = [ tf.reduce_sum(lossesb[j]) / tf.to_float(FLAGS.meta_batch_size) for j in range(len(lossesb)) ] tf.summary.scalar(prefix + 'Pre-mse', total_losses2[0]) tf.summary.scalar(prefix + 'Post-mse_' + str(num_updates), total_losses2[-1])
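# self.loss_func above is typically a plain mean-squared error for regression
# MAML; a sketch of that assumption:
def mse_loss(pred, label):
    pred = tf.reshape(pred, [-1])
    label = tf.reshape(label, [-1])
    return tf.reduce_mean(tf.square(pred - label))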
def build_bifpn_layer(feats, fpn_name, fpn_config, is_training, input_size, fpn_num_filters, min_level, max_level, separable_conv, apply_bn_for_resampling, conv_after_downsample, use_native_resize_op, conv_bn_relu_pattern, pooling_type): """Builds a feature pyramid given previous feature pyramid and config.""" config = fpn_config or get_fpn_config(fpn_name) num_output_connections = [0 for _ in feats] for i, fnode in enumerate(config.nodes): with tf.variable_scope('fnode{}'.format(i)): logging.info('fnode %d : %s', i, fnode) new_node_width = int(fnode['width_ratio'] * input_size) nodes = [] for idx, input_offset in enumerate(fnode['inputs_offsets']): input_node = feats[input_offset] num_output_connections[input_offset] += 1 input_node = resample_feature_map( input_node, '{}_{}_{}'.format(idx, input_offset, len(feats)), new_node_width, fpn_num_filters, apply_bn_for_resampling, is_training, conv_after_downsample, use_native_resize_op, pooling_type) nodes.append(input_node) # Combine all nodes. dtype = nodes[0].dtype if config.weight_method == 'attn': edge_weights = [ tf.cast(tf.Variable(1.0, name='WSM'), dtype=dtype) for _ in range(len(fnode['inputs_offsets'])) ] normalized_weights = tf.nn.softmax(tf.stack(edge_weights)) nodes = tf.stack(nodes, axis=-1) new_node = tf.reduce_sum( tf.multiply(nodes, normalized_weights), -1) elif config.weight_method == 'fastattn': edge_weights = [ tf.nn.relu( tf.cast(tf.Variable(1.0, name='WSM'), dtype=dtype)) for _ in range(len(fnode['inputs_offsets'])) ] weights_sum = tf.add_n(edge_weights) nodes = [ nodes[i] * edge_weights[i] / (weights_sum + 0.0001) for i in range(len(nodes)) ] new_node = tf.add_n(nodes) elif config.weight_method == 'sum': new_node = tf.add_n(nodes) else: raise ValueError('unknown weight_method {}'.format( config.weight_method)) with tf.variable_scope('op_after_combine{}'.format(len(feats))): if not conv_bn_relu_pattern: new_node = utils.relu_fn(new_node) if separable_conv: conv_op = functools.partial(tf.layers.separable_conv2d, depth_multiplier=1) else: conv_op = tf.layers.conv2d new_node = conv_op( new_node, filters=fpn_num_filters, kernel_size=(3, 3), padding='same', use_bias=True if not conv_bn_relu_pattern else False, name='conv') new_node = utils.batch_norm_relu( new_node, is_training_bn=is_training, relu=False if not conv_bn_relu_pattern else True, data_format='channels_last', name='bn') feats.append(new_node) num_output_connections.append(0) output_feats = {} for l in range(min_level, max_level + 1): for i, fnode in enumerate(reversed(config.nodes)): if fnode['width_ratio'] == F(l): output_feats[l] = feats[-1 - i] break return output_feats
def _build_model(self):
    """Build the core model within the graph."""
    assert self.mode == 'train' or self.mode == 'eval'

    with tf.variable_scope('input'):
        self.x_input = tf.placeholder(tf.float32, shape=[None, 32, 32, 3])
        self.y_input = tf.placeholder(tf.int32, shape=None)

        input_standardized = tf.map_fn(
            lambda img: tf.image.per_image_standardization(img), self.x_input)
        x = self._conv('init_conv', input_standardized,
                       3, 3, 16, self._stride_arr(1))

    strides = [1, 2, 2]
    activate_before_residual = [True, False, False]
    res_func = self._residual

    # This filter configuration is the w28-10 wide residual network
    # (https://arxiv.org/pdf/1605.07146v1.pdf). It is more memory efficient
    # than a very deep residual network and has comparably good performance.
    filters = [16, 160, 320, 640]

    with tf.variable_scope('unit_1_0'):
        x = res_func(x, filters[0], filters[1],
                     self._stride_arr(strides[0]),
                     activate_before_residual[0])
    for i in range(1, 5):
        with tf.variable_scope('unit_1_%d' % i):
            x = res_func(x, filters[1], filters[1], self._stride_arr(1), False)

    with tf.variable_scope('unit_2_0'):
        x = res_func(x, filters[1], filters[2],
                     self._stride_arr(strides[1]),
                     activate_before_residual[1])
    for i in range(1, 5):
        with tf.variable_scope('unit_2_%d' % i):
            x = res_func(x, filters[2], filters[2], self._stride_arr(1), False)

    with tf.variable_scope('unit_3_0'):
        x = res_func(x, filters[2], filters[3],
                     self._stride_arr(strides[2]),
                     activate_before_residual[2])
    for i in range(1, 5):
        with tf.variable_scope('unit_3_%d' % i):
            x = res_func(x, filters[3], filters[3], self._stride_arr(1), False)

    with tf.variable_scope('unit_last'):
        x = self._batch_norm('final_bn', x)
        x = self._relu(x, 0.1)
        x = self._global_avg_pool(x)

    with tf.variable_scope('logit'):
        self.logits = self._fully_connected(x, 10)

    self.predictions = tf.argmax(self.logits, 1, output_type=tf.int32)
    self.correct_prediction = tf.equal(self.predictions, self.y_input)
    self.num_correct = tf.reduce_sum(
        tf.cast(self.correct_prediction, tf.int32))
    self.accuracy = tf.reduce_mean(
        tf.cast(self.correct_prediction, tf.float32))
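# Sketches of two small helpers referenced above; these follow the usual
# wide-resnet implementation (assumptions, not this repo's code) and live as
# methods on the same class as _build_model:
def _stride_arr(self, stride):
    """Map a scalar stride to the [1, stride, stride, 1] form conv2d expects."""
    return [1, stride, stride, 1]

def _global_avg_pool(self, x):
    """Average over the two spatial dimensions, leaving [batch, channels]."""
    assert x.get_shape().ndims == 4
    return tf.reduce_mean(x, [1, 2])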
def resample_feature_map(feat, name, target_width, target_num_channels, apply_bn=False, is_training=None, conv_after_downsample=False, use_native_resize_op=False, pooling_type=None): """Resample input feature map to have target number of channels and width.""" _, width, _, num_channels = feat.get_shape().as_list() if width is None or num_channels is None: raise ValueError( 'shape[1] or shape[3] of feat is None (shape:{}).'.format( feat.shape)) if apply_bn and is_training is None: raise ValueError('If BN is applied, need to provide is_training') def _maybe_apply_1x1(feat): """Apply 1x1 conv to change layer width if necessary.""" if num_channels != target_num_channels: feat = tf.layers.conv2d(feat, filters=target_num_channels, kernel_size=(1, 1), padding='same') if apply_bn: feat = utils.batch_norm_relu(feat, is_training_bn=is_training, relu=False, data_format='channels_last', name='bn') return feat with tf.variable_scope('resample_{}'.format(name)): # If conv_after_downsample is True, when downsampling, apply 1x1 after # downsampling for efficiency. if width > target_width: if width % target_width != 0: raise ValueError('width ({}) is not divisible by ' 'target_width ({}).'.format( width, target_width)) if not conv_after_downsample: feat = _maybe_apply_1x1(feat) stride_size = int(width // target_width) if pooling_type == 'max' or pooling_type is None: # Use max pooling in default. feat = tf.layers.max_pooling2d( inputs=feat, pool_size=stride_size + 1, strides=[stride_size, stride_size], padding='SAME', data_format='channels_last') elif pooling_type == 'avg': feat = tf.layers.average_pooling2d( inputs=feat, pool_size=stride_size + 1, strides=[stride_size, stride_size], padding='SAME', data_format='channels_last') else: raise ValueError( 'Unknown pooling type: {}'.format(pooling_type)) if conv_after_downsample: feat = _maybe_apply_1x1(feat) else: if target_width % width != 0: raise ValueError('target_width ({}) is not divisible by ' 'width ({}).'.format(target_width, width)) feat = _maybe_apply_1x1(feat) if width < target_width: _, h, w, _ = feat.get_shape().as_list() scale = target_width // width if use_native_resize_op: feat = tf.image.resize_nearest_neighbor( feat, [h * scale, w * scale]) else: feat = nearest_upsampling(feat, scale=scale) return feat
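# Sketch of nearest_upsampling as used above: the TPU detection-model trick of
# upsampling via reshape + broadcast instead of tf.image resizing (reproduced
# from memory, so an assumption rather than this repo's code):
def nearest_upsampling(data, scale):
    """Nearest-neighbor upsample NHWC `data` by an integer `scale`."""
    with tf.name_scope('nearest_upsampling'):
        bs, h, w, c = data.get_shape().as_list()
        bs = -1 if bs is None else bs
        # Insert singleton axes and multiply by ones to repeat each pixel
        # `scale` times along both spatial dimensions.
        data = tf.reshape(data, [bs, h, 1, w, 1, c]) * tf.ones(
            [1, 1, scale, 1, scale, 1], dtype=data.dtype)
        return tf.reshape(data, [bs, h * scale, w * scale, c])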
def _get_action_logits(encoder_output, decoder_output, output_vocab_embeddings_table, output_vocab_size, model_config, input_copy_mask=None, use_gating_mechanism=True): """Generate output logits given decoder output. This effectively combines a Pointer Network (Vinyals et al., 2015) with a standard softmax output layer for selecting symbols from an output vocabulary, similar to: - Jia and Liang, 2016 (https://arxiv.org/abs/1606.03622) - Gulcehre et al., 2016 (https://arxiv.org/abs/1603.08148) - Gu et al., 2016 (https://arxiv.org/abs/1603.06393) - See et al. 2017 (https://arxiv.org/abs/1704.04368) Args: encoder_output: Tensor representing encoder output of shape (batch size, input length, encoder dims). decoder_output: Tensor representing decoder output of shape (batch size, # decoded steps, decoder dims). output_vocab_embeddings_table: Embeddings for output vocabulary of shape (output_vocab_size, target embedding dims). output_vocab_size: Integer size of output_vocab_embeddings_table outer dim. model_config: ModelConfig proto. input_copy_mask: Mask of the input sequence for copying. use_gating_mechanism: Whether to use gating mechanism. Returns: Tensor of shape (batch_size, output_vocab_size + input length) representing unnormalized logits for both copy and generate actions. """ with tf.variable_scope("logits_transforms"): decoder_dims = decoder_output.get_shape()[-1] target_embedding_dims = model_config.model_parameters.target_embedding_dims # Dot product the decoder output with representations of each of the output # symbols to get a set of unnormalized logits for each output vocab item. # We need to tile the output vocab embeddings across the batch. output_vocab_transform = tf.expand_dims(output_vocab_embeddings_table, 0) batch_size = tf.shape(decoder_output)[0] output_vocab_transform = tf.tile(output_vocab_transform, [batch_size, 1, 1]) # Transform representations to the target_embedding_dims. if decoder_dims != target_embedding_dims: transformed_decoder_output = common_layers.linear_transform( decoder_output, target_embedding_dims, "decoder_transform") else: transformed_decoder_output = decoder_output generate_logits = tf.matmul(transformed_decoder_output, output_vocab_transform, transpose_b=True) generate_logits_bias = tf.get_variable("generate_logits_bias", shape=(output_vocab_size)) generate_logits += generate_logits_bias # Dot product the decoder output with representations from the encoder # output. # This is necessary vs. re-using the encoder-decoder attention weights # because those use multihead attention. # First, need to transform representations to the decoder dimensions. transformed_encoder_output = common_layers.linear_transform( encoder_output, decoder_dims, "encoder_transform") copy_logits = tf.matmul(decoder_output, transformed_encoder_output, transpose_b=True) # This contains scores representing the probability of copying from input # (3rd dim) to output (2nd dim). # Optionally apply a soft gating mechanism to determine whether # to select from copy or generate logits. # TODO(petershaw): Evaluate and improve this gating mechanism. # The current implementation is most likely not optimal, since it applies # a scalar in the range [0,1] prior to softmax. 
if use_gating_mechanism: prob_gen_unnormalized = common_layers.linear_transform( decoder_output, 1, "prob_gen") prob_gen_bias = tf.get_variable("prob_gen_bias", shape=(1)) prob_gen_unnormalized += prob_gen_bias prob_gen = tf.sigmoid(prob_gen_unnormalized) # Squeeze so that prob_gen has shape [batch_size, decode_length] prob_gen = tf.squeeze(prob_gen, axis=2) # These are the 'generate' logits so are scaled by P_gen. generate_logits *= tf.expand_dims(prob_gen, axis=-1) # These are the 'copy' logits so are scaled by 1 - P_gen. copy_logits *= tf.expand_dims(1 - prob_gen, axis=-1) if input_copy_mask is not None: copy_mask = (1 - tf.dtypes.cast( input_copy_mask, dtype=tf.dtypes.float32)) * LOGIT_MASK_VALUE copy_logits += tf.expand_dims(copy_mask, axis=1) # Concatenate logits into a single vector; first N (fixed) inputs are the # generation probabilities, and next are the copy probabilities for each # input (well, they aren't really probabilities, but scores.) extended_logits = tf.concat([generate_logits, copy_logits], axis=2) return extended_logits
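# Downstream, a single softmax over the concatenated vector lets generate and
# copy actions compete in one distribution (a usage sketch, not repo code):
#
#     action_probs = tf.nn.softmax(extended_logits, axis=-1)
#     action_probs[:, :, :output_vocab_size]   -> generate probabilities
#     action_probs[:, :, output_vocab_size:]   -> copy probability per input position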
def conv_capsule_mat(input_tensor, input_activation, input_dim, output_dim, layer_name, num_routing=3, num_in_atoms=3, num_out_atoms=3, stride=2, kernel_size=5, min_var=0.0005, final_beta=1.0): """Convolutional Capsule layer with Pose Matrices.""" print('caps conv stride: {}'.format(stride)) in_atom_sq = num_in_atoms * num_in_atoms with tf.variable_scope(layer_name): input_shape = tf.shape(input_tensor) _, _, _, in_height, in_width = input_tensor.get_shape() # This Variable will hold the state of the weights for the layer kernel = utils.weight_variable(shape=[ input_dim, kernel_size, kernel_size, num_in_atoms, output_dim * num_out_atoms ], stddev=0.3) # kernel = tf.clip_by_norm(kernel, 3.0, axes=[1, 2, 3]) activation_biases = utils.bias_variable( [1, 1, output_dim, 1, 1, 1, 1, 1], init_value=0.5, name='activation_biases') sigma_biases = utils.bias_variable([1, 1, output_dim, 1, 1, 1, 1, 1], init_value=.5, name='sigma_biases') with tf.name_scope('conv'): print('convi;') # input_tensor: [x,128,8, c1,c2] -> [x*128,8, c1,c2] print(input_tensor.get_shape()) input_tensor_reshaped = tf.reshape(input_tensor, [ input_shape[0] * input_dim * in_atom_sq, input_shape[3], input_shape[4], 1 ]) input_tensor_reshaped.set_shape((None, input_tensor.get_shape()[3], input_tensor.get_shape()[4], 1)) input_act_reshaped = tf.reshape(input_activation, [ input_shape[0] * input_dim, input_shape[3], input_shape[4], 1 ]) input_act_reshaped.set_shape((None, input_tensor.get_shape()[3], input_tensor.get_shape()[4], 1)) print(input_tensor_reshaped.get_shape()) # conv: [x*128,out*out_at, c3,c4] conv_patches = tf.extract_image_patches( images=input_tensor_reshaped, ksizes=[1, kernel_size, kernel_size, 1], strides=[1, stride, stride, 1], rates=[1, 1, 1, 1], padding='VALID', ) act_patches = tf.extract_image_patches( images=input_act_reshaped, ksizes=[1, kernel_size, kernel_size, 1], strides=[1, stride, stride, 1], rates=[1, 1, 1, 1], padding='VALID', ) o_height = (in_height - kernel_size) // stride + 1 o_width = (in_width - kernel_size) // stride + 1 patches = tf.reshape(conv_patches, (input_shape[0], input_dim, in_atom_sq, o_height, o_width, kernel_size, kernel_size)) patches.set_shape((None, input_dim, in_atom_sq, o_height, o_width, kernel_size, kernel_size)) patch_trans = tf.transpose(patches, [1, 5, 6, 0, 3, 4, 2]) patch_split = tf.reshape( patch_trans, (input_dim, kernel_size, kernel_size, input_shape[0] * o_height * o_width * num_in_atoms, num_in_atoms)) patch_split.set_shape( (input_dim, kernel_size, kernel_size, None, num_in_atoms)) a_patches = tf.reshape(act_patches, (input_shape[0], input_dim, 1, 1, o_height, o_width, kernel_size, kernel_size)) a_patches.set_shape((None, input_dim, 1, 1, o_height, o_width, kernel_size, kernel_size)) with tf.name_scope('input_act'): utils.activation_summary( tf.reduce_sum(tf.reduce_sum(tf.reduce_sum(a_patches, axis=1), axis=-1), axis=-1)) with tf.name_scope('Wx'): wx = tf.matmul(patch_split, kernel) wx = tf.reshape(wx, (input_dim, kernel_size, kernel_size, input_shape[0], o_height, o_width, num_in_atoms * num_out_atoms, output_dim)) wx.set_shape( (input_dim, kernel_size, kernel_size, None, o_height, o_width, num_in_atoms * num_out_atoms, output_dim)) wx = tf.transpose(wx, [3, 0, 7, 6, 4, 5, 1, 2]) utils.activation_summary(wx) with tf.name_scope('routing'): # Routing # logits: [x, 128, 10, c3, c4] logit_shape = [ input_dim, output_dim, 1, o_height, o_width, kernel_size, kernel_size ] activation, center = update_conv_routing( wx=wx, input_activation=a_patches, 
activation_biases=activation_biases, sigma_biases=sigma_biases, logit_shape=logit_shape, num_out_atoms=num_out_atoms * num_out_atoms, input_dim=input_dim, num_routing=num_routing, output_dim=output_dim, min_var=min_var, final_beta=final_beta, ) # activations: [x, 10, 8, c3, c4] out_activation = tf.squeeze(activation, axis=[1, 3, 6, 7]) out_center = tf.squeeze(center, axis=[1, 6, 7]) with tf.name_scope('center'): utils.activation_summary(out_center) return tf.sigmoid(out_activation), out_center