def normalize(inp, activation, reuse, scope):
    if FLAGS.norm == 'batch_norm':
        return tf_layers.batch_norm(inp, activation_fn=activation, reuse=reuse, scope=scope)
    elif FLAGS.norm == 'layer_norm':
        return tf_layers.layer_norm(inp, activation_fn=activation, reuse=reuse, scope=scope)
    elif FLAGS.norm == 'None':
        if activation is not None:
            return activation(inp)
        else:
            return inp
    else:
        # fail loudly on an unknown setting instead of silently returning None
        raise ValueError('invalid FLAGS.norm: %s' % FLAGS.norm)
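# Hedged usage sketch (not from the original source): how `normalize` is
# typically wired behind a plain convolution, mirroring the conv block later
# in this file. `FLAGS`, `tf`, and the weight tensors are assumed to be the
# same objects used above; the shapes are illustrative only.
def conv_block_demo(inp, cweight, bweight, activation=tf.nn.relu,
                    reuse=False, scope='demo_norm'):
    # convolution, then the dispatching normalizer defined above
    conv_output = tf.nn.conv2d(inp, cweight, [1, 1, 1, 1], 'SAME') + bweight
    return normalize(conv_output, activation, reuse, scope)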
def encoder_model(frames, sequence_length, initializer, keep_prob_dropout=0.9,
                  scope='encoder', fc_conv_layer=False):
    """
    Args:
        frames: 5D tensor of video batches - shape (batch_size, num_frames, frame_width, frame_height, num_channels)
        sequence_length: number of frames that shall be encoded
        initializer: specifies the initialization type (default: contrib.slim.layers uses Xavier init with uniform data)
        keep_prob_dropout: keep probability for dropout
        scope: tensorflow variable scope name
        fc_conv_layer: indicates whether a fully convolutional layer (8x8x16 -> 1x1x1024) shall be added at the end of the encoder
    Returns:
        z: latent variable sampled via the reparameterization trick
        mu: mean of the latent distribution
        sigma: standard deviation of the latent distribution
    """
    lstm_state1, lstm_state2, lstm_state3, lstm_state4, lstm_state5, lstm_state6 = None, None, None, None, None, None

    for i in range(sequence_length):
        frame = frames[:, i, :, :, :]
        reuse = (i > 0)
        with tf.variable_scope(scope, reuse=reuse):
            # LAYER 1: conv1
            conv1 = slim.layers.conv2d(frame, 32, [5, 5], stride=2, scope='conv1',
                                       normalizer_fn=tf_layers.layer_norm,
                                       weights_initializer=initializer,
                                       normalizer_params={'scope': 'layer_norm1'})
            conv1 = tf.nn.dropout(conv1, keep_prob_dropout)

            # LAYER 2: convLSTM1
            hidden1, lstm_state1 = basic_conv_lstm_cell(conv1, lstm_state1, 32, initializer,
                                                        filter_size=5, scope='convlstm1')
            hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')
            hidden1 = tf.nn.dropout(hidden1, keep_prob_dropout)

            # LAYER 3: conv2
            conv2 = slim.layers.conv2d(hidden1, hidden1.get_shape()[3], [5, 5], stride=2, scope='conv2',
                                       normalizer_fn=tf_layers.layer_norm,
                                       weights_initializer=initializer,
                                       normalizer_params={'scope': 'layer_norm3'})
            conv2 = tf.nn.dropout(conv2, keep_prob_dropout)

            # LAYER 4: convLSTM2
            hidden2, lstm_state2 = basic_conv_lstm_cell(conv2, lstm_state2, 32, initializer,
                                                        filter_size=5, scope='convlstm2')
            hidden2 = tf_layers.layer_norm(hidden2, scope='layer_norm4')
            hidden2 = tf.nn.dropout(hidden2, keep_prob_dropout)

            # LAYER 5: conv3
            conv3 = slim.layers.conv2d(hidden2, hidden2.get_shape()[3], [5, 5], stride=2, scope='conv3',
                                       normalizer_fn=tf_layers.layer_norm,
                                       weights_initializer=initializer,
                                       normalizer_params={'scope': 'layer_norm5'})
            conv3 = tf.nn.dropout(conv3, keep_prob_dropout)

            # LAYER 6: convLSTM3
            hidden3, lstm_state3 = basic_conv_lstm_cell(conv3, lstm_state3, 32, initializer,
                                                        filter_size=3, scope='convlstm3')
            hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm6')
            hidden3 = tf.nn.dropout(hidden3, keep_prob_dropout)

            # LAYER 7: conv4
            conv4 = slim.layers.conv2d(hidden3, hidden3.get_shape()[3], [3, 3], stride=2, scope='conv4',
                                       normalizer_fn=tf_layers.layer_norm,
                                       weights_initializer=initializer,
                                       normalizer_params={'scope': 'layer_norm7'})
            conv4 = tf.nn.dropout(conv4, keep_prob_dropout)

            # LAYER 8: convLSTM4 (8x8 feature map size)
            hidden4, lstm_state4 = basic_conv_lstm_cell(conv4, lstm_state4, 64, initializer,
                                                        filter_size=3, scope='convlstm4')
            hidden4 = tf_layers.layer_norm(hidden4, scope='layer_norm8')
            hidden4 = tf.nn.dropout(hidden4, keep_prob_dropout)

            # LAYER 9: conv5
            conv5 = slim.layers.conv2d(hidden4, hidden4.get_shape()[3], [3, 3], stride=2, scope='conv5',
                                       normalizer_fn=tf_layers.layer_norm,
                                       weights_initializer=initializer,
                                       normalizer_params={'scope': 'layer_norm9'})
            conv5 = tf.nn.dropout(conv5, keep_prob_dropout)

            # LAYER 10: convLSTM5 (4x4 feature map size)
            hidden5, lstm_state5 = basic_conv_lstm_cell(conv5, lstm_state5, 64, initializer,
                                                        filter_size=3, scope='convlstm5')
            hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm10')
            hidden5 = tf.nn.dropout(hidden5, keep_prob_dropout)

            # LAYER 11: fully convolutional layer (4x4x128 --> 1x1xFC_LAYER_SIZE)
            # necessary for dimension compatibility with the conv lstm cell
            fc_conv = slim.layers.conv2d(hidden5, FC_LAYER_SIZE, [4, 4], stride=1, scope='fc_conv',
                                         padding='VALID', weights_initializer=initializer)
            fc_conv = tf.nn.dropout(fc_conv, keep_prob_dropout)

            # LAYER 12: fully convolutional LSTM (1x1x256 -> 1x1x128)
            hidden6, lstm_state6 = basic_conv_lstm_cell(fc_conv, lstm_state6, FC_LSTM_LAYER_SIZE,
                                                        initializer, filter_size=1, scope='convlstm6')
            # no dropout since it's the last encoder layer --> hidden repr should be steady

            # mu and sigma for sampling the latent variable
            sigma = slim.layers.fully_connected(inputs=lstm_state6, num_outputs=VAE_REPR_SIZE,
                                                activation_fn=tf.nn.softplus)
            mu = slim.layers.fully_connected(inputs=lstm_state6, num_outputs=VAE_REPR_SIZE,
                                             activation_fn=None)

            # reparameterization trick: lets backprop flow through the deterministic nodes sigma and mu
            z = mu + sigma * tf.random_normal(tf.shape(mu), mean=0., stddev=1.)

    return z, mu, sigma
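# Hedged usage sketch (not from the original source): wiring the encoder into
# a VAE-style KL penalty. `frames_ph` and the loss name are illustrative
# assumptions; `encoder_model` is the function defined above, and the KL term
# is the standard closed form for a diagonal Gaussian against N(0, I).
def vae_latent_loss_demo(frames_ph, sequence_length, initializer):
    z, mu, sigma = encoder_model(frames_ph, sequence_length, initializer)
    # KL(N(mu, sigma^2) || N(0, 1)) = -0.5 * sum(1 + log sigma^2 - mu^2 - sigma^2)
    kl = -0.5 * tf.reduce_sum(1.0 + 2.0 * tf.log(sigma + 1e-8)
                              - tf.square(mu) - tf.square(sigma), axis=-1)
    return tf.reduce_mean(kl)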
def _norm(self, inp, scope=None):
    reuse = tf.get_variable_scope().reuse
    with vs.variable_scope(scope or "norm") as scope:
        normalized = layer_norm(inp, reuse=reuse, scope=scope)
    return normalized
def inference(images, scope='RNN'):
    # train network
    with tf.name_scope(scope, [images]):
        images = image_slicer(images)
        reuse = None

        #======================#
        # Scale 1: coarse level#
        #======================#

        #====== Output size: 96x128x64
        # Layer 1:
        conv1 = cnv.conv(images, 'conv1', [3, 3, 3, 64], stride=[1, 2, 2, 1], padding='SAME',
                         wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu1 = tf.nn.leaky_relu(conv1, alpha=0.1)
        relu1 = tf_layers.layer_norm(relu1, scope='layer_norm1', reuse=reuse)

        # Layer 2:
        conv2 = cnv.conv(relu1, 'conv2', [5, 5, 64, 64], stride=[1, 1, 1, 1], padding='SAME',
                         wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu2 = tf.nn.leaky_relu(conv2, alpha=0.1)
        relu2 = tf_layers.layer_norm(relu2, scope='layer_norm2', reuse=reuse)

        #====== Output size: 48x64x128
        # Layer 3:
        conv3 = cnv.conv(relu2, 'conv3', [3, 3, 64, 128], stride=[1, 2, 2, 1], padding='SAME',
                         wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu3 = tf.nn.leaky_relu(conv3, alpha=0.1)
        relu3 = tf_layers.layer_norm(relu3, scope='layer_norm3', reuse=reuse)

        # Layer 4:
        conv4 = cnv.conv(relu3, 'conv4', [5, 5, 128, 128], stride=[1, 1, 1, 1], padding='SAME',
                         wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu4 = tf.nn.leaky_relu(conv4, alpha=0.1)
        relu4 = tf_layers.layer_norm(relu4, scope='layer_norm4', reuse=reuse)

        #====== Output size: 24x32x256
        # Layer 5:
        conv5 = cnv.conv(relu4, 'conv5', [3, 3, 128, 256], stride=[1, 2, 2, 1], padding='SAME',
                         wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu5 = tf.nn.leaky_relu(conv5, alpha=0.1)
        relu5 = tf_layers.layer_norm(relu5, scope='layer_norm5', reuse=reuse)

        # Layer 6:
        conv6 = cnv.conv(relu5, 'conv6', [5, 5, 256, 256], stride=[1, 1, 1, 1], padding='SAME',
                         wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu6 = tf.nn.leaky_relu(conv6, alpha=0.1)
        relu6 = tf_layers.layer_norm(relu6, scope='layer_norm6', reuse=reuse)

        #====== Output size: 12x16x512
        # Layer 7:
        conv7 = cnv.conv(relu6, 'conv7', [3, 3, 256, 512], stride=[1, 2, 2, 1], padding='SAME',
                         wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu7 = tf.nn.leaky_relu(conv7, alpha=0.1)
        relu7 = tf_layers.layer_norm(relu7, scope='layer_norm7', reuse=reuse)

        # Layer 8:
        conv8 = cnv.conv(relu7, 'conv8', [5, 5, 512, 512], stride=[1, 1, 1, 1], padding='SAME',
                         wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu8 = tf.nn.leaky_relu(conv8, alpha=0.1)
        relu8 = tf_layers.layer_norm(relu8, scope='layer_norm8', reuse=reuse)

        #====== Output size: 6x8x512
        # Layer 9:
        conv9 = cnv.conv(relu8, 'conv9', [3, 3, 512, 512], stride=[1, 2, 2, 1], padding='SAME',
                         wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu9 = tf.nn.leaky_relu(conv9, alpha=0.1)
        relu9 = tf_layers.layer_norm(relu9, scope='layer_norm9', reuse=reuse)

        # upsampling
        #====== Output size: 12x16x512
        # Layer 10:
        conv10 = dcnv.deconv(relu9, [BATCH_SIZE, int(IMAGE_SIZE_H / 16), int(IMAGE_SIZE_W / 16), 512],
                             'd_conv10', [4, 4, 512, 512], stride=[1, 2, 2, 1], padding='SAME',
                             wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu10 = tf.nn.leaky_relu(conv10, alpha=0.1)
        relu10 = tf_layers.layer_norm(relu10, scope='layer_norm10', reuse=reuse)

        # Layer 11:
        conv11 = cnv.conv(relu10 + relu8, 'conv11', [5, 5, 512, 512], stride=[1, 1, 1, 1], padding='SAME',
                          wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu11 = tf.nn.leaky_relu(conv11, alpha=0.1)
        relu11 = tf_layers.layer_norm(relu11, scope='layer_norm11', reuse=reuse)

        #====== Output size: 24x32x256
        # Layer 12:
        conv12 = dcnv.deconv(relu11, [BATCH_SIZE, int(IMAGE_SIZE_H / 8), int(IMAGE_SIZE_W / 8), 256],
                             'd_conv12', [4, 4, 256, 512], stride=[1, 2, 2, 1], padding='SAME',
                             wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu12 = tf.nn.leaky_relu(conv12, alpha=0.1)
        relu12 = tf_layers.layer_norm(relu12, scope='layer_norm12', reuse=reuse)

        # Layer 13:
        conv13 = cnv.conv(relu12 + relu6, 'conv13', [5, 5, 256, 256], stride=[1, 1, 1, 1], padding='SAME',
                          wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu13 = tf.nn.leaky_relu(conv13, alpha=0.1)
        relu13 = tf_layers.layer_norm(relu13, scope='layer_norm13', reuse=reuse)

        #====== Output size: 48x64x256
        # Layer 14:
        conv14 = dcnv.deconv(relu13, [BATCH_SIZE, int(IMAGE_SIZE_H / 4), int(IMAGE_SIZE_W / 4), 128],
                             'd_conv14', [4, 4, 128, 256], stride=[1, 2, 2, 1], padding='SAME',
                             wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu14 = tf.nn.leaky_relu(conv14, alpha=0.1)
        relu14 = tf_layers.layer_norm(relu14, scope='layer_norm14', reuse=reuse)

        # Layer 15:
        conv15 = cnv.conv(relu14 + relu4, 'conv15', [3, 3, 128, 128], stride=[1, 1, 1, 1], padding='SAME',
                          wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu15 = tf.nn.leaky_relu(conv15, alpha=0.1)
        relu15 = tf_layers.layer_norm(relu15, scope='layer_norm15', reuse=reuse)

        #===================== output depth scale 1: 48x64x1 /4
        out_scale1 = cnv.conv(relu15, 'out_scale1', [3, 3, 128, 4], stride=[1, 1, 1, 1], padding='SAME',
                              wd=0, FLOAT16=FLOAT16, reuse=reuse)

        #======================#
        # Scale 2: middle level#
        #======================#

        #====== Output size: 96x128x64
        # Layer 17:
        conv17 = cnv.conv(images, 'conv17', [3, 3, 3, 64], stride=[1, 2, 2, 1], padding='SAME',
                          wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu17 = tf.nn.leaky_relu(conv17, alpha=0.1)
        relu17 = tf_layers.layer_norm(relu17, scope='layer_norm17', reuse=reuse)

        # Layer 18:
        conv18 = cnv.conv(relu17 + relu1, 'conv18', [5, 5, 64, 64], stride=[1, 1, 1, 1], padding='SAME',
                          wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu18 = tf.nn.leaky_relu(conv18, alpha=0.1)
        relu18 = tf_layers.layer_norm(relu18, scope='layer_norm18', reuse=reuse)

        # Layer 19: 48x64x128
        conv19 = cnv.conv(relu18, 'conv19', [3, 3, 64, 128], stride=[1, 2, 2, 1], padding='SAME',
                          wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu19 = tf.nn.leaky_relu(conv19, alpha=0.1)
        relu19 = tf_layers.layer_norm(relu19, scope='layer_norm19', reuse=reuse)

        # Layer 20:
        conv20 = cnv.conv(relu19 + relu3, 'conv20', [5, 5, 128, 128], stride=[1, 1, 1, 1], padding='SAME',
                          wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu20 = tf.nn.leaky_relu(conv20, alpha=0.1)
        relu20 = tf_layers.layer_norm(relu20, scope='layer_norm20', reuse=reuse)

        # concatenate feature map from coarse level
        concat1 = tf.concat([relu20, relu15, out_scale1], 3, name='concat1')

        # Layer 21:
        conv21 = cnv.conv(concat1, 'conv21', [9, 9, 260, 256], stride=[1, 2, 2, 1], padding='SAME',
                          wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu21 = tf.nn.leaky_relu(conv21, alpha=0.1)
        relu21 = tf_layers.layer_norm(relu21, scope='layer_norm21', reuse=reuse)

        # upsampling
        # Layer 22:
        conv22 = dcnv.deconv(relu21, [BATCH_SIZE, int(IMAGE_SIZE_H / 4), int(IMAGE_SIZE_W / 4), 128],
                             'd_conv22', [4, 4, 128, 256], stride=[1, 2, 2, 1], padding='SAME',
                             wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu22 = tf.nn.leaky_relu(conv22, alpha=0.1)
        relu22 = tf_layers.layer_norm(relu22, scope='layer_norm22', reuse=reuse)

        # Layer 23:
        conv23 = cnv.conv(relu22 + relu20, 'conv23', [5, 5, 128, 128], stride=[1, 1, 1, 1], padding='SAME',
                          wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu23 = tf.nn.leaky_relu(conv23, alpha=0.1)
        relu23 = tf_layers.layer_norm(relu23, scope='layer_norm23', reuse=reuse)

        # Layer 24:
        conv24 = dcnv.deconv(relu23, [BATCH_SIZE, int(IMAGE_SIZE_H / 2), int(IMAGE_SIZE_W / 2), 64],
                             'd_conv24', [4, 4, 64, 128], stride=[1, 2, 2, 1], padding='SAME',
                             wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu24 = tf.nn.leaky_relu(conv24, alpha=0.1)
        relu24 = tf_layers.layer_norm(relu24, scope='layer_norm24', reuse=reuse)

        # Layer 25:
        conv25 = cnv.conv(relu24, 'conv25', [5, 5, 64, 64], stride=[1, 1, 1, 1], padding='SAME',
                          wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu25 = tf.nn.leaky_relu(conv25, alpha=0.1)
        relu25 = tf_layers.layer_norm(relu25, scope='layer_norm25', reuse=reuse)

        #===================== output depth scale 2: 96x128x1 /4
        out_scale2 = cnv.conv(relu25, 'out_scale2', [3, 3, 64, 4], stride=[1, 1, 1, 1], padding='SAME',
                              wd=0, FLOAT16=FLOAT16, reuse=reuse)

        #======================#
        # Scale 3: fine level  #
        #======================#

        #====== Output size: 192x256x32
        # Layer 27:
        conv27 = cnv.conv(images, 'conv27', [3, 3, 3, 32], stride=[1, 1, 1, 1], padding='SAME',
                          wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu27 = tf.nn.leaky_relu(conv27, alpha=0.1)
        relu27 = tf_layers.layer_norm(relu27, scope='layer_norm27', reuse=reuse)

        # Layer 28:
        conv28 = cnv.conv(relu27, 'conv28', [3, 3, 32, 64], stride=[1, 2, 2, 1], padding='SAME',
                          wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu28 = tf.nn.leaky_relu(conv28, alpha=0.1)
        relu28 = tf_layers.layer_norm(relu28, scope='layer_norm28', reuse=reuse)

        # Layer 29:
        conv29 = cnv.conv(relu28 + relu17, 'conv29', [5, 5, 64, 64], stride=[1, 1, 1, 1], padding='SAME',
                          wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu29 = tf.nn.leaky_relu(conv29, alpha=0.1)
        relu29 = tf_layers.layer_norm(relu29, scope='layer_norm29', reuse=reuse)

        # concatenate feature map from middle level
        concat2 = tf.concat([relu29, relu25, out_scale2], 3, name='concat2')

        # Layer 30:
        conv30 = cnv.conv(concat2, 'conv30', [5, 5, 132, 128], stride=[1, 2, 2, 1], padding='SAME',
                          wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu30 = tf.nn.leaky_relu(conv30, alpha=0.1)
        relu30 = tf_layers.layer_norm(relu30, scope='layer_norm30', reuse=reuse)

        # Layer 31:
        conv31 = dcnv.deconv(relu30, [BATCH_SIZE, int(IMAGE_SIZE_H / 2), int(IMAGE_SIZE_W / 2), 64],
                             'd_conv31', [4, 4, 64, 128], stride=[1, 2, 2, 1], padding='SAME',
                             wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu31 = tf.nn.leaky_relu(conv31, alpha=0.1)
        relu31 = tf_layers.layer_norm(relu31, scope='layer_norm31', reuse=reuse)

        # Layer 32:
        conv32 = cnv.conv(relu31 + relu29, 'conv32', [5, 5, 64, 64], stride=[1, 1, 1, 1], padding='SAME',
                          wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu32 = tf.nn.leaky_relu(conv32, alpha=0.1)
        relu32 = tf_layers.layer_norm(relu32, scope='layer_norm32', reuse=reuse)

        # Layer 33:
        conv33 = dcnv.deconv(relu32, [BATCH_SIZE, int(IMAGE_SIZE_H), int(IMAGE_SIZE_W), 32],
                             'd_conv33', [4, 4, 32, 64], stride=[1, 2, 2, 1], padding='SAME',
                             wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu33 = tf.nn.leaky_relu(conv33, alpha=0.1)
        relu33 = tf_layers.layer_norm(relu33, scope='layer_norm33', reuse=reuse)

        # Layer 34:
        conv34 = cnv.conv(relu33 + relu27, 'conv34', [3, 3, 32, 32], stride=[1, 1, 1, 1], padding='SAME',
                          wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu34 = tf.nn.leaky_relu(conv34, alpha=0.1)
        relu34 = tf_layers.layer_norm(relu34, scope='layer_norm34', reuse=reuse)

        # Layer 35:
        conv35 = cnv.conv(relu34, 'conv35', [3, 3, 32, 32], stride=[1, 1, 1, 1], padding='SAME',
                          wd=WEIGHT_DECAY, FLOAT16=FLOAT16, reuse=reuse)
        relu35 = tf.nn.leaky_relu(conv35, alpha=0.1)
        relu35 = tf_layers.layer_norm(relu35, scope='layer_norm35', reuse=reuse)

        # Inference layer
        depth = cnv.conv(relu35, 'depth', [3, 3, 32, 1], wd=0, FLOAT16=FLOAT16, reuse=reuse)

        # split scale-1 output into depth and surface normals
        scale1_depth = out_scale1[:, :, :, 0]
        scale1_depth = tf.expand_dims(scale1_depth, 3)
        norm_x1 = out_scale1[:, :, :, 1]
        norm_y1 = out_scale1[:, :, :, 2]
        norm_z1 = out_scale1[:, :, :, 3]
        norm_x1 = tf.expand_dims(norm_x1, 3)
        norm_y1 = tf.expand_dims(norm_y1, 3)
        norm_z1 = tf.expand_dims(norm_z1, 3)
        scale1_normal = tf.concat([norm_x1, norm_y1, norm_z1], 3)
        tf.summary.image('depth_scale1:', scale1_depth)
        tf.summary.image('normal_scale1:', scale1_normal)

        # split scale-2 output into depth and surface normals
        scale2_depth = out_scale2[:, :, :, 0]
        scale2_depth = tf.expand_dims(scale2_depth, 3)
        norm_x2 = out_scale2[:, :, :, 1]
        norm_y2 = out_scale2[:, :, :, 2]
        norm_z2 = out_scale2[:, :, :, 3]
        norm_x2 = tf.expand_dims(norm_x2, 3)
        norm_y2 = tf.expand_dims(norm_y2, 3)
        norm_z2 = tf.expand_dims(norm_z2, 3)
        scale2_normal = tf.concat([norm_x2, norm_y2, norm_z2], 3)
        tf.summary.image('depth_scale2:', scale2_depth)
        tf.summary.image('normal_scale2:', scale2_normal)
        tf.summary.image('depth_scale3:', depth)

        return scale1_depth, scale2_depth, depth, scale1_normal, scale2_normal
def build(self):
    with slim.arg_scope([slim.layers.conv2d, slim.layers.fully_connected, tf_layers.layer_norm]):
        layer1 = tf_layers.layer_norm(self.vgg_layer(self.images), scope='conv1_norm')
        layer2 = tf_layers.layer_norm(slim.layers.conv2d(layer1, 32, [3, 3], stride=2, scope='conv2'),
                                      scope='conv2_norm')
        layer3 = tf_layers.layer_norm(slim.layers.conv2d(layer2, 32, [3, 3], stride=2, scope='conv3'),
                                      scope='conv3_norm')

        batch_size, num_rows, num_cols, num_fp = layer3.get_shape()
        num_rows, num_cols, num_fp = [int(x) for x in [num_rows, num_cols, num_fp]]

        # Fixed coordinate maps for the spatial soft-argmax.
        x_map = np.empty([num_rows, num_cols], np.float32)
        y_map = np.empty([num_rows, num_cols], np.float32)
        for i in range(num_rows):
            for j in range(num_cols):
                x_map[i, j] = (i - num_rows / 2.0) / num_rows
                y_map[i, j] = (j - num_cols / 2.0) / num_cols
        x_map = tf.convert_to_tensor(x_map)
        y_map = tf.convert_to_tensor(y_map)
        x_map = tf.reshape(x_map, [num_rows * num_cols])
        y_map = tf.reshape(y_map, [num_rows * num_cols])

        # Per-channel softmax over all spatial positions.
        features = tf.reshape(tf.transpose(layer3, [0, 3, 1, 2]), [-1, num_rows * num_cols])
        softmax = tf.nn.softmax(features)

        # Expected coordinates under the softmax distribution ("feature points").
        fp_x = tf.reduce_sum(tf.multiply(x_map, softmax), [1], keep_dims=True)
        fp_y = tf.reduce_sum(tf.multiply(y_map, softmax), [1], keep_dims=True)
        self.fp_y = fp_y
        self.fp_x = fp_x
        fp_flat = tf.reshape(tf.concat([fp_x, fp_y], 1), [-1, num_fp * 2])

        self.predicted_eeps = slim.layers.fully_connected(
            fp_flat, 3, scope='predicted_eeps', activation_fn=None)  # dim of eeps: 3

        conv_out = tf.concat(
            [fp_flat,
             self.robot_configs,  # dim of angles: 7, dim of eeps: 3
             self.predicted_eeps], 1)

        fc_layer1 = slim.layers.fully_connected(conv_out, 100, scope='fc1')
        self.predicted_actions = slim.layers.fully_connected(
            fc_layer1, 7, scope='predicted_actions', activation_fn=None)  # dim of velocities: 7
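# Hedged sketch (not from the original source): the spatial soft-argmax that
# `build` computes above, in plain numpy for clarity. The function name and
# shapes are illustrative. Each channel is softmaxed over all pixels, then
# the expected (x, y) coordinate under that distribution is the feature point.
import numpy as np

def spatial_soft_argmax_demo(feature_maps):
    # feature_maps: (num_rows, num_cols, num_fp)
    num_rows, num_cols, num_fp = feature_maps.shape
    x_map = (np.arange(num_rows)[:, None] - num_rows / 2.0) / num_rows
    y_map = (np.arange(num_cols)[None, :] - num_cols / 2.0) / num_cols
    x_map = np.broadcast_to(x_map, (num_rows, num_cols)).reshape(-1)
    y_map = np.broadcast_to(y_map, (num_rows, num_cols)).reshape(-1)
    # flatten spatial dims per channel, softmax with max-subtraction for stability
    feats = feature_maps.transpose(2, 0, 1).reshape(num_fp, -1)
    feats = feats - feats.max(axis=1, keepdims=True)
    softmax = np.exp(feats) / np.exp(feats).sum(axis=1, keepdims=True)
    fp_x = (softmax * x_map).sum(axis=1)
    fp_y = (softmax * y_map).sum(axis=1)
    return fp_x, fp_y  # expected image coordinates per feature channel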
def _norm(self, inp, scope=None):
    reuse = tf.get_variable_scope().reuse
    normalized = layer_norm(inp, reuse=reuse, scope=scope)
    return normalized
def encoder_model(frames, sequence_length, initializer, scope='encoder', fc_conv_layer=False):
    """
    Args:
        frames: 5D tensor of video batches - shape (batch_size, num_frames, frame_width, frame_height, num_channels)
        sequence_length: number of frames that shall be encoded
        scope: tensorflow variable scope name
        initializer: specifies the initialization type (default: contrib.slim.layers uses Xavier init with uniform data)
        fc_conv_layer: indicates whether a fully convolutional layer (8x8x16 -> 1x1x1024) shall be added at the end of the encoder
    Returns:
        hidden_repr: hidden state of the final ConvLSTM layer
    """
    lstm_state1, lstm_state2, lstm_state3, lstm_state4, lstm_state5, lstm_state6 = None, None, None, None, None, None

    for i in range(sequence_length):
        frame = frames[:, i, :, :, :]
        reuse = (i > 0)
        with tf.variable_scope(scope, reuse=reuse):
            # LAYER 1: conv1
            conv1 = slim.layers.conv2d(frame, 16, [5, 5], stride=2, scope='conv1',
                                       normalizer_fn=tf_layers.layer_norm,
                                       weights_initializer=initializer,
                                       normalizer_params={'scope': 'layer_norm1'})

            # LAYER 2: convLSTM1
            hidden1, lstm_state1 = basic_conv_lstm_cell(conv1, lstm_state1, 16, initializer,
                                                        filter_size=5, scope='convlstm1')
            hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')

            # LAYER 3: conv2
            conv2 = slim.layers.conv2d(hidden1, hidden1.get_shape()[3], [5, 5], stride=2, scope='conv2',
                                       normalizer_fn=tf_layers.layer_norm,
                                       weights_initializer=initializer,
                                       normalizer_params={'scope': 'layer_norm3'})

            # LAYER 4: convLSTM2
            hidden2, lstm_state2 = basic_conv_lstm_cell(conv2, lstm_state2, 16, initializer,
                                                        filter_size=5, scope='convlstm2')
            hidden2 = tf_layers.layer_norm(hidden2, scope='layer_norm4')

            # LAYER 5: conv3
            conv3 = slim.layers.conv2d(hidden2, hidden2.get_shape()[3], [5, 5], stride=2, scope='conv3',
                                       normalizer_fn=tf_layers.layer_norm,
                                       weights_initializer=initializer,
                                       normalizer_params={'scope': 'layer_norm5'})

            # LAYER 6: convLSTM3
            hidden3, lstm_state3 = basic_conv_lstm_cell(conv3, lstm_state3, 16, initializer,
                                                        filter_size=3, scope='convlstm3')
            hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm6')

            # LAYER 7: conv4
            conv4 = slim.layers.conv2d(hidden3, hidden3.get_shape()[3], [3, 3], stride=2, scope='conv4',
                                       normalizer_fn=tf_layers.layer_norm,
                                       weights_initializer=initializer,
                                       normalizer_params={'scope': 'layer_norm7'})

            # LAYER 8: convLSTM4 (8x8 feature map size)
            hidden4, lstm_state4 = basic_conv_lstm_cell(conv4, lstm_state4, 32, initializer,
                                                        filter_size=3, scope='convlstm4')
            hidden4 = tf_layers.layer_norm(hidden4, scope='layer_norm8')

            # LAYER 9: conv5
            conv5 = slim.layers.conv2d(hidden4, hidden4.get_shape()[3], [3, 3], stride=2, scope='conv5',
                                       normalizer_fn=tf_layers.layer_norm,
                                       weights_initializer=initializer,
                                       normalizer_params={'scope': 'layer_norm9'})

            # LAYER 10: convLSTM5 (4x4 feature map size)
            hidden5, lstm_state5 = basic_conv_lstm_cell(conv5, lstm_state5, 32, initializer,
                                                        filter_size=3, scope='convlstm5')
            hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm10')

            # LAYER 11: fully convolutional layer (4x4x32 --> 1x1xFC_LAYER_SIZE)
            fc_conv = slim.layers.conv2d(hidden5, FC_LAYER_SIZE, [4, 4], stride=1, scope='fc_conv',
                                         padding='VALID', weights_initializer=initializer)

            # LAYER 12: fully convolutional LSTM (1x1x256 -> 1x1x128)
            hidden6, lstm_state6 = basic_conv_lstm_cell(fc_conv, lstm_state6, FC_LSTM_LAYER_SIZE,
                                                        initializer, filter_size=1, scope='convlstm6')

    hidden_repr = hidden6
    return hidden_repr
def construct_model(images,
                    actions=None,
                    states=None,
                    iter_num=-1.0,
                    k=-1,
                    num_masks=10,
                    context_frames=2,
                    pix_distributions=None,
                    conf=None):
    if 'dna_size' in conf:
        DNA_KERN_SIZE = conf['dna_size']
    else:
        DNA_KERN_SIZE = 5

    print('constructing sawyer network')
    batch_size, img_height, img_width, color_channels = images[0].get_shape()[0:4]
    lstm_func = basic_conv_lstm_cell

    # Generated robot states and images.
    gen_states, gen_images, gen_masks = [], [], []
    if states is not None:
        current_state = states[0]
    else:
        current_state = None
    if actions is None:
        actions = [None for _ in images]

    gen_pix_distrib = []
    summaries = []

    if k == -1:
        feedself = True
    else:
        # Scheduled sampling:
        # Calculate number of ground-truth frames to pass in.
        num_ground_truth = tf.to_int32(
            tf.round(tf.to_float(batch_size) * (k / (k + tf.exp(iter_num / k)))))
        feedself = False

    # LSTM state sizes and states.
    if 'lstm_size' in conf:
        lstm_size = conf['lstm_size']
    else:
        lstm_size = np.int32(np.array([16, 16, 32, 32, 64, 32, 16]))
    lstm_state1, lstm_state2, lstm_state3, lstm_state4 = None, None, None, None
    lstm_state5, lstm_state6, lstm_state7 = None, None, None

    t = -1
    for image, action in zip(images[:-1], actions[:-1]):
        t += 1
        # Reuse variables after the first timestep.
        reuse = bool(gen_images)
        done_warm_start = len(gen_images) > context_frames - 1

        with slim.arg_scope([lstm_func, slim.layers.conv2d,
                             slim.layers.fully_connected, tf_layers.layer_norm,
                             slim.layers.conv2d_transpose], reuse=reuse):
            if feedself and done_warm_start:
                # Feed in generated image.
                prev_image = gen_images[-1]  # 64x64x6
                if pix_distributions is not None:
                    prev_pix_distrib = gen_pix_distrib[-1]
            elif done_warm_start:
                # Scheduled sampling
                prev_image = scheduled_sample(image, gen_images[-1], batch_size, num_ground_truth)
            else:
                # Always feed in ground_truth
                prev_image = image
                if pix_distributions is not None:
                    prev_pix_distrib = pix_distributions[t]
                    prev_pix_distrib = tf.expand_dims(prev_pix_distrib, -1)

            if 'transform_from_firstimage' in conf:
                assert conf['model'] == 'STP'
                if t > 1:
                    prev_image = images[1]
                    print('using image 1')

            # Predicted state is always fed back in
            if 'ignore_state_action' not in conf:
                state_action = tf.concat(axis=1, values=[action, current_state])

            enc0 = slim.layers.conv2d(  # 32x32x32
                prev_image, 32, [5, 5], stride=2, scope='scale1_conv1',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm1'})

            hidden1, lstm_state1 = lstm_func(  # 32x32x16
                enc0, lstm_state1, lstm_size[0], scope='state1')
            hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')

            enc1 = slim.layers.conv2d(  # 16x16x16
                hidden1, hidden1.get_shape()[3], [3, 3], stride=2, scope='conv2')

            hidden3, lstm_state3 = lstm_func(  # 16x16x32
                enc1, lstm_state3, lstm_size[2], scope='state3')
            hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4')

            enc2 = slim.layers.conv2d(  # 8x8x32
                hidden3, hidden3.get_shape()[3], [3, 3], stride=2, scope='conv3')

            if 'ignore_state_action' not in conf:
                # Pass in state and action.
                if 'ignore_state' in conf:
                    lowdim = action
                    print('ignoring state')
                else:
                    lowdim = state_action
                smear = tf.reshape(lowdim, [int(batch_size), 1, 1, int(lowdim.get_shape()[1])])
                smear = tf.tile(smear, [1, int(enc2.get_shape()[1]), int(enc2.get_shape()[2]), 1])
                enc2 = tf.concat(axis=3, values=[enc2, smear])
            else:
                print('ignoring states and actions')

            enc3 = slim.layers.conv2d(  # 8x8x32
                enc2, hidden3.get_shape()[3], [1, 1], stride=1, scope='conv4')

            hidden5, lstm_state5 = lstm_func(  # 8x8x64
                enc3, lstm_state5, lstm_size[4], scope='state5')
            hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm6')

            enc4 = slim.layers.conv2d_transpose(  # 16x16x64
                hidden5, hidden5.get_shape()[3], 3, stride=2, scope='convt1')

            hidden6, lstm_state6 = lstm_func(  # 16x16x32
                enc4, lstm_state6, lstm_size[5], scope='state6')
            hidden6 = tf_layers.layer_norm(hidden6, scope='layer_norm7')
            if 'noskip' not in conf:
                # Skip connection.
                hidden6 = tf.concat(axis=3, values=[hidden6, enc1])  # both 16x16

            enc5 = slim.layers.conv2d_transpose(  # 32x32x32
                hidden6, hidden6.get_shape()[3], 3, stride=2, scope='convt2')

            hidden7, lstm_state7 = lstm_func(  # 32x32x16
                enc5, lstm_state7, lstm_size[6], scope='state7')
            hidden7 = tf_layers.layer_norm(hidden7, scope='layer_norm8')
            if 'noskip' not in conf:
                # Skip connection.
                hidden7 = tf.concat(axis=3, values=[hidden7, enc0])  # both 32x32

            enc6 = slim.layers.conv2d_transpose(  # 64x64x16
                hidden7, hidden7.get_shape()[3], 3, stride=2, scope='convt3',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm9'})

            if 'single_view' not in conf:
                prev_image_cam1 = tf.slice(prev_image, [0, 0, 0, 0], [-1, -1, -1, 3])
                prev_image_cam2 = tf.slice(prev_image, [0, 0, 0, 3], [-1, -1, -1, 3])

            if conf['model'] == 'DNA':
                # Using largest hidden state for predicting untied conv kernels.
                trafo_input_cam1 = slim.layers.conv2d_transpose(
                    enc6, DNA_KERN_SIZE**2, 1, stride=1, scope='convt4_cam1')
                trafo_input_cam2 = slim.layers.conv2d_transpose(
                    enc6, DNA_KERN_SIZE**2, 1, stride=1, scope='convt4_cam2')
                if 'single_view' not in conf:
                    transformed_cam1 = [dna_transformation(prev_image_cam1, trafo_input_cam1,
                                                           conf['dna_size'])]
                    transformed_cam2 = [dna_transformation(prev_image_cam2, trafo_input_cam2,
                                                           conf['dna_size'])]
                else:
                    transformed_cam2 = [dna_transformation(prev_image, trafo_input_cam2,
                                                           conf['dna_size'])]

            if conf['model'] == 'STP':
                stp_input0 = tf.reshape(hidden5, [int(batch_size), -1])
                stp_input1_cam1 = slim.layers.fully_connected(
                    stp_input0, 100 * conf['numcam'], scope='fc_stp_cam1')
                stp_input1_cam2 = slim.layers.fully_connected(
                    stp_input0, 100 * conf['numcam'], scope='fc_stp_cam2')
                # disabling capability to generate pixels
                reuse_stp = None
                if reuse:
                    reuse_stp = reuse
                if 'single_view' not in conf:
                    transformed_cam1 = stp_transformation(prev_image_cam1, stp_input1_cam1,
                                                          num_masks, reuse_stp, suffix='cam1')
                    transformed_cam2 = stp_transformation(prev_image_cam2, stp_input1_cam2,
                                                          num_masks, reuse_stp, suffix='cam2')
                # transformed += stp_transformation(prev_image, stp_input1, num_masks)
                if pix_distributions is not None:
                    # assumption: the cam2 STP parameters are meant here
                    transf_distrib = stp_transformation(prev_pix_distrib, stp_input1_cam2,
                                                        num_masks, reuse=True)

            masks_cam1 = slim.layers.conv2d_transpose(enc6, (num_masks + 1), 1, stride=1,
                                                      scope='convt7_cam1')
            masks_cam2 = slim.layers.conv2d_transpose(enc6, (num_masks + 1), 1, stride=1,
                                                      scope='convt7_cam2')

            if 'single_view' not in conf:
                output_cam1, mask_list_cam1 = fuse_trafos(conf, masks_cam1,
                                                          prev_image_cam1, transformed_cam1)
                output_cam2, mask_list_cam2 = fuse_trafos(conf, masks_cam2,
                                                          prev_image_cam2, transformed_cam2)
                output = tf.concat(axis=3, values=[output_cam1, output_cam2])
            else:
                output, mask_list_cam2 = fuse_trafos(conf, masks_cam2, prev_image,
                                                     transformed_cam2)
            gen_images.append(output)
            gen_masks.append(mask_list_cam2)

            if conf['model'] == 'DNA' and pix_distributions is not None:
                # assumption: the cam2 kernel input computed above is meant here
                transf_distrib = [dna_transformation(prev_pix_distrib, trafo_input_cam2,
                                                     DNA_KERN_SIZE)]

            if pix_distributions is not None:
                # assumption: the cam2 mask list is meant here
                pix_distrib_output = mask_list_cam2[0] * prev_pix_distrib
                mult_list = []
                for i in range(num_masks):
                    mult_list.append(transf_distrib[i] * mask_list_cam2[i + 1])
                    pix_distrib_output += mult_list[i]
                gen_pix_distrib.append(pix_distrib_output)

            if current_state is not None:
                current_state = slim.layers.fully_connected(
                    state_action,
                    int(current_state.get_shape()[1]),
                    scope='state_pred', activation_fn=None)
            gen_states.append(current_state)

    if pix_distributions is not None:
        return gen_images, gen_states, gen_masks, gen_pix_distrib
    else:
        return gen_images, gen_states, gen_masks, None
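# Hedged sketch (not from the original source): the scheduled-sampling
# schedule used in construct_model above, in plain numpy. With constant k,
# the expected fraction of ground-truth frames decays as an inverse sigmoid
# in the iteration number (Bengio et al., 2015-style decay). The function
# name is illustrative.
import numpy as np

def num_ground_truth_demo(batch_size, k, iter_num):
    frac = k / (k + np.exp(iter_num / k))
    return int(round(batch_size * frac))

# e.g. with k=900, roughly half of the batch is still ground truth at
# iter_num = k * ln(k), i.e. around 6.1k iterations.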
def build_network_core(self, action, current_state, input_image):
    lstm_func = basic_conv_lstm_cell

    with slim.arg_scope([lstm_func, slim.layers.conv2d,
                         slim.layers.fully_connected, tf_layers.layer_norm,
                         slim.layers.conv2d_transpose], reuse=self.reuse):
        enc0 = slim.layers.conv2d(  # 32x32x32
            input_image, 32, [5, 5], stride=2, scope='scale1_conv1',
            normalizer_fn=tf_layers.layer_norm,
            normalizer_params={'scope': 'layer_norm1'})

        hidden1, self.lstm_state1 = self.lstm_func(  # 32x32x16
            enc0, self.lstm_state1, self.lstm_size[0], scope='state1')
        hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')

        enc1 = slim.layers.conv2d(  # 16x16x16
            hidden1, hidden1.get_shape()[3], [3, 3], stride=2, scope='conv2')

        hidden3, self.lstm_state3 = self.lstm_func(  # 16x16x32
            enc1, self.lstm_state3, self.lstm_size[1], scope='state3')
        hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4')

        enc2 = slim.layers.conv2d(  # 8x8x32
            hidden3, hidden3.get_shape()[3], [3, 3], stride=2, scope='conv3')

        if 'ignore_state_action' not in self.conf:
            # Pass in state and action.
            state_action = tf.concat(axis=1, values=[action, current_state])
            smear = tf.reshape(state_action,
                               [int(self.batch_size), 1, 1, int(state_action.get_shape()[1])])
            smear = tf.tile(smear,
                            [1, int(enc2.get_shape()[1]), int(enc2.get_shape()[2]), 1])
            enc2 = tf.concat(axis=3, values=[enc2, smear])
        else:
            print('ignoring states and actions')

        enc3 = slim.layers.conv2d(  # 8x8x32
            enc2, hidden3.get_shape()[3], [1, 1], stride=1, scope='conv4')

        hidden5, self.lstm_state5 = self.lstm_func(  # 8x8x64
            enc3, self.lstm_state5, self.lstm_size[2], scope='state5')
        hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm6')

        enc4 = slim.layers.conv2d_transpose(  # 16x16x64
            hidden5, hidden5.get_shape()[3], 3, stride=2, scope='convt1')

        hidden6, self.lstm_state6 = self.lstm_func(  # 16x16x32
            enc4, self.lstm_state6, self.lstm_size[3], scope='state6')
        hidden6 = tf_layers.layer_norm(hidden6, scope='layer_norm7')
        if 'noskip' not in self.conf:
            # Skip connection.
            hidden6 = tf.concat(axis=3, values=[hidden6, enc1])  # both 16x16

        enc5 = slim.layers.conv2d_transpose(  # 32x32x32
            hidden6, hidden6.get_shape()[3], 3, stride=2, scope='convt2')

        hidden7, self.lstm_state7 = self.lstm_func(  # 32x32x16
            enc5, self.lstm_state7, self.lstm_size[4], scope='state7')
        hidden7 = tf_layers.layer_norm(hidden7, scope='layer_norm8')
        if 'noskip' not in self.conf:
            # Skip connection.
            hidden7 = tf.concat(axis=3, values=[hidden7, enc0])  # both 32x32

        enc6 = slim.layers.conv2d_transpose(  # 64x64x16
            hidden7, hidden7.get_shape()[3], 3, stride=2, scope='convt3',
            normalizer_fn=tf_layers.layer_norm,
            normalizer_params={'scope': 'layer_norm9'})

        if current_state is not None:
            current_state = slim.layers.fully_connected(
                state_action,
                int(current_state.get_shape()[1]),
                scope='state_pred', activation_fn=None)
        self.gen_states.append(current_state)

        self.apply_trafo_predict(enc6, hidden5)

        return current_state
def build(self):
    if 'kern_size' in self.conf:
        KERN_SIZE = self.conf['kern_size']
    else:
        KERN_SIZE = 5

    batch_size, img_height, img_width, color_channels = self.images[0].get_shape()[0:4]
    lstm_func = basic_conv_lstm_cell

    if self.states is not None:
        current_state = self.states[0]
    else:
        current_state = None

    if self.actions is None:
        self.actions = [None for _ in self.images]

    if self.k == -1:
        feedself = True
    else:
        # Scheduled sampling:
        # Calculate number of ground-truth frames to pass in.
        num_ground_truth = tf.to_int32(
            tf.round(tf.to_float(batch_size)
                     * (self.k / (self.k + tf.exp(self.iter_num / self.k)))))
        feedself = False

    # LSTM state sizes and states.
    if 'lstm_size' in self.conf:
        lstm_size = self.conf['lstm_size']
        print('using lstm size', lstm_size)
    else:
        lstm_size = np.int32(np.array([16, 32, 64, 32, 16]))

    lstm_state1, lstm_state2, lstm_state3, lstm_state4 = None, None, None, None
    lstm_state5, lstm_state6, lstm_state7 = None, None, None

    t = -1
    for image, action in zip(self.images[:-1], self.actions[:-1]):
        t += 1
        print(t)
        # Reuse variables after the first timestep.
        reuse = bool(self.gen_images)
        done_warm_start = len(self.gen_images) > self.context_frames - 1

        with slim.arg_scope([lstm_func, slim.layers.conv2d,
                             slim.layers.fully_connected, tf_layers.layer_norm,
                             slim.layers.conv2d_transpose], reuse=reuse):
            if feedself and done_warm_start:
                # Feed in generated image.
                prev_image = self.gen_images[-1]  # 64x64x6
                if self.pix_distributions1 is not None:
                    prev_pix_distrib1 = self.gen_distrib1[-1]
                    if 'ndesig' in self.conf:
                        prev_pix_distrib2 = self.gen_distrib2[-1]
            elif done_warm_start:
                # Scheduled sampling
                prev_image = scheduled_sample(image, self.gen_images[-1],
                                              batch_size, num_ground_truth)
            else:
                # Always feed in ground_truth
                prev_image = image
                if self.pix_distributions1 is not None:
                    prev_pix_distrib1 = self.pix_distributions1[t]
                    if 'ndesig' in self.conf:
                        prev_pix_distrib2 = self.pix_distributions2[t]
                    if len(prev_pix_distrib1.get_shape()) == 3:
                        prev_pix_distrib1 = tf.expand_dims(prev_pix_distrib1, -1)
                        if 'ndesig' in self.conf:
                            prev_pix_distrib2 = tf.expand_dims(prev_pix_distrib2, -1)

            if 'refeed_firstimage' in self.conf:
                assert self.conf['model'] == 'STP'
                if t > 1:
                    input_image = self.images[1]
                    print('refeed with image 1')
                else:
                    input_image = prev_image
            else:
                input_image = prev_image

            # Predicted state is always fed back in
            if 'ignore_state_action' not in self.conf:
                state_action = tf.concat(axis=1, values=[action, current_state])

            enc0 = slim.layers.conv2d(  # 32x32x32
                input_image, 32, [5, 5], stride=2, scope='scale1_conv1',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm1'})

            hidden1, lstm_state1 = lstm_func(  # 32x32x16
                enc0, lstm_state1, lstm_size[0], scope='state1')
            hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')

            enc1 = slim.layers.conv2d(  # 16x16x16
                hidden1, hidden1.get_shape()[3], [3, 3], stride=2, scope='conv2')

            hidden3, lstm_state3 = lstm_func(  # 16x16x32
                enc1, lstm_state3, lstm_size[1], scope='state3')
            hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4')

            enc2 = slim.layers.conv2d(  # 8x8x32
                hidden3, hidden3.get_shape()[3], [3, 3], stride=2, scope='conv3')

            if 'ignore_state_action' not in self.conf:
                # Pass in state and action.
                if 'ignore_state' in self.conf:
                    lowdim = action
                    print('ignoring state')
                else:
                    lowdim = state_action
                smear = tf.reshape(lowdim, [int(batch_size), 1, 1, int(lowdim.get_shape()[1])])
                smear = tf.tile(smear,
                                [1, int(enc2.get_shape()[1]), int(enc2.get_shape()[2]), 1])
                enc2 = tf.concat(axis=3, values=[enc2, smear])
            else:
                print('ignoring states and actions')

            enc3 = slim.layers.conv2d(  # 8x8x32
                enc2, hidden3.get_shape()[3], [1, 1], stride=1, scope='conv4')

            hidden5, lstm_state5 = lstm_func(  # 8x8x64
                enc3, lstm_state5, lstm_size[2], scope='state5')
            hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm6')

            enc4 = slim.layers.conv2d_transpose(  # 16x16x64
                hidden5, hidden5.get_shape()[3], 3, stride=2, scope='convt1')

            hidden6, lstm_state6 = lstm_func(  # 16x16x32
                enc4, lstm_state6, lstm_size[3], scope='state6')
            hidden6 = tf_layers.layer_norm(hidden6, scope='layer_norm7')
            if 'noskip' not in self.conf:
                # Skip connection.
                hidden6 = tf.concat(axis=3, values=[hidden6, enc1])  # both 16x16

            enc5 = slim.layers.conv2d_transpose(  # 32x32x32
                hidden6, hidden6.get_shape()[3], 3, stride=2, scope='convt2')

            hidden7, lstm_state7 = lstm_func(  # 32x32x16
                enc5, lstm_state7, lstm_size[4], scope='state7')
            hidden7 = tf_layers.layer_norm(hidden7, scope='layer_norm8')
            if 'noskip' not in self.conf:
                # Skip connection.
                hidden7 = tf.concat(axis=3, values=[hidden7, enc0])  # both 32x32

            enc6 = slim.layers.conv2d_transpose(  # 64x64x16
                hidden7, hidden7.get_shape()[3], 3, stride=2, scope='convt3',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm9'})

            if 'transform_from_firstimage' in self.conf:
                prev_image = self.images[1]
                if self.pix_distributions1 is not None:
                    prev_pix_distrib1 = self.pix_distributions1[1]
                    prev_pix_distrib1 = tf.expand_dims(prev_pix_distrib1, -1)
                print('transform from image 1')

            if self.conf['model'] == 'DNA':
                # Using largest hidden state for predicting untied conv kernels.
                if 'separable_filters' in self.conf:
                    num_filters = KERN_SIZE * 2
                else:
                    num_filters = KERN_SIZE**2
                trafo_input = slim.layers.conv2d_transpose(
                    enc6, num_filters, 1, stride=1, scope='convt4_cam2')
                transformed_l, _ = self.dna_transformation(self.conf, prev_image, trafo_input)
                if self.pix_distributions1 is not None:
                    transf_distrib_ndesig1, _ = self.dna_transformation(
                        self.conf, prev_pix_distrib1, trafo_input)
                    if 'ndesig' in self.conf:
                        transf_distrib_ndesig2, _ = self.dna_transformation(
                            self.conf, prev_pix_distrib2, trafo_input)
                extra_masks = 1

            if self.conf['model'] == 'CDNA':
                if 'gen_pix' in self.conf:
                    enc7 = slim.layers.conv2d_transpose(
                        enc6, color_channels, 1, stride=1, scope='convt4')
                    transformed_l = [tf.nn.sigmoid(enc7)]
                    extra_masks = 2
                else:
                    transformed_l = []
                    extra_masks = 1
                if 'mov_bckgd' in self.conf:
                    extra_masks = self.num_masks

                cdna_input = tf.reshape(hidden5, [int(batch_size), -1])
                new_transformed, cdna_kerns = self.cdna_transformation(
                    self.conf, prev_image, cdna_input, reuse_sc=reuse)
                transformed_l += new_transformed
                self.moved_images.append(transformed_l)

                ## if moving the background is chosen:
                if 'mov_bckgd' in self.conf:
                    cdna_input = tf.reshape(hidden5, [int(self.batch_size), -1])
                    bckgd_transformed, _ = self.cdna_transformation(
                        self.conf, self.images[0], cdna_input,
                        reuse_sc=reuse, scope='bckgd_trafo')
                    self.moved_bckgd.append(bckgd_transformed)

                if self.pix_distributions1 is not None:
                    transf_distrib_ndesig1, _ = self.cdna_transformation(
                        self.conf, prev_pix_distrib1, cdna_input, reuse_sc=True)
                    self.moved_pix_distrib1.append(transf_distrib_ndesig1)
                    if 'mov_bckgd' in self.conf:
                        bcgkd_distrib = tf.reshape(self.pix_distributions1[0],
                                                   (self.batch_size, 64, 64, 1))
                        transf_distrib_bckgd, _ = self.cdna_transformation(
                            self.conf, bcgkd_distrib, cdna_input,
                            reuse_sc=True, scope='bckgd_trafo')
                    if 'ndesig' in self.conf:
                        transf_distrib_ndesig2, _ = self.cdna_transformation(
                            self.conf, prev_pix_distrib2, cdna_input, reuse_sc=True)
                        self.moved_pix_distrib2.append(transf_distrib_ndesig2)

            if '1stimg_bckgd' in self.conf:
                background = self.images[0]
                print('using background from first image..')
            else:
                background = prev_image

            if 'mov_bckgd' in self.conf:
                output, mask_list, moved_parts = self.fuse_trafos_movbckgd(
                    enc6, bckgd_transformed, transformed_l, scope='convt7_cam2',
                    extra_masks=extra_masks, reuse=reuse)
                self.movd_parts_list.append(moved_parts)
            else:
                output, mask_list = self.fuse_trafos(
                    enc6, background, transformed_l, scope='convt7_cam2',
                    extra_masks=extra_masks)
            self.gen_images.append(output)
            self.gen_masks.append(mask_list)

            if self.pix_distributions1 is not None:
                if 'mov_bckgd' in self.conf:
                    pix_distrib_output = self.fuse_pix_movebckgd(
                        mask_list, transf_distrib_ndesig1, transf_distrib_bckgd)
                else:
                    pix_distrib_output = self.fuse_pix_distrib(
                        extra_masks, mask_list, self.pix_distributions1,
                        prev_pix_distrib1, transf_distrib_ndesig1)
                self.gen_distrib1.append(pix_distrib_output)
                if 'ndesig' in self.conf:
                    pix_distrib_output = self.fuse_pix_distrib(
                        extra_masks, mask_list, self.pix_distributions2,
                        prev_pix_distrib2, transf_distrib_ndesig2)
                    self.gen_distrib2.append(pix_distrib_output)

            if 'visual_flowvec' in self.conf:
                motion_vecs = self.compute_motion_vector(cdna_kerns)
                output = tf.zeros([self.conf['batch_size'], 64, 64, 2])
                for vec, mask in zip(motion_vecs, mask_list[1:]):
                    vec = tf.reshape(vec, [self.conf['batch_size'], 1, 1, 2])
                    vec = tf.tile(vec, [1, 64, 64, 1])
                    output += vec * mask
                self.flow_vectors.append(output)

            if current_state is not None:
                current_state = slim.layers.fully_connected(
                    state_action,
                    int(current_state.get_shape()[1]),
                    scope='state_pred', activation_fn=None)
            self.gen_states.append(current_state)
def build(self):
    if 'kern_size' in self.conf:
        KERN_SIZE = self.conf['kern_size']
    else:
        KERN_SIZE = 5

    batch_size, img_height, img_width, color_channels = self.images[0].get_shape()[0:4]
    lstm_func = basic_conv_lstm_cell

    if self.states is not None:
        current_state = self.states[0]
    else:
        current_state = None

    if self.actions is None:
        self.actions = [None for _ in self.images]

    if self.k == -1:
        feedself = True
    else:
        # Scheduled sampling:
        # Calculate number of ground-truth frames to pass in.
        num_ground_truth = tf.to_int32(
            tf.round(tf.to_float(batch_size)
                     * (self.k / (self.k + tf.exp(self.iter_num / self.k)))))
        feedself = False

    # LSTM state sizes and states.
    if 'lstm_size' in self.conf:
        lstm_size = self.conf['lstm_size']
        print('using lstm size', lstm_size)
    else:
        lstm_size = np.int32(np.array([16, 32, 64, 32, 16]))

    lstm_state1, lstm_state2, lstm_state3, lstm_state4 = None, None, None, None
    lstm_state5, lstm_state6, lstm_state7 = None, None, None

    t = -1
    self.T = len(self.images)
    for image, action in zip(self.images[:-1], self.actions[:-1]):
        t += 1
        print(t)
        # Reuse variables after the first timestep.
        reuse = bool(self.gen_images)
        done_warm_start = len(self.gen_images) > self.ncontext - 1

        with slim.arg_scope([lstm_func, slim.layers.conv2d,
                             slim.layers.fully_connected, tf_layers.layer_norm,
                             slim.layers.conv2d_transpose], reuse=reuse):
            if feedself and done_warm_start:
                # Feed in generated image.
                prev_image = self.gen_images[-1]  # 64x64x6
                if self.pix_distributions1 is not None:
                    prev_pix_distrib1 = self.gen_distrib1[-1]
                    if 'ndesig' in self.conf:
                        prev_pix_distrib2 = self.gen_distrib2[-1]
            elif done_warm_start:
                # Scheduled sampling
                prev_image = scheduled_sample(image, self.gen_images[-1],
                                              batch_size, num_ground_truth)
            else:
                # Always feed in ground_truth
                prev_image = image
                if self.pix_distributions1 is not None:
                    prev_pix_distrib1 = self.pix_distributions1[t]
                    if 'ndesig' in self.conf:
                        prev_pix_distrib2 = self.pix_distributions2[t]
                    if len(prev_pix_distrib1.get_shape()) == 3:
                        prev_pix_distrib1 = tf.expand_dims(prev_pix_distrib1, -1)
                        if 'ndesig' in self.conf:
                            prev_pix_distrib2 = tf.expand_dims(prev_pix_distrib2, -1)

            if 'refeed_firstimage' in self.conf:
                assert self.conf['model'] == 'STP'
                if t > 1:
                    input_image = self.images[1]
                    print('refeed with image 1')
                else:
                    input_image = prev_image
            else:
                input_image = prev_image

            # Predicted state is always fed back in
            if 'ignore_state_action' not in self.conf:
                state_action = tf.concat(axis=1, values=[action, current_state])

            enc0 = slim.layers.conv2d(  # 32x32x32
                input_image, 32, [5, 5], stride=2, scope='scale1_conv1',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm1'})

            hidden1, lstm_state1 = lstm_func(  # 32x32x16
                enc0, lstm_state1, lstm_size[0], scope='state1')
            hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')

            enc1 = slim.layers.conv2d(  # 16x16x16
                hidden1, hidden1.get_shape()[3], [3, 3], stride=2, scope='conv2')

            hidden3, lstm_state3 = lstm_func(  # 16x16x32
                enc1, lstm_state3, lstm_size[1], scope='state3')
            hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4')

            enc2 = slim.layers.conv2d(  # 8x8x32
                hidden3, hidden3.get_shape()[3], [3, 3], stride=2, scope='conv3')

            if 'ignore_state_action' not in self.conf:
                # Pass in state and action.
                if 'ignore_state' in self.conf:
                    lowdim = action
                    print('ignoring state')
                else:
                    lowdim = state_action
                smear = tf.reshape(lowdim, [int(batch_size), 1, 1, int(lowdim.get_shape()[1])])
                smear = tf.tile(smear,
                                [1, int(enc2.get_shape()[1]), int(enc2.get_shape()[2]), 1])
                enc2 = tf.concat(axis=3, values=[enc2, smear])
            else:
                print('ignoring states and actions')

            enc3 = slim.layers.conv2d(  # 8x8x32
                enc2, hidden3.get_shape()[3], [1, 1], stride=1, scope='conv4')

            hidden5, lstm_state5 = lstm_func(  # 8x8x64
                enc3, lstm_state5, lstm_size[2], scope='state5')
            hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm6')

            enc4 = slim.layers.conv2d_transpose(  # 16x16x64
                hidden5, hidden5.get_shape()[3], 3, stride=2, scope='convt1')

            hidden6, lstm_state6 = lstm_func(  # 16x16x32
                enc4, lstm_state6, lstm_size[3], scope='state6')
            hidden6 = tf_layers.layer_norm(hidden6, scope='layer_norm7')
            if 'noskip' not in self.conf:
                # Skip connection.
                hidden6 = tf.concat(axis=3, values=[hidden6, enc1])  # both 16x16

            enc5 = slim.layers.conv2d_transpose(  # 32x32x32
                hidden6, hidden6.get_shape()[3], 3, stride=2, scope='convt2')

            hidden7, lstm_state7 = lstm_func(  # 32x32x16
                enc5, lstm_state7, lstm_size[4], scope='state7')
            hidden7 = tf_layers.layer_norm(hidden7, scope='layer_norm8')
            if 'noskip' not in self.conf:
                # Skip connection.
                hidden7 = tf.concat(axis=3, values=[hidden7, enc0])  # both 32x32

            enc6 = slim.layers.conv2d_transpose(  # 64x64x16
                hidden7, hidden7.get_shape()[3], 3, stride=2, scope='convt3',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm9'})

            im_history = self.assemble_history(t)

            if self.conf['model'] == 'DNA':
                # Using largest hidden state for predicting untied conv kernels.
                trafo_input = slim.layers.conv2d_transpose(
                    enc6, KERN_SIZE**2, 1, stride=1, scope='convt4_cam2')
                transformed_l = [self.dna_transformation(prev_image, trafo_input,
                                                         self.conf['kern_size'])]
                if self.pix_distributions1 is not None:
                    transf_distrib_ndesig1 = [self.dna_transformation(prev_pix_distrib1,
                                                                      trafo_input, KERN_SIZE)]
                    if 'ndesig' in self.conf:
                        transf_distrib_ndesig2 = [self.dna_transformation(prev_pix_distrib2,
                                                                          trafo_input, KERN_SIZE)]
                total_masks = 1

            if self.conf['model'] == 'CDNA':
                total_masks = (self.T - 1) * self.num_masks
                cdna_input = tf.reshape(hidden5, [int(batch_size), -1])
                transformed_l = []
                for i, h_image in enumerate(im_history):
                    transformed, _ = self.cdna_transformation(
                        h_image, cdna_input, reuse_sc=reuse, scope='cdna_from{}'.format(i))
                    transformed_l += transformed

            output, mask_list = self.fuse_trafos(enc6, transformed_l,
                                                 scope='convt7_cam2', total_masks=total_masks)
            self.moved_images.append(transformed_l)

            if self.pix_distributions1 is not None:
                transf_distrib_ndesig1, _ = self.cdna_transformation(prev_pix_distrib1,
                                                                     cdna_input, reuse_sc=True)
                self.moved_pix_distrib1.append(transf_distrib_ndesig1)
                self.moved_images.append(transformed_l)

            self.gen_images.append(output)
            self.gen_masks.append(mask_list)

            if self.pix_distributions1 is not None:
                pix_distrib_output = self.fuse_pix_distrib(total_masks, mask_list,
                                                           self.pix_distributions1,
                                                           prev_pix_distrib1,
                                                           transf_distrib_ndesig1)
                self.gen_distrib1.append(pix_distrib_output)

            if current_state is not None:
                current_state = slim.layers.fully_connected(
                    state_action,
                    int(current_state.get_shape()[1]),
                    scope='state_pred', activation_fn=None)
            self.gen_states.append(current_state)
def forward(images, index, dna, cdna, num_masks=10):
    stime = time.time()
    batch_size, img_height, img_width = images[0].get_shape()[0:3]
    lstm_func = basic_conv_lstm_cell

    # Generated images.
    gen_images = []

    lstm_size = np.int32(np.array([32, 32, 64, 64, 128, 64, 32]))
    lstm_state1, lstm_state2, lstm_state3, lstm_state4 = None, None, None, None
    lstm_state5, lstm_state6, lstm_state7 = None, None, None

    for i in range(len(images)):
        # Reuse variables after the first timestep.
        reuse = (i > 0)
        with slim.arg_scope([lstm_func, slim.layers.conv2d,
                             slim.layers.fully_connected, tf_layers.layer_norm,
                             slim.layers.conv2d_transpose], reuse=reuse):
            if reuse and i > index:
                prev_image = tf.reshape(gen_images[-1], [batch_size, img_height, img_width, 1])
            else:
                prev_image = tf.reshape(images[i], [batch_size, img_height, img_width, 1])

            enc0 = slim.layers.conv2d(
                prev_image, 32, 5, stride=1, scope='scale1_conv1',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm1'})

            hidden1, lstm_state1 = lstm_func(enc0, lstm_state1, lstm_size[0], scope='state1')
            hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')
            hidden2, lstm_state2 = lstm_func(hidden1, lstm_state2, lstm_size[1], scope='state2')
            hidden2 = tf_layers.layer_norm(hidden2, scope='layer_norm3')
            enc1 = slim.layers.conv2d(hidden2, hidden2.get_shape()[3], [3, 3],
                                      stride=1, scope='conv2')

            hidden3, lstm_state3 = lstm_func(enc1, lstm_state3, lstm_size[2], scope='state3')
            hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4')
            hidden4, lstm_state4 = lstm_func(hidden3, lstm_state4, lstm_size[3], scope='state4')
            hidden4 = tf_layers.layer_norm(hidden4, scope='layer_norm5')
            output = slim.layers.conv2d(hidden4, hidden4.get_shape()[3], [3, 3],
                                        stride=1, scope='conv3')

            if i > index - 1:
                gen_images.append(output[:, :, :, 0:1])

    print(time.time() - stime)
    return gen_images
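# Hedged usage sketch (not from the original source): `forward` consumes a
# list of single-channel frames, feeds ground truth up to `index`, then feeds
# back its own predictions. The placeholder shapes below are illustrative
# assumptions, not values from the original code.
frames_demo = [tf.placeholder(tf.float32, [8, 64, 64]) for _ in range(10)]
gen_demo = forward(frames_demo, index=2, dna=True, cdna=False)
# gen_demo holds predicted frames for timesteps index .. len(frames_demo) - 1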
    if FLAGS.max_pool:
        conv_output = tf.nn.conv2d(inp, cweight, no_stride, 'SAME') + bweight
    else:
        conv_output = tf.nn.conv2d(inp, cweight, stride, 'SAME') + bweight
    normed = normalize(conv_output, activation, reuse, scope)
    if FLAGS.max_pool:
        normed = tf.nn.max_pool(normed, stride, stride, max_pool_pad)
    return normed

def normalize(inp, activation, reuse, scope):
    if FLAGS.norm == 'batch_norm':
        # Batch normalization
        return tf_layers.batch_norm(inp, activation_fn=activation, reuse=reuse, scope=scope)
    elif FLAGS.norm == 'layer_norm':
        # Layer normalization
        return tf_layers.layer_norm(inp, activation_fn=activation, reuse=reuse, scope=scope)
    elif FLAGS.norm == 'None':
        return activation(inp) if activation is not None else inp

## Loss functions
def mse(pred, label):
    # Mean squared error
    pred = tf.reshape(pred, [-1])
    label = tf.reshape(label, [-1])
    return tf.reduce_mean(tf.square(pred - label))

def xent(pred, label):
    # Cross entropy
    # Note: with tf version <= 0.12, this loss has incorrect 2nd derivatives
    return tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=label) / FLAGS.update_batch_size
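# Hedged usage sketch (not from the original source): applying the losses
# above to model outputs. `labels_ph`, `logits_ph`, and the 10-way output
# size are illustrative assumptions; `mse`, `xent`, and `FLAGS` are the
# names defined above.
labels_ph = tf.placeholder(tf.float32, [None, 10])
logits_ph = tf.placeholder(tf.float32, [None, 10])
regression_loss = mse(logits_ph, labels_ph)           # scalar, for regression
# xent returns a per-example vector (already scaled by update_batch_size);
# reduce it to a scalar for the optimizer:
classification_loss = tf.reduce_mean(xent(logits_ph, labels_ph))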
def build(self):
    if 'kern_size' in self.conf:
        KERN_SIZE = self.conf['kern_size']
    else:
        KERN_SIZE = 5

    batch_size, img_height, img_width, color_channels = self.images[0].get_shape()[0:4]
    lstm_func = basic_conv_lstm_cell

    if self.states is not None:
        current_state = self.states[0]
    else:
        current_state = None

    if self.actions is None:
        self.actions = [None for _ in self.images]

    if self.k == -1:
        feedself = True
    else:
        # Scheduled sampling:
        # Calculate number of ground-truth frames to pass in.
        num_ground_truth = tf.to_int32(
            tf.round(tf.to_float(batch_size)
                     * (self.k / (self.k + tf.exp(self.iter_num / self.k)))))
        feedself = False

    # LSTM state sizes and states.
    if 'lstm_size' in self.conf:
        lstm_size = self.conf['lstm_size']
        print('using lstm size', lstm_size)
    else:
        ngf = self.conf['ngf']
        lstm_size = np.int32(np.array([ngf, ngf * 2, ngf * 4, ngf * 2, ngf]))

    lstm_state1, lstm_state2, lstm_state3, lstm_state4 = None, None, None, None
    lstm_state5, lstm_state6, lstm_state7 = None, None, None

    for t, action in enumerate(self.actions):
        print(t)
        # Reuse variables after the first timestep.
        reuse = bool(self.gen_images)
        done_warm_start = len(self.gen_images) > self.context_frames - 1

        with slim.arg_scope([lstm_func, slim.layers.conv2d,
                             slim.layers.fully_connected, tf_layers.layer_norm,
                             slim.layers.conv2d_transpose], reuse=reuse):
            if feedself and done_warm_start:
                # Feed in generated image.
                prev_image = self.gen_images[-1]  # 64x64x6
                if self.pix_distributions1 is not None:
                    prev_pix_distrib1 = self.gen_distrib1[-1]
                    if 'ndesig' in self.conf:
                        prev_pix_distrib2 = self.gen_distrib2[-1]
            elif done_warm_start:
                # Scheduled sampling
                prev_image = scheduled_sample(self.images[t], self.gen_images[-1],
                                              batch_size, num_ground_truth)
            else:
                # Always feed in ground_truth
                prev_image = self.images[t]
                if self.pix_distributions1 is not None:
                    prev_pix_distrib1 = self.pix_distributions1[t]
                    if 'ndesig' in self.conf:
                        prev_pix_distrib2 = self.pix_distributions2[t]
                    if len(prev_pix_distrib1.get_shape()) == 3:
                        prev_pix_distrib1 = tf.expand_dims(prev_pix_distrib1, -1)
                        if 'ndesig' in self.conf:
                            prev_pix_distrib2 = tf.expand_dims(prev_pix_distrib2, -1)

            if 'refeed_firstimage' in self.conf:
                assert self.conf['model'] == 'STP'
                if t > 1:
                    input_image = self.images[1]
                    print('refeed with image 1')
                else:
                    input_image = prev_image
            else:
                input_image = prev_image

            # Predicted state is always fed back in
            if 'ignore_state_action' not in self.conf:
                state_action = tf.concat(axis=1, values=[action, current_state])

            enc0 = slim.layers.conv2d(  # 32x32x32
                input_image, 32, [5, 5], stride=2, scope='scale1_conv1',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm1'})

            hidden1, lstm_state1 = lstm_func(  # 32x32x16
                enc0, lstm_state1, lstm_size[0], scope='state1')
            hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2')

            enc1 = slim.layers.conv2d(  # 16x16x16
                hidden1, hidden1.get_shape()[3], [3, 3], stride=2, scope='conv2')

            hidden3, lstm_state3 = lstm_func(  # 16x16x32
                enc1, lstm_state3, lstm_size[1], scope='state3')
            hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4')

            enc2 = slim.layers.conv2d(  # 8x8x32
                hidden3, hidden3.get_shape()[3], [3, 3], stride=2, scope='conv3')

            if 'ignore_state_action' not in self.conf:
                # Pass in state and action.
                if 'ignore_state' in self.conf:
                    lowdim = action
                    print('ignoring state')
                else:
                    lowdim = state_action
                smear = tf.reshape(lowdim, [int(batch_size), 1, 1, int(lowdim.get_shape()[1])])
                smear = tf.tile(smear,
                                [1, int(enc2.get_shape()[1]), int(enc2.get_shape()[2]), 1])
                enc2 = tf.concat(axis=3, values=[enc2, smear])
            else:
                print('ignoring states and actions')

            enc3 = slim.layers.conv2d(  # 8x8x32
                enc2, hidden3.get_shape()[3], [1, 1], stride=1, scope='conv4')

            hidden5, lstm_state5 = lstm_func(  # 8x8x64
                enc3, lstm_state5, lstm_size[2], scope='state5')
            hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm6')

            enc4 = slim.layers.conv2d_transpose(  # 16x16x64
                hidden5, hidden5.get_shape()[3], 3, stride=2, scope='convt1')

            hidden6, lstm_state6 = lstm_func(  # 16x16x32
                enc4, lstm_state6, lstm_size[3], scope='state6')
            hidden6 = tf_layers.layer_norm(hidden6, scope='layer_norm7')
            if 'noskip' not in self.conf:
                # Skip connection.
                hidden6 = tf.concat(axis=3, values=[hidden6, enc1])  # both 16x16

            enc5 = slim.layers.conv2d_transpose(  # 32x32x32
                hidden6, hidden6.get_shape()[3], 3, stride=2, scope='convt2')

            hidden7, lstm_state7 = lstm_func(  # 32x32x16
                enc5, lstm_state7, lstm_size[4], scope='state7')
            hidden7 = tf_layers.layer_norm(hidden7, scope='layer_norm8')
            if 'noskip' not in self.conf:
                # Skip connection.
                hidden7 = tf.concat(axis=3, values=[hidden7, enc0])  # both 32x32

            enc6 = slim.layers.conv2d_transpose(  # 64x64x16
                hidden7, hidden7.get_shape()[3], 3, stride=2, scope='convt3',
                normalizer_fn=tf_layers.layer_norm,
                normalizer_params={'scope': 'layer_norm9'})

            if 'transform_from_firstimage' in self.conf:
                prev_image = self.images[1]
                if self.pix_distributions1 is not None:
                    prev_pix_distrib1 = self.pix_distributions1[1]
                    prev_pix_distrib1 = tf.expand_dims(prev_pix_distrib1, -1)
                print('transform from image 1')

            if self.conf['model'] == 'DNA':
                # Using largest hidden state for predicting untied conv kernels.
                trafo_input = slim.layers.conv2d_transpose(
                    enc6, KERN_SIZE**2, 1, stride=1, scope='convt4_cam2')
                transformed_l = [self.dna_transformation(prev_image, trafo_input,
                                                         self.conf['kern_size'])]
                if self.pix_distributions1 is not None:
                    transf_distrib_ndesig1 = [self.dna_transformation(prev_pix_distrib1,
                                                                      trafo_input, KERN_SIZE)]
                    if 'ndesig' in self.conf:
                        transf_distrib_ndesig2 = [self.dna_transformation(prev_pix_distrib2,
                                                                          trafo_input, KERN_SIZE)]
                extra_masks = 1
                ## extra_masks = 2 is needed for running singleview_shifted!!
                # print('using extra masks 2 because of single view shifted!!')
                # extra_masks = 2

            if self.conf['model'] == 'CDNA':
                if 'gen_pix' in self.conf:
                    # Using largest hidden state for predicting a new image layer.
                    enc7 = slim.layers.conv2d_transpose(
                        enc6, color_channels, 1, stride=1, scope='convt4', activation_fn=None)
                    # This allows the network to also generate one image from scratch,
                    # which is useful when regions of the image become unoccluded.
                    transformed_l = [tf.nn.sigmoid(enc7)]
                    extra_masks = 2
                else:
                    transformed_l = []
                    extra_masks = 1

                cdna_input = tf.reshape(hidden5, [int(batch_size), -1])
                new_transformed, _ = self.cdna_transformation(prev_image, cdna_input,
                                                              reuse_sc=reuse)
                transformed_l += new_transformed
                self.moved_images.append(transformed_l)

                if self.pix_distributions1 is not None:
                    transf_distrib_ndesig1, _ = self.cdna_transformation(prev_pix_distrib1,
                                                                         cdna_input,
                                                                         reuse_sc=True)
                    self.moved_pix_distrib1.append(transf_distrib_ndesig1)
                    if 'ndesig' in self.conf:
                        transf_distrib_ndesig2, _ = self.cdna_transformation(
                            prev_pix_distrib2, cdna_input, reuse_sc=True)
                        self.moved_pix_distrib2.append(transf_distrib_ndesig2)

            if self.conf['model'] == 'STP':
                enc7 = slim.layers.conv2d_transpose(
                    enc6, color_channels, 1, stride=1, scope='convt5', activation_fn=None)
                # This allows the network to also generate one image from scratch,
                # which is useful when regions of the image become unoccluded.
                if 'gen_pix' in self.conf:
                    transformed_l = [tf.nn.sigmoid(enc7)]
                    extra_masks = 2
                else:
                    transformed_l = []
                    extra_masks = 1

                enc_stp = tf.reshape(hidden5, [int(batch_size), -1])
                stp_input = slim.layers.fully_connected(enc_stp, 200, scope='fc_stp_cam2')

                # disabling capability to generate pixels
                reuse_stp = None
                if reuse:
                    reuse_stp = reuse
                # enable the generation of pixels:
                transformed, trafo = self.stp_transformation(
                    prev_image, stp_input, self.num_masks, reuse_stp, suffix='cam2')
                transformed_l += transformed
                self.trafos.append(trafo)
                self.moved_images.append(transformed_l)

                if self.pix_distributions1 is not None:
                    transf_distrib_ndesig1, _ = self.stp_transformation(
                        prev_pix_distrib1, stp_input, suffix='cam2', reuse=True)
                    self.moved_pix_distrib1.append(transf_distrib_ndesig1)

            if '1stimg_bckgd' in self.conf:
                background = self.images[0]
                print('using background from first image..')
            else:
                background = prev_image

            output, mask_list = self.fuse_trafos(enc6, background, transformed_l,
                                                 scope='convt7_cam2', extra_masks=extra_masks)
            self.gen_images.append(output)
            self.gen_masks.append(mask_list)

            if self.pix_distributions1 is not None:
                pix_distrib_output = self.fuse_pix_distrib(extra_masks, mask_list,
                                                           self.pix_distributions1,
                                                           prev_pix_distrib1,
                                                           transf_distrib_ndesig1)
                self.gen_distrib1.append(pix_distrib_output)
                if 'ndesig' in self.conf:
                    pix_distrib_output = self.fuse_pix_distrib(extra_masks, mask_list,
                                                               self.pix_distributions2,
                                                               prev_pix_distrib2,
                                                               transf_distrib_ndesig2)
                    self.gen_distrib2.append(pix_distrib_output)

            if int(current_state.get_shape()[1]) == 0:
                current_state = tf.zeros_like(state_action)
            else:
                current_state = slim.layers.fully_connected(
                    state_action,
                    int(current_state.get_shape()[1]),
                    scope='state_pred', activation_fn=None)
            self.gen_states.append(current_state)
def construct_model(images, actions=None, states=None, iter_num=-1.0, k=-1, use_state=True, num_masks=10, stp=False, cdna=True, dna=False, context_frames=2): """Build convolutional lstm video predictor using STP, CDNA, or DNA. Args: images: tensor of ground truth image sequences actions: tensor of action sequences states: tensor of ground truth state sequences iter_num: tensor of the current training iteration (for sched. sampling) k: constant used for scheduled sampling. -1 to feed in own prediction. use_state: True to include state and action in prediction num_masks: the number of different pixel motion predictions (and the number of masks for each of those predictions) stp: True to use Spatial Transformer Predictor (STP) cdna: True to use Convolutional Dynamic Neural Advection (CDNA) dna: True to use Dynamic Neural Advection (DNA) context_frames: number of ground truth frames to pass in before feeding in own predictions Returns: gen_images: predicted future image frames gen_states: predicted future states Raises: ValueError: if more than one network option specified or more than 1 mask specified for DNA model. """ if stp + cdna + dna != 1: raise ValueError('More than one, or no network option specified.') batch_size, img_height, img_width, color_channels = images[0].get_shape()[0:4] lstm_func = basic_conv_lstm_cell # Generated robot states and images. gen_states, gen_images = [], [] current_state = states[0] if k == -1: feedself = True else: # Scheduled sampling: # Calculate number of ground-truth frames to pass in. num_ground_truth = tf.to_int32( tf.round(tf.to_float(batch_size) * (k / (k + tf.exp(iter_num / k))))) feedself = False # LSTM state sizes and states. lstm_size = np.int32(np.array([32, 32, 64, 64, 128, 64, 32])) lstm_state1, lstm_state2, lstm_state3, lstm_state4 = None, None, None, None lstm_state5, lstm_state6, lstm_state7 = None, None, None for image, action in zip(images[:-1], actions[:-1]): # Reuse variables after the first timestep. reuse = bool(gen_images) done_warm_start = len(gen_images) > context_frames - 1 with slim.arg_scope( [lstm_func, slim.layers.conv2d, slim.layers.fully_connected, tf_layers.layer_norm, slim.layers.conv2d_transpose], reuse=reuse): if feedself and done_warm_start: # Feed in generated image. prev_image = gen_images[-1] elif done_warm_start: # Scheduled sampling prev_image = scheduled_sample(image, gen_images[-1], batch_size, num_ground_truth) else: # Always feed in ground_truth prev_image = image # Predicted state is always fed back in state_action = tf.concat(axis=1, values=[action, current_state]) enc0 = slim.layers.conv2d( prev_image, 32, [5, 5], stride=2, scope='scale1_conv1', normalizer_fn=tf_layers.layer_norm, normalizer_params={'scope': 'layer_norm1'}) hidden1, lstm_state1 = lstm_func( enc0, lstm_state1, lstm_size[0], scope='state1') hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2') hidden2, lstm_state2 = lstm_func( hidden1, lstm_state2, lstm_size[1], scope='state2') hidden2 = tf_layers.layer_norm(hidden2, scope='layer_norm3') enc1 = slim.layers.conv2d( hidden2, hidden2.get_shape()[3], [3, 3], stride=2, scope='conv2') hidden3, lstm_state3 = lstm_func( enc1, lstm_state3, lstm_size[2], scope='state3') hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4') hidden4, lstm_state4 = lstm_func( hidden3, lstm_state4, lstm_size[3], scope='state4') hidden4 = tf_layers.layer_norm(hidden4, scope='layer_norm5') enc2 = slim.layers.conv2d( hidden4, hidden4.get_shape()[3], [3, 3], stride=2, scope='conv3') # Pass in state and action.
smear = tf.reshape( state_action, [int(batch_size), 1, 1, int(state_action.get_shape()[1])]) smear = tf.tile( smear, [1, int(enc2.get_shape()[1]), int(enc2.get_shape()[2]), 1]) if use_state: enc2 = tf.concat(axis=3, values=[enc2, smear]) enc3 = slim.layers.conv2d( enc2, hidden4.get_shape()[3], [1, 1], stride=1, scope='conv4') hidden5, lstm_state5 = lstm_func( enc3, lstm_state5, lstm_size[4], scope='state5') # last 8x8 hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm6') enc4 = slim.layers.conv2d_transpose( hidden5, hidden5.get_shape()[3], 3, stride=2, scope='convt1') hidden6, lstm_state6 = lstm_func( enc4, lstm_state6, lstm_size[5], scope='state6') # 16x16 hidden6 = tf_layers.layer_norm(hidden6, scope='layer_norm7') # Skip connection. hidden6 = tf.concat(axis=3, values=[hidden6, enc1]) # both 16x16 enc5 = slim.layers.conv2d_transpose( hidden6, hidden6.get_shape()[3], 3, stride=2, scope='convt2') hidden7, lstm_state7 = lstm_func( enc5, lstm_state7, lstm_size[6], scope='state7') # 32x32 hidden7 = tf_layers.layer_norm(hidden7, scope='layer_norm8') # Skip connection. hidden7 = tf.concat(axis=3, values=[hidden7, enc0]) # both 32x32 enc6 = slim.layers.conv2d_transpose( hidden7, hidden7.get_shape()[3], 3, stride=2, scope='convt3', normalizer_fn=tf_layers.layer_norm, normalizer_params={'scope': 'layer_norm9'}) if dna: # Using largest hidden state for predicting untied conv kernels. enc7 = slim.layers.conv2d_transpose( enc6, DNA_KERN_SIZE**2, 1, stride=1, scope='convt4') else: # Using largest hidden state for predicting a new image layer. enc7 = slim.layers.conv2d_transpose( enc6, color_channels, 1, stride=1, scope='convt4') # This allows the network to also generate one image from scratch, # which is useful when regions of the image become unoccluded. transformed = [tf.nn.sigmoid(enc7)] if stp: stp_input0 = tf.reshape(hidden5, [int(batch_size), -1]) stp_input1 = slim.layers.fully_connected( stp_input0, 100, scope='fc_stp') transformed += stp_transformation(prev_image, stp_input1, num_masks) elif cdna: cdna_input = tf.reshape(hidden5, [int(batch_size), -1]) transformed += cdna_transformation(prev_image, cdna_input, num_masks, int(color_channels)) elif dna: # Only one mask is supported (more should be unnecessary). if num_masks != 1: raise ValueError('Only one mask is supported for DNA model.') transformed = [dna_transformation(prev_image, enc7)] masks = slim.layers.conv2d_transpose( enc6, num_masks + 1, 1, stride=1, scope='convt7') masks = tf.reshape( tf.nn.softmax(tf.reshape(masks, [-1, num_masks + 1])), [int(batch_size), int(img_height), int(img_width), num_masks + 1]) mask_list = tf.split(axis=3, num_or_size_splits=num_masks + 1, value=masks) output = mask_list[0] * prev_image for layer, mask in zip(transformed, mask_list[1:]): output += layer * mask gen_images.append(output) current_state = slim.layers.fully_connected( state_action, int(current_state.get_shape()[1]), scope='state_pred', activation_fn=None) gen_states.append(current_state) return gen_images, gen_states
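# Sketch (not part of the original files): scheduled_sample is called above but
# not shown. Consistent with its call signature, it can be written as a random
# partition of the batch: num_ground_truth rows take the ground-truth frame, the
# rest take the model's own prediction, and tf.dynamic_stitch restores the
# original row ordering.
import tensorflow as tf

def scheduled_sample_sketch(ground_truth_x, generated_x, batch_size, num_ground_truth):
    """Mix ground-truth and generated examples within one batch."""
    idx = tf.random_shuffle(tf.range(int(batch_size)))
    ground_truth_idx = tf.gather(idx, tf.range(num_ground_truth))
    generated_idx = tf.gather(idx, tf.range(num_ground_truth, int(batch_size)))
    ground_truth_examps = tf.gather(ground_truth_x, ground_truth_idx)
    generated_examps = tf.gather(generated_x, generated_idx)
    return tf.dynamic_stitch([ground_truth_idx, generated_idx],
                             [ground_truth_examps, generated_examps])

# The schedule itself is the inverse sigmoid used above: num_ground_truth is
# batch_size * k / (k + exp(iter_num / k)), so with k = 900 it starts near the
# full batch at iter_num = 0 and decays toward zero as training progresses.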
def construct_model(images, actions=None, states=None, iter_num=-1.0, k=-1, use_state=True, context_frames=2, conf=None): """Build convolutional lstm video predictor using STP, CDNA, or DNA. Args: images: tensor of ground truth image sequences actions: tensor of action sequences states: tensor of ground truth state sequences iter_num: tensor of the current training iteration (for sched. sampling) k: constant used for scheduled sampling. -1 to feed in own prediction. use_state: True to include state and action in prediction context_frames: number of ground truth frames to pass in before feeding in own predictions conf: configuration dictionary (model type, kernel sizes, latent options) Returns: gen_images: predicted future image frames gen_states: predicted future states gen_masks: predicted compositing masks inf_low_state_list: inferred low-dimensional latent states pred_low_state_list: forward-projected low-dimensional latent states Raises: ValueError: if more than one network option specified or more than 1 mask specified for DNA model. """ if 'dna_size' in conf: DNA_KERN_SIZE = conf['dna_size'] else: DNA_KERN_SIZE = 5 print('constructing network with hidden state...') batch_size, img_height, img_width, color_channels = images[0].get_shape( )[0:4] batch_size = int(batch_size) lstm_func = basic_conv_lstm_cell # Generated robot states and images. gen_states, gen_images, gen_masks, inf_low_state_list, pred_low_state_list = [], [], [], [], [] current_state = states[0] gen_pix_distrib = [] summaries = [] if k == -1: feedself = True else: # Scheduled sampling: # Calculate number of ground-truth frames to pass in. num_ground_truth = tf.to_int32( tf.round( tf.to_float(batch_size) * (k / (k + tf.exp(iter_num / k))))) feedself = False # LSTM state sizes and states. lstm_size = np.int32(np.array([16, 32, 64, 100, 10])) lstm_state1, lstm_state2, lstm_state3 = None, None, None for t, image, action in zip(range(len(images)), images[:-1], actions[:-1]): # Reuse variables after the first timestep. reuse = bool(gen_images) done_warm_start = len(gen_images) > context_frames - 1 with slim.arg_scope([ lstm_func, slim.layers.conv2d, slim.layers.fully_connected, tf_layers.layer_norm, slim.layers.conv2d_transpose ], reuse=reuse): if feedself and done_warm_start: # Feed in generated image. prev_image = gen_images[-1] elif done_warm_start: # Scheduled sampling prev_image = scheduled_sample(image, gen_images[-1], batch_size, num_ground_truth) else: # Always feed in ground_truth prev_image = image if 'prop_latent' not in conf or t < 2: # encode! print('encode {}'.format(t)) # Predicted state is always fed back in state_action = tf.concat(axis=1, values=[action, current_state]) # 6x enc0 = slim.layers.conv2d( #32x32x32 prev_image, 32, kernel_size=[5, 5], stride=2, scope='scale1_conv1', normalizer_fn=tf_layers.layer_norm, normalizer_params={'scope': 'layer_norm1'}) hidden1, lstm_state1 = lstm_func( #32x32x16 enc0, lstm_state1, lstm_size[0], scope='state1') hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2') enc1 = slim.layers.conv2d( #16x16x16 hidden1, hidden1.get_shape()[3], [3, 3], stride=2, scope='conv2') hidden2, lstm_state2 = lstm_func( #16x16x32 enc1, lstm_state2, lstm_size[1], scope='state3') hidden2 = tf_layers.layer_norm(hidden2, scope='layer_norm3') enc2 = slim.layers.conv2d( #8x8x32 hidden2, hidden2.get_shape()[3], [3, 3], stride=2, scope='conv3') # Pass in state and action.
smear = tf.reshape( state_action, [batch_size, 1, 1, int(state_action.get_shape()[1])]) smear = tf.tile( #8x8x6 smear, [1, int(enc2.get_shape()[1]), int(enc2.get_shape()[2]), 1]) if use_state: enc2 = tf.concat(axis=3, values=[enc2, smear]) enc3 = slim.layers.conv2d( #8x8x32 enc2, hidden2.get_shape()[3], [1, 1], stride=1, scope='conv4') hidden3, lstm_state3 = lstm_func( #8x8x64 enc3, lstm_state3, lstm_size[2], scope='state5') # last 8x8 hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4') enc3 = slim.layers.conv2d( # 8x8x32 hidden3, 32, [1, 1], stride=1, scope='conv5') if 'num_lt_featuremaps' in conf: enc4_num_ft_mps = conf['num_lt_featuremaps'] else: enc4_num_ft_mps = 8 enc4 = slim.layers.conv2d( # 8x8x enc4_num_ft_mps enc3, enc4_num_ft_mps, [3, 3], stride=1, scope='conv6') if '4x4lowdim' in conf: enc5 = slim.layers.conv2d( # 8x8x1 enc4, 1, [3, 3], stride=1, scope='conv7') low_dim_state = slim.layers.conv2d( # 4x4x1 enc5, 1, [3, 3], stride=2, scope='conv8') else: if 'num_lt_featuremaps' in conf: num_lt_feature = conf['num_lt_featuremaps'] else: num_lt_feature = 1 print('number of latent feature maps:', num_lt_feature) low_dim_state = slim.layers.conv2d( # 8x8xnum_lt_feature enc4, num_lt_feature, [3, 3], stride=1, scope='conv7') inf_low_state_list.append(low_dim_state) pred_low_state_list.append( project_fwd_lowdim(conf, low_dim_state)) ## start decoding from here: print('decode with inferred lt-state at t{}'.format(t)) else: # when propagating the latent, t = 2,3,... assert '4x4lowdim' not in conf print('decode with predicted lt-state at t{}'.format(t)) pred_low_state_list.append( project_fwd_lowdim(conf, pred_low_state_list[-1])) low_dim_state = pred_low_state_list[-1] if '4x4lowdim' in conf: dec4 = slim.layers.conv2d_transpose( # 8x8x1 low_dim_state, 1, [3, 3], stride=2, scope='convt0') else: dec4 = low_dim_state dec5 = slim.layers.conv2d_transpose( # 8x8x16 dec4, 16, 3, stride=1, scope='convt1', normalizer_fn=tf_layers.layer_norm, normalizer_params={'scope': 'layer_norm5'}) dec6 = slim.layers.conv2d_transpose( # 16x16x16 dec5, 16, 3, stride=2, scope='convt2', normalizer_fn=tf_layers.layer_norm, normalizer_params={'scope': 'layer_norm6'}) if 'skip' in conf: dec6 = tf.concat(axis=3, values=[dec6, enc1]) # both 16x16x16 + 16x16x16 dec7 = slim.layers.conv2d_transpose( # 16x16x32 dec6, 32, 3, stride=1, scope='convt3', normalizer_fn=tf_layers.layer_norm, normalizer_params={'scope': 'layer_norm7'}) dec8 = slim.layers.conv2d_transpose( #32x32x32 dec7, 32, 3, stride=2, scope='convt4', normalizer_fn=tf_layers.layer_norm, normalizer_params={'scope': 'layer_norm8'}) if 'skip' in conf: dec8 = tf.concat(axis=3, values=[dec8, enc0]) # both 32x32x32 + 32x32x32 dec9 = slim.layers.conv2d_transpose( #64x64x16 dec8, 16, 3, stride=2, scope='convt5', normalizer_fn=tf_layers.layer_norm, normalizer_params={'scope': 'layer_norm9'}) # Using largest hidden state for predicting untied conv kernels.
dec10 = slim.layers.conv2d_transpose(dec9, DNA_KERN_SIZE**2, 1, stride=1, scope='convt6') num_masks = conf['num_masks'] # hoisted so the mask branch below always sees it if conf['model'] == 'STP': stp_input = tf.reshape(dec10, [int(batch_size), -1]) transformed = stp_transformation(prev_image, stp_input, num_masks) elif conf['model'] == 'DNA': transformed = [ dna_transformation(prev_image, dec10, DNA_KERN_SIZE) ] if 'use_masks' in conf: masks = slim.layers.conv2d_transpose(dec10, num_masks + 1, 1, stride=1, scope='convt7') masks = tf.reshape( tf.nn.softmax(tf.reshape(masks, [-1, num_masks + 1])), [ int(batch_size), int(img_height), int(img_width), num_masks + 1 ]) mask_list = tf.split(axis=3, num_or_size_splits=num_masks + 1, value=masks) output = mask_list[0] * prev_image for layer, mask in zip(transformed, mask_list[1:]): output += layer * mask else: mask_list = None [output] = transformed gen_images.append(output) gen_masks.append(mask_list) current_state = decode_low_dim_obs(conf, low_dim_state) gen_states.append(current_state) return gen_images, gen_states, gen_masks, inf_low_state_list, pred_low_state_list
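# A small aside on the "smear" idiom used in all of these model variants: a
# low-dimensional state-action vector is broadcast over the spatial grid so it
# can be concatenated channel-wise with a feature map. Isolated, with
# illustrative names:
import tensorflow as tf

def smear_and_concat(feature_map, lowdim_vec):
    """Tile a [batch, d] vector to [batch, h, w, d] and append it as channels."""
    batch = int(feature_map.get_shape()[0])
    height = int(feature_map.get_shape()[1])
    width = int(feature_map.get_shape()[2])
    smear = tf.reshape(lowdim_vec, [batch, 1, 1, int(lowdim_vec.get_shape()[1])])
    smear = tf.tile(smear, [1, height, width, 1])
    return tf.concat(axis=3, values=[feature_map, smear])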
def construct_model(images, actions=None, states=None, iter_num=-1.0, k=-1, use_state=True, num_masks=10, stp=False, cdna=True, dna=False, context_frames=2): """Build convolutional lstm video predictor using STP, CDNA, or DNA. Args: images: tensor of ground truth image sequences actions: tensor of action sequences states: tensor of ground truth state sequences iter_num: tensor of the current training iteration (for sched. sampling) k: constant used for scheduled sampling. -1 to feed in own prediction. use_state: True to include state and action in prediction num_masks: the number of different pixel motion predictions (and the number of masks for each of those predictions) stp: True to use Spatial Transformer Predictor (STP) cdna: True to use Convolutional Dynamic Neural Advection (CDNA) dna: True to use Dynamic Neural Advection (DNA) context_frames: number of ground truth frames to pass in before feeding in own predictions Returns: gen_images: predicted future image frames gen_states: predicted future states Raises: ValueError: if more than one network option specified or more than 1 mask specified for DNA model. """ if stp + cdna + dna != 1: raise ValueError('More than one, or no network option specified.') batch_size, img_height, img_width, color_channels = images[0].get_shape()[0:4] # images: (10, 32, 64, 64, 3), time axis first lstm_func = basic_conv_lstm_cell # Generated robot states and images. gen_states, gen_images = [], [] current_state = states[0] if k == -1: feedself = True else: # Scheduled sampling: # Calculate number of ground-truth frames to pass in. num_ground_truth = tf.to_int32( tf.round(tf.to_float(batch_size) * (k / (k + tf.exp(iter_num / k))))) feedself = False # LSTM state sizes and states. lstm_size = np.int32(np.array([32, 32, 64, 64, 128, 64, 32])) lstm_state1, lstm_state2, lstm_state3, lstm_state4 = None, None, None, None # initialized outside the loop lstm_state5, lstm_state6, lstm_state7 = None, None, None for image, action in zip(images[:-1], actions[:-1]): # images[0..8]; the last frame images[9] is never fed in (each 32x64x64x3) # Reuse variables after the first timestep. reuse = bool(gen_images) done_warm_start = len(gen_images) > context_frames - 1 with slim.arg_scope( [lstm_func, slim.layers.conv2d, slim.layers.fully_connected, tf_layers.layer_norm, slim.layers.conv2d_transpose], reuse=reuse): if feedself and done_warm_start: # Feed in generated image.
prev_image = gen_images[-1] elif done_warm_start: # Scheduled sampling prev_image = scheduled_sample(image, gen_images[-1], batch_size, num_ground_truth) else: # Always feed in ground_truth prev_image = image # Predicted state is always fed back in state_action = tf.concat(axis=1, values=[action, current_state]) enc0 = slim.layers.conv2d( prev_image, 32, [5, 5], stride=2, scope='scale1_conv1', normalizer_fn=tf_layers.layer_norm, normalizer_params={'scope': 'layer_norm1'}) hidden1, lstm_state1 = lstm_func(enc0, lstm_state1, lstm_size[0], scope='state1') # 32 hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2') hidden2, lstm_state2 = lstm_func(hidden1, lstm_state2, lstm_size[1], scope='state2')# 32 hidden2 = tf_layers.layer_norm(hidden2, scope='layer_norm3') enc1 = slim.layers.conv2d(hidden2, hidden2.get_shape()[3], [3, 3], stride=2, scope='conv2') # input, num_output, kernels hidden3, lstm_state3 = lstm_func(enc1, lstm_state3, lstm_size[2], scope='state3')# 64 hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4') hidden4, lstm_state4 = lstm_func(hidden3, lstm_state4, lstm_size[3], scope='state4')# 64 hidden4 = tf_layers.layer_norm(hidden4, scope='layer_norm5') enc2 = slim.layers.conv2d(hidden4, hidden4.get_shape()[3], [3, 3], stride=2, scope='conv3') # Pass in state and action. smear = tf.reshape( state_action, [int(batch_size), 1, 1, int(state_action.get_shape()[1])]) smear = tf.tile( smear, [1, int(enc2.get_shape()[1]), int(enc2.get_shape()[2]), 1]) if use_state: enc2 = tf.concat(axis=3, values=[enc2, smear]) enc3 = slim.layers.conv2d(enc2, hidden4.get_shape()[3], [1, 1], stride=1, scope='conv4') hidden5, lstm_state5 = lstm_func(enc3, lstm_state5, lstm_size[4], scope='state5') # last 8x8 128 hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm6') enc4 = slim.layers.conv2d_transpose(hidden5, hidden5.get_shape()[3], 3, stride=2, scope='convt1') hidden6, lstm_state6 = lstm_func(enc4, lstm_state6, lstm_size[5], scope='state6') # 16x16 64 hidden6 = tf_layers.layer_norm(hidden6, scope='layer_norm7') # Skip connection. hidden6 = tf.concat(axis=3, values=[hidden6, enc1]) # both 16x16 enc5 = slim.layers.conv2d_transpose(hidden6, hidden6.get_shape()[3], 3, stride=2, scope='convt2') hidden7, lstm_state7 = lstm_func(enc5, lstm_state7, lstm_size[6], scope='state7') # 32x32 32 hidden7 = tf_layers.layer_norm(hidden7, scope='layer_norm8') # Skip connection. hidden7 = tf.concat(axis=3, values=[hidden7, enc0]) # both 32x32 enc6 = slim.layers.conv2d_transpose( hidden7, hidden7.get_shape()[3], 3, stride=2, scope='convt3', normalizer_fn=tf_layers.layer_norm, normalizer_params={'scope': 'layer_norm9'}) if dna: # Using largest hidden state for predicting untied conv kernels. enc7 = slim.layers.conv2d_transpose( enc6, DNA_KERN_SIZE**2, 1, stride=1, scope='convt4') else: # Using largest hidden state for predicting a new image layer. enc7 = slim.layers.conv2d_transpose( enc6, color_channels, 1, stride=1, scope='convt4') # This allows the network to also generate one image from scratch, # which is useful when regions of the image become unoccluded. 
transformed = [tf.nn.sigmoid(enc7)] if stp: stp_input0 = tf.reshape(hidden5, [int(batch_size), -1]) stp_input1 = slim.layers.fully_connected( stp_input0, 100, scope='fc_stp') transformed += stp_transformation(prev_image, stp_input1, num_masks) elif cdna: cdna_input = tf.reshape(hidden5, [int(batch_size), -1]) transformed += cdna_transformation(prev_image, cdna_input, num_masks, int(color_channels)) elif dna: # Only one mask is supported (more should be unnecessary). if num_masks != 1: raise ValueError('Only one mask is supported for DNA model.') transformed = [dna_transformation(prev_image, enc7)] masks = slim.layers.conv2d_transpose( enc6, num_masks + 1, 1, stride=1, scope='convt7') masks = tf.reshape( tf.nn.softmax(tf.reshape(masks, [-1, num_masks + 1])), [int(batch_size), int(img_height), int(img_width), num_masks + 1]) mask_list = tf.split(axis=3, num_or_size_splits=num_masks + 1, value=masks) output = mask_list[0] * prev_image for layer, mask in zip(transformed, mask_list[1:]): output += layer * mask gen_images.append(output) current_state = slim.layers.fully_connected( state_action, int(current_state.get_shape()[1]), scope='state_pred', activation_fn=None) gen_states.append(current_state) return gen_images, gen_states
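# Sketch (not part of the original files): dna_transformation is referenced
# throughout but not defined in this excerpt. One equivalent formulation of
# per-pixel kernel advection gathers the KxK neighborhood of every pixel with
# tf.extract_image_patches and takes the kernel-weighted sum. Softmax is used
# here for normalization; the published code normalizes shifted ReLU outputs
# instead, but the structure is the same.
import tensorflow as tf

def dna_transformation_sketch(prev_image, dna_input, kern_size=5):
    """Apply per-pixel KxK kernels predicted by dna_input to prev_image."""
    batch, height, width, channels = [int(d) for d in prev_image.get_shape()]
    pad = kern_size // 2
    padded = tf.pad(prev_image, [[0, 0], [pad, pad], [pad, pad], [0, 0]])
    # [batch, h, w, k*k, c]: the KxK neighborhood around every output pixel
    patches = tf.extract_image_patches(
        padded, ksizes=[1, kern_size, kern_size, 1],
        strides=[1, 1, 1, 1], rates=[1, 1, 1, 1], padding='VALID')
    patches = tf.reshape(patches,
                         [batch, height, width, kern_size * kern_size, channels])
    kernels = tf.nn.softmax(
        tf.reshape(dna_input, [batch, height, width, kern_size * kern_size]))
    return tf.reduce_sum(patches * tf.expand_dims(kernels, -1), axis=3)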
def construct_model(images, actions=None, states=None, iter_num=-1.0, k=-1, use_state=True, num_masks=10, stp=False, cdna=True, dna=False, context_frames=2, pix_distributions=None, conf=None): """Build convolutional lstm video predictor using STP, CDNA, or DNA. Args: images: tensor of ground truth image sequences actions: tensor of action sequences states: tensor of ground truth state sequences iter_num: tensor of the current training iteration (for sched. sampling) k: constant used for scheduled sampling. -1 to feed in own prediction. use_state: True to include state and action in prediction num_masks: the number of different pixel motion predictions (and the number of masks for each of those predictions) stp: True to use Spatial Transformer Predictor (STP) cdna: True to use Convolutional Dynamic Neural Advection (CDNA) dna: True to use Dynamic Neural Advection (DNA) context_frames: number of ground truth frames to pass in before feeding in own predictions pix_distributions: the initial one-hot distribution for designated pixels Returns: gen_images: predicted future image frames gen_states: predicted future states Raises: ValueError: if more than one network option specified or more than 1 mask specified for DNA model. """ if 'dna_size' in conf: DNA_KERN_SIZE = conf['dna_size'] else: DNA_KERN_SIZE = 5 print('constructing network with less layers...') if stp + cdna + dna != 1: raise ValueError('More than one, or no network option specified.') batch_size, img_height, img_width, color_channels = images[0].get_shape( )[0:4] batch_size = int(batch_size) lstm_func = basic_conv_lstm_cell # Generated robot states and images. gen_states, gen_images, gen_masks, inf_low_state, pred_low_state = [], [], [], [], [] current_state = states[0] gen_pix_distrib = [] summaries = [] if k == -1: feedself = True else: # Scheduled sampling: # Calculate number of ground-truth frames to pass in. num_ground_truth = tf.to_int32( tf.round( tf.to_float(batch_size) * (k / (k + tf.exp(iter_num / k))))) feedself = False # LSTM state sizes and states. lstm_size = np.int32(np.array([16, 32, 64, 100, 10])) lstm_state1, lstm_state2, lstm_state3 = None, None, None single_lstm1 = BasicLSTMCell(lstm_size[3], state_is_tuple=True) single_lstm2 = BasicLSTMCell(lstm_size[4], state_is_tuple=True) low_dim_lstm = MultiRNNCell([single_lstm1, single_lstm2], state_is_tuple=True) low_dim_lstm_state = low_dim_lstm.zero_state(batch_size, tf.float32) dim_low_state = int(lstm_size[-1]) t = -1 for image, action in zip(images[:-1], actions[:-1]): t += 1 print('building timestep', t) # Reuse variables after the first timestep. reuse = bool(gen_images) done_warm_start = len(gen_images) > context_frames - 1 with slim.arg_scope([ lstm_func, slim.layers.conv2d, slim.layers.fully_connected, tf_layers.layer_norm, slim.layers.conv2d_transpose ], reuse=reuse): if feedself and done_warm_start: # Feed in generated image.
prev_image = gen_images[-1] if pix_distributions is not None: prev_pix_distrib = gen_pix_distrib[-1] elif done_warm_start: # Scheduled sampling prev_image = scheduled_sample(image, gen_images[-1], batch_size, num_ground_truth) else: # Always feed in ground_truth prev_image = image if pix_distributions is not None: prev_pix_distrib = pix_distributions[t] prev_pix_distrib = tf.expand_dims(prev_pix_distrib, -1) # Predicted state is always fed back in state_action = tf.concat(axis=1, values=[action, current_state]) # 6x enc0 = slim.layers.conv2d( #32x32x32 prev_image, 32, kernel_size=[5, 5], stride=2, scope='scale1_conv1', normalizer_fn=tf_layers.layer_norm, normalizer_params={'scope': 'layer_norm1'}) hidden1, lstm_state1 = lstm_func( #32x32 enc0, lstm_state1, lstm_size[0], scope='state1') hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2') enc1 = slim.layers.conv2d( #16x16 hidden1, hidden1.get_shape()[3], [3, 3], stride=2, scope='conv2') hidden2, lstm_state2 = lstm_func( #16x16x32 enc1, lstm_state2, lstm_size[1], scope='state3') hidden2 = tf_layers.layer_norm(hidden2, scope='layer_norm4') enc2 = slim.layers.conv2d( #8x8x32 hidden2, hidden2.get_shape()[3], [3, 3], stride=2, scope='conv3') # Pass in state and action. smear = tf.reshape( state_action, [batch_size, 1, 1, int(state_action.get_shape()[1])]) smear = tf.tile( #8x8x6 smear, [1, int(enc2.get_shape()[1]), int(enc2.get_shape()[2]), 1]) if use_state: enc2 = tf.concat(axis=3, values=[enc2, smear]) enc3 = slim.layers.conv2d( #8x8x32 enc2, hidden2.get_shape()[3], [1, 1], stride=1, scope='conv4') hidden3, lstm_state3 = lstm_func( #8x8x64 enc3, lstm_state3, lstm_size[2], scope='state5') # last 8x8 hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm6') enc3 = slim.layers.conv2d( # 8x8x32 hidden3, 16, [1, 1], stride=1, scope='conv5') enc3_flat = tf.reshape(enc3, [batch_size, -1]) if 'use_low_dim_lstm' in conf: with tf.variable_scope('low_dim_lstm', reuse=reuse): hidden4, low_dim_lstm_state = low_dim_lstm( enc3_flat, low_dim_lstm_state) low_dim_state = hidden4 else: enc_fully1 = slim.layers.fully_connected(enc3_flat, 400, scope='enc_fully1') enc_fully2 = slim.layers.fully_connected(enc_fully1, 100, scope='enc_fully2') low_dim_state = enc_fully2 # inferred low dimensional state: inf_low_state.append(low_dim_state) pred_low_state.append(project_fwd_lowdim(low_dim_state)) smear = tf.reshape(low_dim_state, [batch_size, 1, 1, dim_low_state]) smear = tf.tile( # 8x8xdim_hidden_state smear, [1, int(enc2.get_shape()[1]), int(enc2.get_shape()[2]), 1]) enc4 = slim.layers.conv2d_transpose( #16x16x32 smear, hidden3.get_shape()[3], 3, stride=2, scope='convt1') enc5 = slim.layers.conv2d_transpose( #32x32x32 enc4, enc0.get_shape()[3], 3, stride=2, scope='convt2') enc6 = slim.layers.conv2d_transpose( #64x64x16 enc5, 16, 3, stride=2, scope='convt3', normalizer_fn=tf_layers.layer_norm, normalizer_params={'scope': 'layer_norm9'}) # Using largest hidden state for predicting untied conv kernels. enc7 = slim.layers.conv2d_transpose(enc6, DNA_KERN_SIZE**2, 1, stride=1, scope='convt4') # Only one mask is supported (more should be unnecessary).
if num_masks != 1: raise ValueError('Only one mask is supported for DNA model.') transformed = [dna_transformation(prev_image, enc7, DNA_KERN_SIZE)] if 'use_masks' in conf: masks = slim.layers.conv2d_transpose(enc6, num_masks + 1, 1, stride=1, scope='convt7') masks = tf.reshape( tf.nn.softmax(tf.reshape(masks, [-1, num_masks + 1])), [ int(batch_size), int(img_height), int(img_width), num_masks + 1 ]) mask_list = tf.split(axis=3, num_or_size_splits=num_masks + 1, value=masks) output = mask_list[0] * prev_image for layer, mask in zip(transformed, mask_list[1:]): output += layer * mask else: mask_list = None [output] = transformed gen_images.append(output) gen_masks.append(mask_list) if dna and pix_distributions is not None: transf_distrib = [ dna_transformation(prev_pix_distrib, enc7, DNA_KERN_SIZE) ] if pix_distributions is not None: pix_distrib_output = mask_list[0] * prev_pix_distrib mult_list = [] for i in range(num_masks): mult_list.append(transf_distrib[i] * mask_list[i + 1]) pix_distrib_output += mult_list[i] gen_pix_distrib.append(pix_distrib_output) # pred_low_state_stopped = tf.stop_gradient(pred_low_state) state_enc1 = slim.layers.fully_connected( # pred_low_state[-1], low_dim_state, 100, scope='state_enc1') state_enc2 = slim.layers.fully_connected( state_enc1, # int(current_state.get_shape()[1]), 4, scope='state_enc2', activation_fn=None) current_state = tf.squeeze(state_enc2) gen_states.append(current_state) if pix_distributions is not None: return gen_images, gen_states, gen_masks, gen_pix_distrib, inf_low_state, pred_low_state else: return gen_images, gen_states, gen_masks, None, inf_low_state, pred_low_state
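# Sketch (not part of the original files): propagating the designated-pixel
# distribution reuses the same masks as the image compositing above. Some
# variants additionally renormalize so the result stays a probability
# distribution over pixels; that step is an assumption here.
import tensorflow as tf

def fuse_pix_distrib_sketch(mask_list, prev_pix_distrib, transf_distrib):
    """Composite a pixel distribution with the masks used for the image."""
    out = mask_list[0] * prev_pix_distrib
    for transformed, mask in zip(transf_distrib, mask_list[1:]):
        out += transformed * mask
    # keep it a distribution (assumed; not all variants renormalize)
    out /= tf.reduce_sum(out, axis=[1, 2], keep_dims=True)
    return out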
def decoder_model(hidden_repr, sequence_length, initializer, num_channels=3, scope='decoder', fc_conv_layer=False): """ Args: hidden_repr: Tensor of latent space representation sequence_length: number of frames that shall be decoded from the hidden_repr num_channels: number of channels for generated frames initializer: specifies the initialization type (default: contrib.slim.layers uses Xavier init with uniform data) fc_conv_layer: adds a fully convolutional layer at the start of the decoder Returns: frame_gen: list of generated frames (Tensors) fc_conv_layer: indicates whether hidden_repr is a 1x1xdepth tensor and a fully convolutional layer shall be added """ frame_gen = [] lstm_state1, lstm_state2, lstm_state3, lstm_state4, lstm_state5, lstm_state0 = None, None, None, None, None, None assert (not fc_conv_layer) or (hidden_repr.get_shape()[1] == hidden_repr.get_shape()[2] == 1) for i in range(sequence_length): reuse = (i > 0) #reuse variables (recurrence) after first time step with tf.variable_scope(scope, reuse=reuse): #Fully Convolutional Layer (1x1xFC_LAYER_SIZE -> 4x4x16) hidden0, lstm_state0 = basic_conv_lstm_cell(hidden_repr, lstm_state0, FC_LAYER_SIZE, initializer, filter_size=1, scope='convlstm0') fc_conv = slim.layers.conv2d_transpose(hidden0, 32, [4, 4], stride=1, scope='fc_conv', padding='VALID', weights_initializer=initializer) #LAYER 1: convLSTM1 hidden1, lstm_state1 = basic_conv_lstm_cell(fc_conv, lstm_state1, 32, initializer, filter_size=3, scope='convlstm1') hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm1') #LAYER 2: upconv1 (8x8 -> 16x16) upconv1 = slim.layers.conv2d_transpose(hidden1, hidden1.get_shape()[3], 3, stride=2, scope='upconv1', weights_initializer=initializer, normalizer_fn=tf_layers.layer_norm, normalizer_params={'scope': 'layer_norm2'}) #LAYER 3: convLSTM2 hidden2, lstm_state2 = basic_conv_lstm_cell(upconv1, lstm_state2, 32, initializer, filter_size=3, scope='convlstm2') hidden2 = tf_layers.layer_norm(hidden2, scope='layer_norm3') #LAYER 4: upconv2 (16x16 -> 32x32) upconv2 = slim.layers.conv2d_transpose(hidden2, hidden2.get_shape()[3], 3, stride=2, scope='upconv2', weights_initializer=initializer, normalizer_fn=tf_layers.layer_norm, normalizer_params={'scope': 'layer_norm4'}) #LAYER 5: convLSTM3 hidden3, lstm_state3 = basic_conv_lstm_cell(upconv2, lstm_state3, 16, initializer, filter_size=3, scope='convlstm3') hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm5') # LAYER 6: upconv3 (32x32 -> 64x64) upconv3 = slim.layers.conv2d_transpose(hidden3, hidden3.get_shape()[3], 5, stride=2, scope='upconv3', weights_initializer=initializer, normalizer_fn=tf_layers.layer_norm, normalizer_params={'scope': 'layer_norm6'}) #LAYER 7: convLSTM4 hidden4, lstm_state4 = basic_conv_lstm_cell(upconv3, lstm_state4, 16, initializer, filter_size=5, scope='convlstm4') hidden4 = tf_layers.layer_norm(hidden4, scope='layer_norm7') #Layer 8: upconv4 (64x64 -> 128x128) upconv4 = slim.layers.conv2d_transpose(hidden4, 16, 5, stride=2, scope='upconv4', normalizer_fn=tf_layers.layer_norm, weights_initializer=initializer, normalizer_params={'scope': 'layer_norm8'}) #LAYER 9: convLSTM5 hidden5, lstm_state5 = basic_conv_lstm_cell(upconv4, lstm_state5, 16, initializer, filter_size=5, scope='convlstm5') hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm9') upconv5 = slim.layers.conv2d_transpose(hidden5, num_channels, 5, stride=2, scope='upconv5', weights_initializer=initializer) frame_gen.append(upconv5) assert len(frame_gen) == sequence_length return frame_gen
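# A minimal usage sketch for decoder_model, under assumed shapes (a 1x1 spatial
# latent of depth 1024 and a batch of 16; with the fc_conv layer and the five
# stride-2 transposed convolutions above, the decoded frames come out at 128x128):
import tensorflow as tf

hidden_repr = tf.placeholder(tf.float32, [16, 1, 1, 1024])
frames = decoder_model(hidden_repr, sequence_length=10,
                       initializer=tf.contrib.layers.xavier_initializer(),
                       num_channels=3, fc_conv_layer=True)
# frames is a list of 10 tensors, each of shape [16, 128, 128, 3]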
def construct_model(images, actions=None, states=None, iter_num=-1.0, k=-1, use_state=True, num_masks=10, stp=False, cdna=True, dna=False, context_frames=2, conf=None): if 'dna_size' in conf: DNA_KERN_SIZE = conf['dna_size'] else: DNA_KERN_SIZE = 5 print('constructing network with less layers...') if stp + cdna + dna != 1: raise ValueError('More than one, or no network option specified.') batch_size, img_height, img_width, color_channels = images[0].get_shape( )[0:4] lstm_func = basic_conv_lstm_cell # Generated robot states and images. gen_states, gen_images, gen_masks, gen_poses = [], [], [], [] summaries = [] if k == -1: feedself = True else: # Scheduled sampling: # Calculate number of ground-truth frames to pass in. num_ground_truth = tf.to_int32( tf.round( tf.to_float(batch_size) * (k / (k + tf.exp(iter_num / k))))) feedself = False # LSTM state sizes and states. if 'lstm_size' in conf: lstm_size = conf['lstm_size'] else: lstm_size = np.int32(np.array([16, 16, 32, 32, 64, 32, 16])) lstm_state1, lstm_state2, lstm_state3, lstm_state4 = None, None, None, None lstm_state5, lstm_state6, lstm_state7 = None, None, None t = -1 for image, action, state in zip(images[:-1], actions[:-1], states[:-1]): t += 1 # Reuse variables after the first timestep. reuse = bool(gen_images) done_warm_start = len(gen_images) > context_frames - 1 with slim.arg_scope([ lstm_func, slim.layers.conv2d, slim.layers.fully_connected, tf_layers.layer_norm, slim.layers.conv2d_transpose ], reuse=reuse): if feedself and done_warm_start: # Feed in generated image. prev_image = gen_images[-1] prev_state = gen_states[-1] elif done_warm_start: # Scheduled sampling prev_image = scheduled_sample(image, gen_images[-1], batch_size, num_ground_truth) prev_image = tf.reshape(prev_image, [conf['batch_size'], 64, 64, 3]) prev_state = scheduled_sample(state, gen_states[-1], batch_size, num_ground_truth) prev_state = tf.reshape(prev_state, [conf['batch_size'], 4]) else: # Always feed in ground_truth prev_image = image prev_state = state if 'transform_from_firstimage' in conf: assert stp if t > 1: prev_image = images[1] print('using image 1') enc0 = slim.layers.conv2d( #32x32x32 prev_image, 32, [5, 5], stride=2, scope='scale1_conv1', normalizer_fn=tf_layers.layer_norm, normalizer_params={'scope': 'layer_norm1'}) hidden1, lstm_state1 = lstm_func( # 32x32x16 enc0, lstm_state1, lstm_size[0], scope='state1') hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2') enc1 = slim.layers.conv2d( # 16x16x16 hidden1, hidden1.get_shape()[3], [3, 3], stride=2, scope='conv2') hidden3, lstm_state3 = lstm_func( #16x16x32 enc1, lstm_state3, lstm_size[2], scope='state3') hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4') enc2 = slim.layers.conv2d( #8x8x32 hidden3, hidden3.get_shape()[3], [3, 3], stride=2, scope='conv3') # Pass in state and action.
# Predicted state is always fed back in state_action = tf.concat(axis=1, values=[action, prev_state]) smear = tf.reshape( state_action, [int(batch_size), 1, 1, int(state_action.get_shape()[1])]) smear = tf.tile( smear, [1, int(enc2.get_shape()[1]), int(enc2.get_shape()[2]), 1]) if use_state: enc2 = tf.concat(axis=3, values=[enc2, smear]) enc3 = slim.layers.conv2d( #8x8x32 enc2, hidden3.get_shape()[3], [1, 1], stride=1, scope='conv4') hidden5, lstm_state5 = lstm_func( #8x8x64 enc3, lstm_state5, lstm_size[4], scope='state5') hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm6') enc4 = slim.layers.conv2d_transpose( #16x16x64 hidden5, hidden5.get_shape()[3], 3, stride=2, scope='convt1') hidden6, lstm_state6 = lstm_func( #16x16x32 enc4, lstm_state6, lstm_size[5], scope='state6') hidden6 = tf_layers.layer_norm(hidden6, scope='layer_norm7') if 'noskip' not in conf: # Skip connection. hidden6 = tf.concat(axis=3, values=[hidden6, enc1]) # both 16x16 enc5 = slim.layers.conv2d_transpose( #32x32x32 hidden6, hidden6.get_shape()[3], 3, stride=2, scope='convt2') hidden7, lstm_state7 = lstm_func( # 32x32x16 enc5, lstm_state7, lstm_size[6], scope='state7') hidden7 = tf_layers.layer_norm(hidden7, scope='layer_norm8') if 'noskip' not in conf: # Skip connection. hidden7 = tf.concat(axis=3, values=[hidden7, enc0]) # both 32x32 enc6 = slim.layers.conv2d_transpose( # 64x64x16 hidden7, hidden7.get_shape()[3], 3, stride=2, scope='convt3', normalizer_fn=tf_layers.layer_norm, normalizer_params={'scope': 'layer_norm9'}) if dna: # Using largest hidden state for predicting untied conv kernels. enc7 = slim.layers.conv2d_transpose(enc6, DNA_KERN_SIZE**2, 1, stride=1, scope='convt4') else: # Using largest hidden state for predicting a new image layer. enc7 = slim.layers.conv2d_transpose(enc6, color_channels, 1, stride=1, scope='convt4') # This allows the network to also generate one image from scratch, # which is useful when regions of the image become unoccluded. transformed = [tf.nn.sigmoid(enc7)] if stp: stp_input0 = tf.reshape(hidden5, [int(batch_size), -1]) stp_input1 = slim.layers.fully_connected(stp_input0, 100, scope='fc_stp') # disabling capability to generate pixels reuse_stp = None if reuse: reuse_stp = reuse transformed = stp_transformation(prev_image, stp_input1, num_masks, reuse_stp) # transformed += stp_transformation(prev_image, stp_input1, num_masks) elif dna: # Only one mask is supported (more should be unnecessary). if num_masks != 1: raise ValueError( 'Only one mask is supported for DNA model.') transformed = [ dna_transformation(prev_image, enc7, DNA_KERN_SIZE) ] masks = slim.layers.conv2d_transpose(enc6, num_masks + 1, 1, stride=1, scope='convt7') masks = tf.reshape( tf.nn.softmax(tf.reshape(masks, [-1, num_masks + 1])), [ int(batch_size), int(img_height), int(img_width), num_masks + 1 ]) mask_list = tf.split(axis=3, num_or_size_splits=num_masks + 1, value=masks) output = mask_list[0] * prev_image for layer, mask in zip(transformed, mask_list[1:]): output += layer * mask gen_images.append(output) gen_masks.append(mask_list) next_state, next_pose = predict_next_low_dim( conf, hidden7, enc0, state_action) gen_states.append(next_state) gen_poses.append(next_pose) return gen_images, gen_states, gen_poses
def encoder_decoder_fn(self, action, batch_size, input_image, lstm_func, lstm_size, lstm_states, state_action): """ :return: enc6: the representation use to construct the masks hidden5: the representation use to construct the CDNA kernels lstm_states: hidden lstm states """ lstm_state1, lstm_state2, lstm_state3, lstm_state4, lstm_state5, lstm_state6, lstm_state7 = lstm_states enc0 = slim.layers.conv2d( # 32x32x32 input_image, 32, [5, 5], stride=2, scope='scale1_conv1', normalizer_fn=tf_layers.layer_norm, normalizer_params={'scope': 'layer_norm1'}) hidden1, lstm_state1 = lstm_func( # 32x32x16 enc0, lstm_state1, lstm_size[0], scope='state1') hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2') enc1 = slim.layers.conv2d( # 16x16x16 hidden1, hidden1.get_shape()[3], [3, 3], stride=2, scope='conv2') hidden3, lstm_state3 = lstm_func( # 16x16x32 enc1, lstm_state3, lstm_size[1], scope='state3') hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4') enc2 = slim.layers.conv2d( # 8x8x32 hidden3, hidden3.get_shape()[3], [3, 3], stride=2, scope='conv3') if 'ignore_state_action' not in self.conf: # Pass in state and action. if 'ignore_state' in self.conf: lowdim = action print('ignoring state') else: lowdim = state_action smear = tf.reshape( lowdim, [int(batch_size), 1, 1, int(lowdim.get_shape()[1])]) smear = tf.tile( smear, [1, int(enc2.get_shape()[1]), int(enc2.get_shape()[2]), 1]) enc2 = tf.concat(axis=3, values=[enc2, smear]) else: print('ignoring states and actions') enc3 = slim.layers.conv2d( # 8x8x32 enc2, hidden3.get_shape()[3], [1, 1], stride=1, scope='conv4') hidden5, lstm_state5 = lstm_func( # 8x8x64 enc3, lstm_state5, lstm_size[2], scope='state5') hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm6') enc4 = slim.layers.conv2d_transpose( # 16x16x64 hidden5, hidden5.get_shape()[3], 3, stride=2, scope='convt1') hidden6, lstm_state6 = lstm_func( # 16x16x32 enc4, lstm_state6, lstm_size[3], scope='state6') hidden6 = tf_layers.layer_norm(hidden6, scope='layer_norm7') if 'noskip' not in self.conf: # Skip connection. hidden6 = tf.concat(axis=3, values=[hidden6, enc1]) # both 16x16 enc5 = slim.layers.conv2d_transpose( # 32x32x32 hidden6, hidden6.get_shape()[3], 3, stride=2, scope='convt2') hidden7, lstm_state7 = lstm_func( # 32x32x16 enc5, lstm_state7, lstm_size[4], scope='state7') hidden7 = tf_layers.layer_norm(hidden7, scope='layer_norm8') if 'noskip' not in self.conf: # Skip connection. hidden7 = tf.concat(axis=3, values=[hidden7, enc0]) # both 32x32 enc6 = slim.layers.conv2d_transpose( # 64x64x16 hidden7, hidden7.get_shape()[3], 3, stride=2, scope='convt3', normalizer_fn=tf_layers.layer_norm, normalizer_params={'scope': 'layer_norm9'}) lstm_states = lstm_state1, lstm_state2, lstm_state3, lstm_state4, lstm_state5, lstm_state6, lstm_state7 return enc6, hidden5, lstm_states
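# A usage sketch for encoder_decoder_fn (names other than encoder_decoder_fn and
# basic_conv_lstm_cell are illustrative assumptions): the tuple of seven ConvLSTM
# states is threaded through time by the caller, which is also responsible for
# variable reuse after the first step.
lstm_states = (None,) * 7  # one slot per ConvLSTM cell
for t in range(num_timesteps):
    with slim.arg_scope(
            [basic_conv_lstm_cell, slim.layers.conv2d,
             slim.layers.fully_connected, tf_layers.layer_norm,
             slim.layers.conv2d_transpose], reuse=bool(t)):
        enc6, hidden5, lstm_states = model.encoder_decoder_fn(
            actions[t], batch_size, input_images[t], basic_conv_lstm_cell,
            lstm_size, lstm_states, state_actions[t])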
def forward(images, index, dna, cdna, num_masks=10, reuse=None): stime = time.time() batch_size, img_height, img_width = images[0].get_shape()[0:3] lstm_func = basic_conv_lstm_cell # Generated robot states and images. gen_images = [] lstm_size = np.int32(np.array([32, 32, 64, 64, 128, 64, 32])) lstm_state1, lstm_state2, lstm_state3, lstm_state4 = None, None, None, None lstm_state5, lstm_state6, lstm_state7 = None, None, None for i in range(len(images)): # Reuse variables after the first timestep. if i > 0: reuse = True with slim.arg_scope( [lstm_func, slim.layers.conv2d, slim.layers.fully_connected, tf_layers.layer_norm, slim.layers.conv2d_transpose], reuse=reuse): if i > index: prev_image = tf.reshape(gen_images[-1], [batch_size, img_height, img_width, 1]) else: prev_image = tf.reshape(images[i], [batch_size, img_height, img_width, 1]) enc0 = slim.layers.conv2d( prev_image, 32, 5, stride=2, scope='scale1_conv1', normalizer_fn=tf_layers.layer_norm, normalizer_params={'scope': 'layer_norm1'}) hidden1, lstm_state1 = lstm_func( enc0, lstm_state1, lstm_size[0], scope='state1') hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2') hidden2, lstm_state2 = lstm_func( hidden1, lstm_state2, lstm_size[1], scope='state2') hidden2 = tf_layers.layer_norm(hidden2, scope='layer_norm3') enc1 = slim.layers.conv2d( hidden2, hidden2.get_shape()[3], [3, 3], stride=2, scope='conv2') hidden3, lstm_state3 = lstm_func( enc1, lstm_state3, lstm_size[2], scope='state3') hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4') hidden4, lstm_state4 = lstm_func( hidden3, lstm_state4, lstm_size[3], scope='state4') hidden4 = tf_layers.layer_norm(hidden4, scope='layer_norm5') enc2 = slim.layers.conv2d( hidden4, hidden4.get_shape()[3], [3, 3], stride=2, scope='conv3') enc3 = slim.layers.conv2d( enc2, hidden4.get_shape()[3], [1, 1], stride=1, scope='conv4') hidden5, lstm_state5 = lstm_func( enc3, lstm_state5, lstm_size[4], scope='state5') # last 8x8 hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm6') enc4 = slim.layers.conv2d_transpose( hidden5, hidden5.get_shape()[3], 3, stride=2, scope='convt1') hidden6, lstm_state6 = lstm_func( enc4, lstm_state6, lstm_size[5], scope='state6') # 16x16 hidden6 = tf_layers.layer_norm(hidden6, scope='layer_norm7') # Skip connection. hidden6 = tf.concat(axis=3, values=[hidden6, enc1]) # both 16x16 enc5 = slim.layers.conv2d_transpose( hidden6, hidden6.get_shape()[3], 3, stride=2, scope='convt2') hidden7, lstm_state7 = lstm_func( enc5, lstm_state7, lstm_size[6], scope='state7') # 32x32 hidden7 = tf_layers.layer_norm(hidden7, scope='layer_norm8') # Skip connection. hidden7 = tf.concat(axis=3, values=[hidden7, enc0]) # both 32x32 enc6 = slim.layers.conv2d_transpose( hidden7, hidden7.get_shape()[3], 3, stride=2, scope='convt3', normalizer_fn=tf_layers.layer_norm, normalizer_params={'scope': 'layer_norm9'}) if dna: # Using largest hidden state for predicting untied conv kernels. enc7 = slim.layers.conv2d_transpose( enc6, DNA_KERN_SIZE ** 2, 1, stride=1, scope='convt4') else: # Using largest hidden state for predicting a new image layer. enc7 = slim.layers.conv2d_transpose( enc6, 1, 1, stride=1, scope='convt4') # This allows the network to also generate one image from scratch, # which is useful when regions of the image become unoccluded.
transformed = [tf.nn.sigmoid(enc7)] if cdna: cdna_input = tf.reshape(hidden5, [int(batch_size), -1]) transformed += cdna_transformation(prev_image, cdna_input, num_masks, 1) elif dna: # Only one mask is supported (more should be unnecessary). if num_masks != 1: raise ValueError('Only one mask is supported for DNA model.') transformed = [dna_transformation(prev_image, enc7)] masks = slim.layers.conv2d_transpose( enc6, num_masks + 1, 1, stride=1, scope='convt7') masks = tf.reshape( tf.nn.softmax(tf.reshape(masks, [-1, num_masks + 1])), [int(batch_size), int(img_height), int(img_width), num_masks + 1]) mask_list = tf.split(axis=3, num_or_size_splits=num_masks + 1, value=masks) output = mask_list[0] * prev_image for layer, mask in zip(transformed, mask_list[1:]): output += layer * mask if i > index - 1: gen_images.append(output) print(time.time() - stime) return gen_images
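# Sketch (not part of the original files): cdna_transformation is called above
# but not shown. A version consistent with its call signature, closely following
# the published CDNA formulation: a fully connected layer predicts num_masks KxK
# kernels per example, the kernels are made positive and normalized to sum to
# one, and a single depthwise convolution applies each example's kernels to its
# own image. The scope name 'cdna_params' is an assumption.
import tensorflow as tf
import tensorflow.contrib.slim as slim

def cdna_transformation_sketch(prev_image, cdna_input, num_masks,
                               color_channels, kern_size=5):
    """Predict num_masks KxK kernels from cdna_input and apply them to prev_image."""
    batch_size = int(cdna_input.get_shape()[0])
    height = int(prev_image.get_shape()[1])
    width = int(prev_image.get_shape()[2])
    relu_shift = 1e-12
    kerns = slim.layers.fully_connected(
        cdna_input, kern_size * kern_size * num_masks,
        scope='cdna_params', activation_fn=None)
    kerns = tf.reshape(kerns, [batch_size, kern_size, kern_size, 1, num_masks])
    kerns = tf.nn.relu(kerns - relu_shift) + relu_shift        # strictly positive
    kerns /= tf.reduce_sum(kerns, [1, 2, 3], keep_dims=True)   # sum to one
    # Fold the batch into the channel axis so one depthwise conv applies a
    # different kernel set to every example (and the same one to every color).
    kerns = tf.transpose(kerns, [1, 2, 0, 4, 3])
    kerns = tf.reshape(kerns, [kern_size, kern_size, batch_size, num_masks])
    image = tf.transpose(prev_image, [3, 1, 2, 0])             # [color, h, w, batch]
    transformed = tf.nn.depthwise_conv2d(image, kerns, [1, 1, 1, 1], 'SAME')
    transformed = tf.reshape(
        transformed, [color_channels, height, width, batch_size, num_masks])
    transformed = tf.transpose(transformed, [3, 1, 2, 0, 4])
    return tf.unstack(transformed, axis=-1)                    # num_masks images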
def construct_model(images, actions=None, states=None, iter_num=-1.0, k=-1, use_state=True, num_masks=10, stp=False, cdna=True, dna=False, context_frames=2, pix_distributions=None, conf=None): """Build convolutional lstm video predictor using STP, CDNA, or DNA. Args: images: tensor of ground truth image sequences actions: tensor of action sequences states: tensor of ground truth state sequences iter_num: tensor of the current training iteration (for sched. sampling) k: constant used for scheduled sampling. -1 to feed in own prediction. use_state: True to include state and action in prediction num_masks: the number of different pixel motion predictions (and the number of masks for each of those predictions) stp: True to use Spatial Transformer Predictor (STP) cdna: True to use Convolutional Dynamic Neural Advection (CDNA) dna: True to use Dynamic Neural Advection (DNA) context_frames: number of ground truth frames to pass in before feeding in own predictions pix_distributions: the initial one-hot distribution for designated pixels Returns: gen_images: predicted future image frames gen_states: predicted future states Raises: ValueError: if more than one network option specified or more than 1 mask specified for DNA model. """ if 'dna_size' in conf: DNA_KERN_SIZE = conf['dna_size'] else: DNA_KERN_SIZE = 5 print('constructing network with less layers...') if stp + cdna + dna != 1: raise ValueError('More than one, or no network option specified.') batch_size, img_height, img_width, color_channels = images[0].get_shape( )[0:4] lstm_func = basic_conv_lstm_cell # Generated robot states and images. gen_states, gen_images, gen_masks = [], [], [] current_state = states[0] gen_pix_distrib = [] summaries = [] if k == -1: feedself = True else: # Scheduled sampling: # Calculate number of ground-truth frames to pass in. num_ground_truth = tf.to_int32( tf.round( tf.to_float(batch_size) * (k / (k + tf.exp(iter_num / k))))) feedself = False # LSTM state sizes and states. if 'lstm_size' in conf: lstm_size = conf['lstm_size'] else: lstm_size = np.int32(np.array([16, 16, 32, 32, 64, 32, 16])) lstm_state1, lstm_state2, lstm_state3, lstm_state4 = None, None, None, None lstm_state5, lstm_state6, lstm_state7 = None, None, None t = -1 for image, action in zip(images[:-1], actions[:-1]): t += 1 # Reuse variables after the first timestep. reuse = bool(gen_images) done_warm_start = len(gen_images) > context_frames - 1 with slim.arg_scope([ lstm_func, slim.layers.conv2d, slim.layers.fully_connected, tf_layers.layer_norm, slim.layers.conv2d_transpose ], reuse=reuse): if feedself and done_warm_start: # Feed in generated image.
prev_image = gen_images[-1] if pix_distributions is not None: prev_pix_distrib = gen_pix_distrib[-1] elif done_warm_start: # Scheduled sampling prev_image = scheduled_sample(image, gen_images[-1], batch_size, num_ground_truth) else: # Always feed in ground_truth prev_image = image if pix_distributions is not None: prev_pix_distrib = pix_distributions[t] prev_pix_distrib = tf.expand_dims(prev_pix_distrib, -1) if 'transform_from_firstimage' in conf: assert stp if t > 1: prev_image = images[1] print('using image 1') # Predicted state is always fed back in state_action = tf.concat(axis=1, values=[action, current_state]) enc0 = slim.layers.conv2d( #32x32x32 prev_image, 32, [5, 5], stride=2, scope='scale1_conv1', normalizer_fn=tf_layers.layer_norm, normalizer_params={'scope': 'layer_norm1'}) hidden1, lstm_state1 = lstm_func( # 32x32x16 enc0, lstm_state1, lstm_size[0], scope='state1') hidden1 = tf_layers.layer_norm(hidden1, scope='layer_norm2') # hidden2, lstm_state2 = lstm_func( # hidden1, lstm_state2, lstm_size[1], scope='state2') # hidden2 = tf_layers.layer_norm(hidden2, scope='layer_norm3') enc1 = slim.layers.conv2d( # 16x16x16 hidden1, hidden1.get_shape()[3], [3, 3], stride=2, scope='conv2') hidden3, lstm_state3 = lstm_func( #16x16x32 enc1, lstm_state3, lstm_size[2], scope='state3') hidden3 = tf_layers.layer_norm(hidden3, scope='layer_norm4') # hidden4, lstm_state4 = lstm_func( # hidden3, lstm_state4, lstm_size[3], scope='state4') # hidden4 = tf_layers.layer_norm(hidden4, scope='layer_norm5') enc2 = slim.layers.conv2d( #8x8x32 hidden3, hidden3.get_shape()[3], [3, 3], stride=2, scope='conv3') # Pass in state and action. smear = tf.reshape( state_action, [int(batch_size), 1, 1, int(state_action.get_shape()[1])]) smear = tf.tile( smear, [1, int(enc2.get_shape()[1]), int(enc2.get_shape()[2]), 1]) if use_state: enc2 = tf.concat(axis=3, values=[enc2, smear]) enc3 = slim.layers.conv2d( #8x8x32 enc2, hidden3.get_shape()[3], [1, 1], stride=1, scope='conv4') hidden5, lstm_state5 = lstm_func( #8x8x64 enc3, lstm_state5, lstm_size[4], scope='state5') hidden5 = tf_layers.layer_norm(hidden5, scope='layer_norm6') enc4 = slim.layers.conv2d_transpose( #16x16x64 hidden5, hidden5.get_shape()[3], 3, stride=2, scope='convt1') hidden6, lstm_state6 = lstm_func( #16x16x32 enc4, lstm_state6, lstm_size[5], scope='state6') hidden6 = tf_layers.layer_norm(hidden6, scope='layer_norm7') if 'noskip' not in conf: # Skip connection. hidden6 = tf.concat(axis=3, values=[hidden6, enc1]) # both 16x16 enc5 = slim.layers.conv2d_transpose( #32x32x32 hidden6, hidden6.get_shape()[3], 3, stride=2, scope='convt2') hidden7, lstm_state7 = lstm_func( # 32x32x16 enc5, lstm_state7, lstm_size[6], scope='state7') hidden7 = tf_layers.layer_norm(hidden7, scope='layer_norm8') if 'noskip' not in conf: # Skip connection. hidden7 = tf.concat(axis=3, values=[hidden7, enc0]) # both 32x32 enc6 = slim.layers.conv2d_transpose( # 64x64x16 hidden7, hidden7.get_shape()[3], 3, stride=2, scope='convt3', normalizer_fn=tf_layers.layer_norm, normalizer_params={'scope': 'layer_norm9'}) if dna: # Using largest hidden state for predicting untied conv kernels. enc7 = slim.layers.conv2d_transpose(enc6, DNA_KERN_SIZE**2, 1, stride=1, scope='convt4') else: # Using largest hidden state for predicting a new image layer. enc7 = slim.layers.conv2d_transpose(enc6, color_channels, 1, stride=1, scope='convt4') # This allows the network to also generate one image from scratch, # which is useful when regions of the image become unoccluded.
transformed = [tf.nn.sigmoid(enc7)] if stp: stp_input0 = tf.reshape(hidden5, [int(batch_size), -1]) stp_input1 = slim.layers.fully_connected(stp_input0, 100, scope='fc_stp') # disabling capability to generate pixels reuse_stp = None if reuse: reuse_stp = reuse transformed = stp_transformation(prev_image, stp_input1, num_masks, reuse_stp) # transformed += stp_transformation(prev_image, stp_input1, num_masks) if pix_distributions is not None: transf_distrib = stp_transformation(prev_pix_distrib, stp_input1, num_masks, reuse=True) elif cdna: cdna_input = tf.reshape(hidden5, [int(batch_size), -1]) new_transformed, new_cdna_filter = cdna_transformation( prev_image, cdna_input, num_masks, int(color_channels), reuse_sc=reuse) transformed += new_transformed summaries += make_cdna_kerns_summary(new_cdna_filter, t, 'image') if pix_distributions is not None: transf_distrib, new_cdna_distrib_filter = cdna_transformation( prev_pix_distrib, cdna_input, num_masks, 1, reuse_sc=True) summaries += make_cdna_kerns_summary( new_cdna_distrib_filter, t, 'distrib') elif dna: # Only one mask is supported (more should be unnecessary). if num_masks != 1: raise ValueError( 'Only one mask is supported for DNA model.') transformed = [ dna_transformation(prev_image, enc7, DNA_KERN_SIZE) ] masks = slim.layers.conv2d_transpose(enc6, num_masks + 1, 1, stride=1, scope='convt7') masks = tf.reshape( tf.nn.softmax(tf.reshape(masks, [-1, num_masks + 1])), [ int(batch_size), int(img_height), int(img_width), num_masks + 1 ]) mask_list = tf.split(axis=3, num_or_size_splits=num_masks + 1, value=masks) output = mask_list[0] * prev_image for layer, mask in zip(transformed, mask_list[1:]): output += layer * mask gen_images.append(output) gen_masks.append(mask_list) if dna and pix_distributions is not None: transf_distrib = [ dna_transformation(prev_pix_distrib, enc7, DNA_KERN_SIZE) ] if pix_distributions is not None: pix_distrib_output = mask_list[0] * prev_pix_distrib mult_list = [] for i in range(num_masks): mult_list.append(transf_distrib[i] * mask_list[i + 1]) pix_distrib_output += mult_list[i] gen_pix_distrib.append(pix_distrib_output) current_state = slim.layers.fully_connected( state_action, int(current_state.get_shape()[1]), scope='state_pred', activation_fn=None) gen_states.append(current_state) if pix_distributions is not None: return gen_images, gen_states, gen_masks, gen_pix_distrib else: return gen_images, gen_states, gen_masks, None