# 1) texture (adversarial) loss
loss_discrim = -tf.reduce_sum(
    discrim_target * tf.log(tf.clip_by_value(discrim_predictions, 1e-10, 1.0)))
# note: tf.reduce_sum is used here, not tf.reduce_mean
loss_texture = -loss_discrim

correct_predictions = tf.equal(tf.argmax(discrim_predictions, 1),
                               tf.argmax(discrim_target, 1))
discrim_accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))

# 2) content loss
CX_LAYER = 'conv4_2'
enhanced_vgg = vgg.net(vgg_dir, vgg.preprocess(enhanced * 255))
dslr_vgg = vgg.net(vgg_dir, vgg.preprocess(dslr_image * 255))

# SSIM loss
ssim_loss = 25 * (1 - utils.ssim(dslr_image, enhanced) / batch_size)

# CX (contextual) loss
cx_loss = 4 * CX_loss_helper(dslr_vgg[CX_LAYER], enhanced_vgg[CX_LAYER],
                             config_CX)

# content loss
loss_content = ssim_loss + cx_loss

# 3) color loss
enhanced_blur = utils.blur(enhanced)
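# A minimal numerical sketch (illustrative only, not part of the training
# graph) of why the discriminator loss clips its predictions before the log:
# without the clip, a confident-but-wrong softmax output of exactly 0 would
# give log(0) = -inf. All array values below are made up.
import numpy as np

target = np.array([[1.0, 0.0], [0.0, 1.0]])
preds = np.array([[0.0, 1.0], [0.3, 0.7]])   # first row is "all wrong"
clipped = np.clip(preds, 1e-10, 1.0)
loss_sum = -np.sum(target * np.log(clipped))  # summed, as in the graph above
loss_mean = -np.mean(np.sum(target * np.log(clipped), axis=1))
print(loss_sum, loss_mean)  # the summed variant grows with batch size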
def stylize(network, initial, initial_noiseblend, content, styles,
            preserve_colors, iterations, content_weight, content_weight_blend,
            style_weight, style_layer_weight_exp, style_blend_weights,
            tv_weight, learning_rate, beta1, beta2, epsilon, pooling,
            exp_sigma, mat_sigma, mat_rho, text_to_print,
            print_iterations=None, checkpoint_iterations=None, kernel=3, d=2,
            gamma_rho=1, gamma=1, rational_rho=1, alpha=1, v=2.5):
    """
    Stylize images.

    This function yields tuples (iteration, image, loss_dict); `iteration` is
    None if this is the final image (the last iteration). Other tuples are
    yielded every `checkpoint_iterations` iterations.

    Kernel selection:
        0 - dot-product kernel
        1 - exponential kernel
        2 - Matern kernel (order `v` in {0.5, 1.5, 2.5})
        3 - polynomial kernel
        4 - gamma-exponential kernel
        5 - rational quadratic kernel

    :rtype: iterator[tuple[int|None,image,dict]]
    """
    # `v` (the Matern order) was referenced but never defined in the original;
    # it is exposed here as a parameter with an assumed default of 2.5
    tf.logging.set_verbosity(tf.logging.INFO)

    shape = (1,) + content.shape
    style_shapes = [(1,) + style.shape for style in styles]
    content_features = {}
    style_features = [{} for _ in styles]

    vgg_weights, vgg_mean_pixel = vgg.load_net(network)

    layer_weight = 1.0
    style_layers_weights = {}
    for style_layer in STYLE_LAYERS:
        style_layers_weights[style_layer] = layer_weight
        layer_weight *= style_layer_weight_exp

    # normalize style layer weights
    layer_weights_sum = 0
    for style_layer in STYLE_LAYERS:
        layer_weights_sum += style_layers_weights[style_layer]
    for style_layer in STYLE_LAYERS:
        style_layers_weights[style_layer] /= layer_weights_sum

    # compute content features in feedforward mode
    g = tf.Graph()
    with g.as_default(), g.device('/cpu'), tf.Session() as sess:
        image = tf.placeholder('float', shape=shape)
        net = vgg.net_preloaded(vgg_weights, image, pooling)
        content_pre = np.array([vgg.preprocess(content, vgg_mean_pixel)])
        for layer in CONTENT_LAYERS:
            content_features[layer] = net[layer].eval(
                feed_dict={image: content_pre})

    # compute style features in feedforward mode
    for i in range(len(styles)):
        g = tf.Graph()
        with g.as_default(), g.device('/cpu'), tf.Session() as sess:
            image = tf.placeholder('float', shape=style_shapes[i])
            net = vgg.net_preloaded(vgg_weights, image, pooling)
            style_pre = np.array([vgg.preprocess(styles[i], vgg_mean_pixel)])
            for layer in STYLE_LAYERS:
                features = net[layer].eval(feed_dict={image: style_pre})
                features = np.reshape(features, (-1, features.shape[3]))
                if kernel == 0:
                    # dot-product (standard Gram) kernel
                    gram2 = np.matmul(features.T, features) / features.size
                elif kernel == 1:
                    # exponential kernel
                    gram2 = gramSquaredExp_np(features,
                                              exp_sigma) / features.size
                elif kernel == 2:
                    # Matern kernel
                    gram2 = gramMatten_np(features, mat_sigma, v,
                                          mat_rho) / features.size
                elif kernel == 3:
                    # polynomial kernel
                    gram2 = gramPoly_np(features, C=0, d=d) / features.size
                elif kernel == 4:
                    # gamma-exponential kernel
                    gram2 = gramGammaExp_np(features, gamma_rho,
                                            gamma) / features.size
                elif kernel == 5:
                    # rational quadratic kernel
                    gram2 = gramRatioanlQuad_np(features, rational_rho,
                                                alpha) / features.size
                style_features[i][layer] = gram2

    initial_content_noise_coeff = 1.0 - initial_noiseblend

    # make stylized image using backpropagation
    g = tf.Graph()
    with g.as_default(), g.device('/gpu'):
        if initial is None:
            noise = np.random.normal(size=shape, scale=np.std(content) * 0.1)
            initial = tf.random_normal(shape) * 0.256
        else:
            initial = np.array([vgg.preprocess(initial, vgg_mean_pixel)])
            initial = initial.astype('float32')
            noise = np.random.normal(size=shape, scale=np.std(content) * 0.1)
            initial = (initial * initial_content_noise_coeff +
                       (tf.random_normal(shape) * 0.256) *
                       (1.0 - initial_content_noise_coeff))
        image = tf.Variable(initial)
        net = vgg.net_preloaded(vgg_weights, image, pooling)

        # content loss
        content_layers_weights = {}
        content_layers_weights['relu4_2'] = content_weight_blend
        content_layers_weights['relu5_2'] = 1.0 - content_weight_blend

        content_loss = 0
        content_losses = []
        for content_layer in CONTENT_LAYERS:
            content_losses.append(
                content_layers_weights[content_layer] * content_weight *
                (2 * tf.nn.l2_loss(net[content_layer] -
                                   content_features[content_layer]) /
                 content_features[content_layer].size))
        content_loss += reduce(tf.add, content_losses)

        # style loss
        style_loss = 0
        for i in range(len(styles)):
            style_losses = []
            for style_layer in STYLE_LAYERS:
                layer = net[style_layer]
                _, height, width, number = map(lambda i: i.value,
                                               layer.get_shape())
                size = height * width * number
                feats = tf.reshape(layer, (-1, number))
                style_gram = style_features[i][style_layer]
                dim = feats.get_shape()
                # squared pairwise distances between feature columns
                sqr = tf.reduce_sum(tf.transpose(feats) * tf.transpose(feats),
                                    axis=1)
                d2 = tf.nn.relu(
                    tf.transpose(tf.ones([dim[1], dim[1]]) * sqr) +
                    tf.ones([dim[1], dim[1]]) * sqr -
                    2 * tf.matmul(tf.transpose(feats), feats))
                if kernel == 0:
                    # dot-product (standard Gram) kernel
                    gram = tf.matmul(tf.transpose(feats), feats) / size
                elif kernel == 1:
                    # exponential kernel
                    gram = tf.exp(
                        -1 * (tf.transpose(tf.ones([dim[1], dim[1]]) * sqr) +
                              tf.ones([dim[1], dim[1]]) * sqr -
                              2 * tf.matmul(tf.transpose(feats), feats)) /
                        2 / (exp_sigma * exp_sigma)) / size
                elif kernel == 2:
                    # Matern kernel; v must be one of 0.5, 1.5, 2.5
                    if v == 0.5:
                        gram = mat_sigma**2 * tf.exp(
                            -1 * tf.sqrt(d2) / mat_rho) / size
                    elif v == 1.5:
                        gram = mat_sigma**2 * (
                            tf.ones([dim[1], dim[1]]) +
                            tf.sqrt(3.0) * tf.sqrt(d2) / mat_rho) * tf.exp(
                                -1 * tf.sqrt(3.0) * tf.sqrt(d2) /
                                mat_rho) / size
                    elif v == 2.5:
                        gram = mat_sigma**2 * (
                            tf.ones([dim[1], dim[1]]) +
                            tf.sqrt(5.0) * tf.sqrt(d2) / mat_rho +
                            5 * d2 / 3 / (mat_rho**2)) * tf.exp(
                                -1 * tf.sqrt(5.0) * tf.sqrt(d2) /
                                mat_rho) / size
                elif kernel == 3:
                    # polynomial kernel
                    gram = (tf.matmul(tf.transpose(feats), feats))**d / size
                elif kernel == 4:
                    # gamma-exponential kernel
                    gram = tf.exp(-1 * (tf.sqrt(d2) / gamma_rho)**gamma) / size
                elif kernel == 5:
                    # rational quadratic kernel
                    gram = (1 + (d2 / rational_rho**2 / 2 / alpha))**(
                        -1 * alpha) / size
                style_losses.append(style_layers_weights[style_layer] * 2 *
                                    tf.nn.l2_loss(gram - style_gram) /
                                    style_gram.size)
            style_loss += style_weight * style_blend_weights[i] * reduce(
                tf.add, style_losses)

        # total variation denoising
        tv_y_size = _tensor_size(image[:, 1:, :, :])
        tv_x_size = _tensor_size(image[:, :, 1:, :])
        tv_loss = tv_weight * 2 * (
            (tf.nn.l2_loss(image[:, 1:, :, :] -
                           image[:, :shape[1] - 1, :, :]) / tv_y_size) +
            (tf.nn.l2_loss(image[:, :, 1:, :] -
                           image[:, :, :shape[2] - 1, :]) / tv_x_size))

        # overall loss
        loss = content_loss + style_loss + tv_loss

        # optimizer setup
        # train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
        train_step = tf.train.AdamOptimizer(learning_rate, beta1, beta2,
                                            epsilon).minimize(loss)

        def print_progress(last_loss):
            new_loss = loss.eval()
            stderr.write('file ===> %s \n' % text_to_print)
            stderr.write('  content loss: %1.3e \t' % content_loss.eval())
            stderr.write('    style loss: %1.3e \t' % style_loss.eval())
            stderr.write('       tv loss: %1.3e \t' % tv_loss.eval())
            stderr.write('    total loss: %1.3e \t' % new_loss)
            stderr.write('loss difference: %1.3e \t\n' % (last_loss - new_loss))
            return new_loss

        def save_progress():
            return {
                "content loss": content_loss.eval(),
                "style loss": style_loss.eval(),
                "tv loss": tv_loss.eval(),
                "total loss": loss.eval(),
            }

        # optimization
        best_loss = float('inf')
        best = None
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            stderr.write('Optimization started...\n')
            new_loss = 0
            # if (print_iterations and print_iterations != 0):
            #     print_progress()
            for i in range(iterations):
                train_step.run()
                last_step = (i == iterations - 1)
                if last_step or (print_iterations and
                                 i % print_iterations == 0):
                    stderr.write('Iteration %4d/%4d\n' % (i + 1, iterations))
                    new_loss = print_progress(new_loss)

                if (checkpoint_iterations and
                        i % checkpoint_iterations == 0) or last_step:
                    loss_dict = save_progress()
                    this_loss = loss.eval()
                    print(this_loss, "loss at this checkpoint")
                    if this_loss < best_loss:
                        best_loss = this_loss
                        best = image.eval()
                    try:
                        img_out = vgg.unprocess(best.reshape(shape[1:]),
                                                vgg_mean_pixel)
                    except Exception:
                        print("unable to build result image for the given "
                              "parameters")
                        img_out = "no image"
                    if preserve_colors:
                        original_image = np.clip(content, 0, 255)
                        styled_image = np.clip(img_out, 0, 255)

                        # Luminosity transfer steps:
                        # 1. Convert stylized RGB -> grayscale according to
                        #    Rec.601 luma (0.299, 0.587, 0.114)
                        # 2. Convert stylized grayscale into YUV (YCbCr)
                        # 3. Convert original image into YUV (YCbCr)
                        # 4. Recombine (stylizedYUV.Y, originalYUV.U,
                        #    originalYUV.V)
                        # 5. Convert recombined image from YUV back to RGB

                        # 1
                        styled_grayscale = rgb2gray(styled_image)
                        styled_grayscale_rgb = gray2rgb(styled_grayscale)
                        # 2
                        styled_grayscale_yuv = np.array(
                            Image.fromarray(styled_grayscale_rgb.astype(
                                np.uint8)).convert('YCbCr'))
                        # 3
                        original_yuv = np.array(
                            Image.fromarray(original_image.astype(
                                np.uint8)).convert('YCbCr'))
                        # 4
                        w, h, _ = original_image.shape
                        combined_yuv = np.empty((w, h, 3), dtype=np.uint8)
                        combined_yuv[..., 0] = styled_grayscale_yuv[..., 0]
                        combined_yuv[..., 1] = original_yuv[..., 1]
                        combined_yuv[..., 2] = original_yuv[..., 2]
                        # 5
                        img_out = np.array(
                            Image.fromarray(combined_yuv,
                                            'YCbCr').convert('RGB'))

                    yield ((None if last_step else i), img_out, loss_dict)
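# A hedged usage sketch for the kernelized stylize() above. The file names and
# every hyper-parameter value here are placeholders, content_img/style_img are
# assumed to be HxWx3 numpy arrays, and imsave is assumed to come from the
# surrounding module; kernel=0 selects the plain dot-product Gram matrix.
for iteration, img, losses in stylize(
        network='imagenet-vgg-verydeep-19.mat', initial=None,
        initial_noiseblend=1.0, content=content_img, styles=[style_img],
        preserve_colors=False, iterations=1000, content_weight=5.0,
        content_weight_blend=1.0, style_weight=500.0,
        style_layer_weight_exp=1.0, style_blend_weights=[1.0], tv_weight=100.0,
        learning_rate=10.0, beta1=0.9, beta2=0.999, epsilon=1e-8,
        pooling='max', exp_sigma=1.0, mat_sigma=1.0, mat_rho=1.0,
        text_to_print='demo', checkpoint_iterations=100, kernel=0):
    if iteration is None:  # final image
        imsave('output.jpg', img)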
def train(self):
    with tf.Session() as sess:
        out_im = self.U_net(self.holder[41] / 127.5 - 1)

        # VGG features of the ground-truth image
        gt_resize = tf.image.resize_images(self.holder[42] / 127.5 - 1,
                                           [256, 256])
        image_pre = vgg.preprocess(gt_resize)
        fai_imgt = {}
        net = vgg.net(self.vgg_path, image_pre)
        for layer in self.vgg_layer:
            fai_imgt[layer] = net[layer]

        # VGG features of the network output
        image_pre = vgg.preprocess(tf.image.resize_images(out_im, [256, 256]))
        fai_imout = {}
        net = vgg.net(self.vgg_path, image_pre)
        for layer in self.vgg_layer:
            fai_imout[layer] = net[layer]

        # composite image: ground truth in the valid region,
        # network output in the hole
        Im_compt = self.holder[16] * self.holder[42] + (
            tf.add(tf.multiply(self.holder[16], -1), 1)) * (
                (out_im + 1) * 127.5)
        im_compt = tf.image.resize_images(Im_compt / 127.5 - 1, [256, 256])
        image_pre = vgg.preprocess(im_compt)
        fai_compt = {}
        net = vgg.net(self.vgg_path, image_pre)
        for layer in self.vgg_layer:
            fai_compt[layer] = net[layer]

        U_vars = [
            var for var in tf.trainable_variables() if 'UNET' in var.name
        ]

        total_loss = get_total_loss(out_im, self.holder[-1] / 127.5 - 1,
                                    self.holder[16], fai_imout, fai_imgt,
                                    fai_compt, self.vgg_layer, im_compt)
        optim = tf.train.AdamOptimizer()
        optimizer = optim.minimize(total_loss[0], var_list=U_vars)

        init_group = tf.group(tf.global_variables_initializer(),
                              tf.local_variables_initializer())
        sess.run(init_group)
        graph = tf.summary.FileWriter(self.logdir, sess.graph)
        saver = tf.train.Saver(U_vars, max_to_keep=20)

        for epoch in range(self.num_epochs):
            for imid in range(int(self.total_ims // self.batch)):
                mask_ims, gt_ims = get_im(self.ims_dir, imid)
                self.get_all_mask(mask_ims, gt_ims)
                feed_dic = get_feedict(self.all_masks, self.holder)
                _, loss_total = sess.run([optimizer, total_loss],
                                         feed_dict=feed_dic)
                if (int(epoch * self.total_ims) + imid) % 1 == 0:  # every step
                    print('epoch: %d, cur_num: %d, total_loss: %f, '
                          'l_hole: %f, l_valid: %f, percept_loss: %f, '
                          'style_loss_out: %f, style_loss_comp: %f, '
                          'tv_loss: %f' %
                          (epoch, imid, loss_total[0], loss_total[1],
                           loss_total[2], loss_total[3], loss_total[4],
                           loss_total[5], loss_total[6]))
            if epoch % 5 == 0:
                saver.save(sess, self.save_path + 'model.ckpt',
                           global_step=epoch)
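# A toy NumPy illustration (shapes and values invented) of the compositing
# rule used above: mask * ground_truth + (1 - mask) * output keeps the known
# pixels and fills only the hole region with the U-Net prediction.
import numpy as np

mask = np.zeros((4, 4, 1), dtype=np.float32)  # 1 = valid pixel, 0 = hole
mask[:2] = 1.0
gt = np.full((4, 4, 3), 200.0, dtype=np.float32)
out = np.full((4, 4, 3), 50.0, dtype=np.float32)
composite = mask * gt + (1.0 - mask) * out
assert (composite[:2] == 200.0).all() and (composite[2:] == 50.0).all()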
def stylize(network, initial, content, styles, iterations, content_weight,
            style_weight, style_blend_weights, tv_weight, learning_rate,
            print_iterations=None, checkpoint_iterations=None):
    """
    Stylize images.

    This function yields tuples (iteration, image); `iteration` is None
    if this is the final image (the last iteration). Other tuples are yielded
    every `checkpoint_iterations` iterations.

    :rtype: iterator[tuple[int|None,image]]
    """
    shape = (1,) + content.shape
    style_shapes = [(1,) + style.shape for style in styles]
    content_features = {}
    style_features = [{} for _ in styles]

    # compute content features in feedforward mode
    g = tf.Graph()
    with g.as_default(), g.device('/gpu:0'), tf.Session() as sess:
        image = tf.placeholder('float', shape=shape)
        net, mean_pixel = vgg.net(network, image)
        content_pre = np.array([vgg.preprocess(content, mean_pixel)])
        content_features[CONTENT_LAYER] = net[CONTENT_LAYER].eval(
            feed_dict={image: content_pre})

    # compute style features in feedforward mode
    for i in range(len(styles)):
        g = tf.Graph()
        with g.as_default(), g.device('/gpu:0'), tf.Session() as sess:
            image = tf.placeholder('float', shape=style_shapes[i])
            net, _ = vgg.net(network, image)
            style_pre = np.array([vgg.preprocess(styles[i], mean_pixel)])
            for layer in STYLE_LAYERS:
                features = net[layer].eval(feed_dict={image: style_pre})
                features = np.reshape(features, (-1, features.shape[3]))
                gram = np.matmul(features.T, features) / features.size
                style_features[i][layer] = gram

    # make stylized image using backpropagation
    with tf.Graph().as_default():
        if initial is None:
            noise = np.random.normal(size=shape, scale=np.std(content) * 0.1)
            initial = tf.random_normal(shape) * 0.256
        else:
            initial = np.array([vgg.preprocess(initial, mean_pixel)])
            initial = initial.astype('float32')
        image = tf.Variable(initial)
        net, _ = vgg.net(network, image)

        # content loss
        content_loss = content_weight * (
            2 * tf.nn.l2_loss(net[CONTENT_LAYER] -
                              content_features[CONTENT_LAYER]) /
            content_features[CONTENT_LAYER].size)

        # style loss
        style_loss = 0
        for i in range(len(styles)):
            style_losses = []
            for style_layer in STYLE_LAYERS:
                layer = net[style_layer]
                _, height, width, number = map(lambda i: i.value,
                                               layer.get_shape())
                size = height * width * number
                feats = tf.reshape(layer, (-1, number))
                gram = tf.matmul(tf.transpose(feats), feats) / size
                style_gram = style_features[i][style_layer]
                style_losses.append(2 * tf.nn.l2_loss(gram - style_gram) /
                                    style_gram.size)
            style_loss += style_weight * style_blend_weights[i] * reduce(
                tf.add, style_losses)

        # total variation denoising
        tv_y_size = _tensor_size(image[:, 1:, :, :])
        tv_x_size = _tensor_size(image[:, :, 1:, :])
        tv_loss = tv_weight * 2 * (
            (tf.nn.l2_loss(image[:, 1:, :, :] -
                           image[:, :shape[1] - 1, :, :]) / tv_y_size) +
            (tf.nn.l2_loss(image[:, :, 1:, :] -
                           image[:, :, :shape[2] - 1, :]) / tv_x_size))

        # overall loss
        loss = content_loss + style_loss + tv_loss

        # optimizer setup
        train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)

        def print_progress(i, last=False):
            stderr.write('Iteration %d/%d\n' % (i + 1, iterations))
            if last or (print_iterations and i % print_iterations == 0):
                stderr.write('  content loss: %g\n' % content_loss.eval())
                stderr.write('    style loss: %g\n' % style_loss.eval())
                stderr.write('       tv loss: %g\n' % tv_loss.eval())
                stderr.write('    total loss: %g\n' % loss.eval())

        # optimization
        best_loss = float('inf')
        best = None
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for i in range(iterations):
                last_step = (i == iterations - 1)
                print_progress(i, last=last_step)
                train_step.run()
                if (checkpoint_iterations and
                        i % checkpoint_iterations == 0) or last_step:
                    this_loss = loss.eval()
                    if this_loss < best_loss:
                        best_loss = this_loss
                        best = image.eval()
                    yield (
                        (None if last_step else i),
                        vgg.unprocess(best.reshape(shape[1:]), mean_pixel)
                    )
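# A quick sanity check (illustrative, with an invented activation tensor) of
# the Gram-matrix computation used throughout this file: reshaping a
# (1, H, W, C) activation to (H*W, C) and forming F^T F yields a (C, C) matrix
# of channel co-activations, normalized by the number of tensor elements.
import numpy as np

features = np.random.rand(1, 4, 4, 3)             # stand-in VGG activation
flat = features.reshape(-1, features.shape[3])    # (16, 3)
gram = np.matmul(flat.T, flat) / features.size    # (3, 3)
assert gram.shape == (3, 3)
assert np.allclose(gram, gram.T)                  # Gram matrices are symmetric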
def optimize(content_targets, style_targets, content_weight, style_weight,
             tv_weight, vgg_path, epochs=2, print_iterations=1, batch_size=4,
             save_path='saver/fns.ckpt', slow=False, learning_rate=1e-3,
             debug=False, save_checkpoint=False, restore_checkpoint_path=None):
    if slow:
        batch_size = 1
    mod = len(content_targets) % batch_size
    if mod > 0:
        print("Train set has been trimmed slightly..")
        content_targets = content_targets[:-mod]

    style_features = []
    batch_shape = (batch_size, 256, 256, 5)
    style_shape = (1, 256, 256, 3)

    # precompute style features
    with tf.Graph().as_default(), tf.device('/cpu:0'), tf.Session() as sess:
        style_image = tf.placeholder(tf.float32, shape=style_shape,
                                     name='style_image')
        style_image_pre = vgg.preprocess(style_image)
        net = vgg.net(vgg_path, style_image_pre)
        for i in range(len(style_targets)):
            index = 0
            style_pre = np.array([style_targets[i]])
            # all layer Grams for one style, flattened into a single vector
            current_style_feature = np.array([])
            for layer in STYLE_LAYERS:
                features = net[layer].eval(feed_dict={style_image: style_pre})
                features = np.reshape(features, (-1, features.shape[3]))
                # mean-centered Gram matrix
                # gram = np.matmul(features.T, features) / features.size
                gram = np.matmul(features.T - np.mean(features.T),
                                 features - np.mean(features)) / features.size
                # record shape/size/offset of each layer's Gram on first pass
                if not STYLE_LAYERS_SHAPE[STYLE_LAYERS.index(layer)]:
                    STYLE_LAYERS_SHAPE[STYLE_LAYERS.index(layer)] = gram.shape
                if not STYLE_LAYERS_SIZE[STYLE_LAYERS.index(layer)]:
                    STYLE_LAYERS_SIZE[STYLE_LAYERS.index(layer)] = gram.size
                if STYLE_LAYERS_INDEX[STYLE_LAYERS.index(layer)] == -1:
                    STYLE_LAYERS_INDEX[STYLE_LAYERS.index(layer)] = index
                index = index + gram.size
                current_style_feature = np.append(current_style_feature,
                                                  gram.reshape(-1))
            style_features.append(current_style_feature)
        style_features = np.array(style_features, dtype=np.float32)

    with tf.Graph().as_default(), tf.Session() as sess:
        lambda_style = tf.placeholder(tf.float32, name="lambda_style")
        style_id = tf.placeholder(tf.int32, name="style_id")
        X_content = tf.placeholder(tf.float32, shape=batch_shape,
                                   name="X_content")
        X_pre = vgg.preprocess(X_content[:, :, :, 0:3])

        # precompute content features
        content_features = {}
        content_net = vgg.net(vgg_path, X_pre)
        content_features[CONTENT_LAYER] = content_net[CONTENT_LAYER]

        if slow:
            preds = tf.Variable(
                tf.random_normal(X_content.get_shape()) * 0.256)
            preds_pre = preds
        else:
            preds = transform.net(X_content / 255.0)
            preds_pre = vgg.preprocess(preds)

        net = vgg.net(vgg_path, preds_pre)

        content_size = _tensor_size(
            content_features[CONTENT_LAYER]) * batch_size
        assert _tensor_size(content_features[CONTENT_LAYER]) == _tensor_size(
            net[CONTENT_LAYER])
        # content_loss = (1 - lambda_style) * (2 * tf.nn.l2_loss(
        #     net[CONTENT_LAYER] - content_features[CONTENT_LAYER]) / content_size)
        content_loss = content_weight * (
            2 * tf.nn.l2_loss(net[CONTENT_LAYER] -
                              content_features[CONTENT_LAYER]) / content_size
        )  # original

        style_losses = []
        for style_layer in STYLE_LAYERS:
            layer = net[style_layer]
            bs, height, width, filters = map(lambda i: i.value,
                                             layer.get_shape())
            size = height * width * filters
            feats = tf.reshape(layer, (bs, height * width, filters))
            feats_T = tf.transpose(feats, perm=[0, 2, 1])
            # mean-centered Gram, matching the precomputed style Grams
            # grams = tf.matmul(feats_T, feats) / size
            grams = tf.matmul(feats_T - tf.reduce_mean(feats_T),
                              feats - tf.reduce_mean(feats)) / size
            # (an earlier attempt looked the per-style Grams up through
            # tf.contrib.lookup.HashTable; it was replaced by slicing a
            # flattened constant with tf.gather_nd below)
            style_index = STYLE_LAYERS.index(style_layer)
            style_grams = tf.gather_nd(tf.constant(style_features),
                                       [style_id])
            style_gram = style_grams[
                STYLE_LAYERS_INDEX[style_index]:
                STYLE_LAYERS_INDEX[style_index] +
                STYLE_LAYERS_SIZE[style_index]]
            style_gram = tf.reshape(style_gram,
                                    STYLE_LAYERS_SHAPE[style_index])
            # style_losses.append(2 * tf.nn.l2_loss(grams - style_gram) / style_gram.size)
            style_losses.append(2 * tf.nn.l2_loss(grams - style_gram) /
                                STYLE_LAYERS_SIZE[style_index])
        style_loss = lambda_style * functools.reduce(
            tf.add, style_losses) / batch_size
        # style_loss = style_weight * functools.reduce(tf.add, style_losses) / batch_size  # original

        # total variation denoising
        tv_y_size = _tensor_size(preds[:, 1:, :, :])
        tv_x_size = _tensor_size(preds[:, :, 1:, :])
        y_tv = tf.nn.l2_loss(preds[:, 1:, :, :] -
                             preds[:, :batch_shape[1] - 1, :, :])
        x_tv = tf.nn.l2_loss(preds[:, :, 1:, :] -
                             preds[:, :, :batch_shape[2] - 1, :])
        tv_loss = tv_weight * 2 * (x_tv / tv_x_size +
                                   y_tv / tv_y_size) / batch_size

        loss = content_loss + style_loss + tv_loss  # overall loss

        train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)
        sess.run(tf.global_variables_initializer())

        import random
        uid = random.randint(1, 100)
        print("UID: %s" % uid)

        if restore_checkpoint_path:
            print('Restoring checkpoint from : ' + restore_checkpoint_path)
            saver = tf.train.Saver()
            saver.restore(sess, restore_checkpoint_path)

        for epoch in range(epochs):
            if save_checkpoint and epoch > 0:
                saver = tf.train.Saver()
                head, tail = os.path.split(save_path)
                cp_dir = os.path.join(head, str(epoch))
                if not os.path.exists(cp_dir):
                    os.makedirs(cp_dir)
                cp_path = os.path.join(cp_dir, tail)
                print('save checkpoint to : ' + cp_path)
                res = saver.save(sess, cp_path)
            print('epoch: {}'.format(epoch))
            num_examples = len(content_targets)
            iterations = 0
            while iterations * batch_size < num_examples:
                start_time = time.time()
                curr = iterations * batch_size
                curr_lambda_style = np.random.randint(1, 100) * 1.0
                curr_lambda_style_img = np.ones(
                    (256, 256, 1)) * curr_lambda_style
                curr_style_id = np.random.randint(
                    len(style_targets)) if epoch > 0 else 0
                curr_style_channel = np.ones((256, 256, 1)) * curr_style_id
                step = curr + batch_size
                X_batch = np.zeros(batch_shape, dtype=np.float32)
                for j, img_p in enumerate(content_targets[curr:step]):
                    try:
                        curr_img = get_img(img_p,
                                           (256, 256, 3)).astype(np.float32)
                    except Exception:
                        continue
                    X_batch[j, :, :, 0:3] = curr_img
                    X_batch[j, :, :, 3:4] = curr_lambda_style_img  # style-weight channel
                    X_batch[j, :, :, 4:] = curr_style_channel      # style-id channel
                iterations += 1
                assert X_batch.shape[0] == batch_size
                feed_dict = {
                    X_content: X_batch,
                    lambda_style: curr_lambda_style,
                    style_id: curr_style_id
                }
                train_step.run(feed_dict=feed_dict)
                end_time = time.time()
                delta_time = end_time - start_time
                if debug:
                    print("UID: %s, batch time: %s" % (uid, delta_time))

                is_print_iter = int(iterations) % print_iterations == 0
                if slow:
                    is_print_iter = epoch % print_iterations == 0
                is_last = (epoch == epochs - 1 and
                           iterations * batch_size >= num_examples)
                should_print = is_print_iter or is_last
                if should_print:
                    to_get = [style_loss, content_loss, tv_loss, loss, preds]
                    test_feed_dict = {
                        X_content: X_batch,
                        lambda_style: 80.0,
                        style_id: 0
                    }
                    tup = sess.run(to_get, feed_dict=test_feed_dict)
                    _style_loss, _content_loss, _tv_loss, _loss, _preds = tup
                    losses = (_style_loss, _content_loss, _tv_loss, _loss)
                    if slow:
                        _preds = vgg.unprocess(_preds)
                    else:
                        saver = tf.train.Saver()
                        res = saver.save(sess, save_path)
                    yield (_preds, losses, iterations, epoch)
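# A small sketch of the 5-channel batch layout built in the training loop
# above (values invented): channels 0-2 hold the RGB content image, channel 3
# broadcasts the scalar style weight (lambda_style), and channel 4 broadcasts
# the style id, so the transform net can condition on both.
import numpy as np

batch = np.zeros((1, 256, 256, 5), dtype=np.float32)
rgb = np.random.rand(256, 256, 3).astype(np.float32)  # stand-in content image
batch[0, :, :, 0:3] = rgb
batch[0, :, :, 3] = 80.0   # lambda_style channel
batch[0, :, :, 4] = 2.0    # style_id channel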
def main():
    parser = build_parser()
    options, unknown = parser.parse_known_args()
    env = os.environ.copy()
    print("options: ", options)

    vgg_path = options.dataset + '/vgg/imagenet-vgg-verydeep-19.mat'
    model_name = options.style_image.replace('.jpg', '.ckpt')
    style_image = options.dataset + '/style_images/' + options.style_image
    training_path = options.dataset + '/train'
    model_dir = env.get("OUTPUT_DIR", options.ckpt)
    tensorboard_dir = env.get("LOG_DIR", options.dataset)

    print("style_image: ", style_image)
    print("vgg: ", vgg_path)
    print("training path: ", training_path)
    print("model name: ", model_name)

    if options.gpu is None:
        available_gpus = get_available_gpus()
        device = '/gpu:0' if len(available_gpus) > 0 else '/cpu:0'
    else:
        device = '/gpu:{}'.format(options.gpu) if options.gpu > -1 else '/cpu:0'

    batchsize = options.batchsize

    # content targets
    content_targets = [
        os.path.join(training_path, fn) for fn in list_files(training_path)
    ]
    if len(content_targets) % batchsize != 0:
        content_targets = content_targets[:-(len(content_targets) % batchsize)]
    print('total training data size: ', len(content_targets))
    batch_shape = (batchsize, 224, 224, 3)

    # style target
    style_target = read_img(style_image)
    style_shape = (1,) + style_target.shape

    with tf.device(device), tf.Session() as sess:
        # style target feature:
        # compute the Gram matrix of the style target
        style_image = tf.placeholder(tf.float32, shape=style_shape,
                                     name='style_image')
        vggstyletarget = vgg.net(vgg_path, vgg.preprocess(style_image))
        style_vgg = vgg.get_style_vgg(vggstyletarget, style_image,
                                      np.array([style_target]))

        # content target feature
        content_vgg = {}
        inputs = tf.placeholder(tf.float32, shape=batch_shape, name="inputs")
        content_net = vgg.net(vgg_path, vgg.preprocess(inputs))
        content_vgg['relu4_2'] = content_net['relu4_2']

        # features after transformation
        outputs = stylenet.net(inputs / 255.0)
        vggoutputs = vgg.net(vgg_path, vgg.preprocess(outputs))

        # compute feature (content) loss
        loss_f = options.lambda_feat * vgg.total_content_loss(
            vggoutputs, content_vgg, batchsize)
        # compute style loss
        loss_s = options.lambda_style * vgg.total_style_loss(
            vggoutputs, style_vgg, batchsize)
        # total variation denoising
        loss_tv = options.lambda_tv * vgg.total_variation_regularization(
            outputs, batchsize, batch_shape)
        # total loss
        loss = loss_f + loss_s + loss_tv

    with tf.Session() as sess:
        if not os.path.exists(options.ckpt):
            os.makedirs(options.ckpt)
        save_path = model_dir + '/' + model_name

        # training
        train_step = tf.train.AdamOptimizer(options.lr).minimize(loss)
        sess.run(tf.global_variables_initializer())
        total_step = 0
        for epoch in range(options.epoch):
            print('epoch: ', epoch)
            step = 0
            while step * batchsize < len(content_targets):
                time_start = time.time()
                batch = np.zeros(batch_shape, dtype=np.float32)
                for i, img in enumerate(
                        content_targets[step * batchsize:
                                        (step + 1) * batchsize]):
                    batch[i] = read_img(img).astype(np.float32)  # (224, 224, 3)
                step += 1
                total_step += 1
                loss_, _ = sess.run([loss, train_step],
                                    feed_dict={inputs: batch})
                time_elapse = time.time() - time_start
                should_save = total_step % 2000 == 0
                print('[step {}] elapse time: {} loss: {}'.format(
                    total_step, time_elapse, loss_))
                if should_save:
                    print('Saving checkpoint')
                    saver = tf.train.Saver()
                    res = saver.save(sess, save_path)
        print('Saving final result to ' + save_path)
        saver = tf.train.Saver()
        res = saver.save(sess, save_path)
def stylyze(options, callback):
    parser = build_parser()
    if options is None:
        key = 'TF_CPP_MIN_LOG_LEVEL'
        if key not in os.environ:
            os.environ[key] = '2'
        options = parser.parse_args()

    if not os.path.isfile(options.network):
        parser.error("Network %s does not exist. (Did you forget to "
                     "download it?)" % options.network)

    if [options.checkpoint_iterations,
            options.checkpoint_output].count(None) == 1:
        parser.error("use either both of checkpoint_output and "
                     "checkpoint_iterations or neither")

    if options.checkpoint_output is not None:
        if re.match(r'^.*(\{.*\}|%.*).*$', options.checkpoint_output) is None:
            parser.error("To save intermediate images, the checkpoint_output "
                         "parameter must contain placeholders (e.g. "
                         "`foo_{}.jpg` or `foo_%d.jpg`)")

    content_image_arr = [imread(i) for i in options.content]
    style_images = [imread(style) for style in options.styles]

    width_arr = options.width
    for i in range(len(content_image_arr)):
        width = width_arr[i]
        content_image = content_image_arr[i]
        if width is not None:
            new_shape = (int(
                math.floor(
                    float(content_image.shape[0]) / content_image.shape[1] *
                    width)), width)
            content_image = scipy.misc.imresize(content_image, new_shape)
            content_image_arr[i] = content_image
        target_shape = content_image.shape
        for j in range(len(style_images)):
            style_scale = STYLE_SCALE
            if options.style_scales is not None:
                style_scale = options.style_scales[j]
            style_images[j] = scipy.misc.imresize(
                style_images[j],
                style_scale * target_shape[1] / style_images[j].shape[1])

    style_blend_weights = options.style_blend_weights
    if style_blend_weights is None:
        # default is equal weights
        style_blend_weights = [1.0 / len(style_images) for _ in style_images]
    else:
        total_blend_weight = sum(style_blend_weights)
        style_blend_weights = [
            weight / total_blend_weight for weight in style_blend_weights
        ]

    initial_arr = content_image_arr

    # try saving a dummy image to the output path to make sure it's writable
    output_arr = options.output
    for output in output_arr:
        if os.path.isfile(output) and not options.overwrite:
            raise IOError("%s already exists, will not replace it without "
                          "the '--overwrite' flag" % output)
        try:
            imsave(output, np.zeros((500, 500, 3)))
        except Exception:
            raise IOError('%s is not writable or does not have a valid file '
                          'extension for an image file' % output)

    vgg_weights, vgg_mean_pixel = vgg.load_net(options.network)
    style_shapes = [(1,) + style.shape for style in style_images]
    style_features = [{} for _ in style_images]

    layer_weight = 1.0
    style_layers_weights = {}
    for style_layer in STYLE_LAYERS:
        style_layers_weights[style_layer] = layer_weight
        layer_weight *= options.style_layer_weight_exp

    # normalize style layer weights
    layer_weights_sum = 0
    for style_layer in STYLE_LAYERS:
        layer_weights_sum += style_layers_weights[style_layer]
    for style_layer in STYLE_LAYERS:
        style_layers_weights[style_layer] /= layer_weights_sum

    # compute style features in feedforward mode
    for i in range(len(style_images)):
        g = tf.Graph()
        with g.as_default(), g.device('/cpu:0'), tf.Session() as sess:
            image = tf.placeholder('float', shape=style_shapes[i])
            net = vgg.net_preloaded(vgg_weights, image, options.pooling)
            style_pre = np.array(
                [vgg.preprocess(style_images[i], vgg_mean_pixel)])
            for layer in STYLE_LAYERS:
                features = net[layer].eval(feed_dict={image: style_pre})
                features = np.reshape(features, (-1, features.shape[3]))
                gram = np.matmul(features.T, features) / features.size
                style_features[i][layer] = gram

    initial_content_noise_coeff = 1.0 - options.initial_noiseblend

    for i in range(len(content_image_arr)):
        Data.save_step(Data.get_step() + 1)
        loss_arrs = None
        for iteration, image, loss_vals in stylize(
                initial=initial_arr[i],
                content=content_image_arr[i],
                preserve_colors=options.preserve_colors,
                iterations=options.iterations,
                content_weight=options.content_weight,
                content_weight_blend=options.content_weight_blend,
                tv_weight=options.tv_weight,
                learning_rate=options.learning_rate,
                beta1=options.beta1,
                beta2=options.beta2,
                epsilon=options.epsilon,
                pooling=options.pooling,
                initial_content_noise_coeff=initial_content_noise_coeff,
                style_images=style_images,
                style_layers_weights=style_layers_weights,
                style_weight=options.style_weight,
                style_blend_weights=style_blend_weights,
                vgg_weights=vgg_weights,
                vgg_mean_pixel=vgg_mean_pixel,
                style_features=style_features,
                print_iterations=options.print_iterations,
                checkpoint_iterations=options.checkpoint_iterations,
                callback=callback):
            if (image is not None) and (options.checkpoint_output is not None):
                imsave(fmt_imsave(options.checkpoint_output, iteration), image)
            if (loss_vals is not None) \
                    and (options.progress_plot or options.progress_write):
                if loss_arrs is None:
                    itr = []
                    loss_arrs = OrderedDict(
                        (key, []) for key in loss_vals.keys())
                for key, val in loss_vals.items():
                    loss_arrs[key].append(val)
                itr.append(iteration)

        imsave(options.output[i], image)

        if options.progress_write:
            fn = "{}/progress.txt".format(os.path.dirname(options.output[i]))
            tmp = np.empty((len(itr), len(loss_arrs) + 1), dtype=float)
            tmp[:, 0] = np.array(itr)
            for ii, val in enumerate(loss_arrs.values()):
                tmp[:, ii + 1] = np.array(val)
            np.savetxt(fn, tmp,
                       header=' '.join(['itr'] + list(loss_arrs.keys())))

        if options.progress_plot:
            import matplotlib
            matplotlib.use('Agg')
            from matplotlib import pyplot as plt
            fig, ax = plt.subplots()
            for key, val in loss_arrs.items():
                ax.semilogy(itr, val, label=key)
            ax.legend()
            ax.set_xlabel("iterations")
            ax.set_ylabel("loss")
            fig.savefig("{}/progress.png".format(
                os.path.dirname(options.output[i])))
def stylize(network, initial, content, styles, iterations, content_weight,
            style_weight, style_blend_weights, tv_weight, learning_rate,
            print_iterations=None, checkpoint_iterations=None,
            print_image_iterations=False):
    shape = (1,) + content.shape
    style_shapes = [(1,) + style.shape for style in styles]
    content_features = {}
    style_features = [{} for _ in styles]

    # compute content features in feedforward mode
    g = tf.Graph()
    with g.as_default(), g.device('/cpu:0'), tf.Session() as sess:
        image = tf.placeholder('float', shape=shape)
        net, mean_pixel = vgg.net(network, image)
        content_pre = np.array([vgg.preprocess(content, mean_pixel)])
        content_features[CONTENT_LAYER] = net[CONTENT_LAYER].eval(
            feed_dict={image: content_pre})

    # compute style features in feedforward mode
    for i in range(len(styles)):
        g = tf.Graph()
        with g.as_default(), g.device('/cpu:0'), tf.Session() as sess:
            image = tf.placeholder('float', shape=style_shapes[i])
            net, _ = vgg.net(network, image)
            style_pre = np.array([vgg.preprocess(styles[i], mean_pixel)])
            for layer in STYLE_LAYERS:
                features = net[layer].eval(feed_dict={image: style_pre})
                features = np.reshape(features, (-1, features.shape[3]))
                gram = np.matmul(features.T, features) / features.size
                style_features[i][layer] = gram

    # make stylized image using backpropagation
    with tf.Graph().as_default():
        if initial is None:
            noise = np.random.normal(size=shape, scale=np.std(content) * 0.1)
            initial = tf.random_normal(shape) * 0.256
        else:
            initial = np.array([vgg.preprocess(initial, mean_pixel)])
            initial = initial.astype('float32')
        image = tf.Variable(initial)
        net, _ = vgg.net(network, image)

        # content loss
        content_loss = content_weight * (
            2 * tf.nn.l2_loss(net[CONTENT_LAYER] -
                              content_features[CONTENT_LAYER]) /
            content_features[CONTENT_LAYER].size)

        # style loss
        style_loss = 0
        for i in range(len(styles)):
            style_losses = []
            for style_layer in STYLE_LAYERS:
                layer = net[style_layer]
                _, height, width, number = map(lambda i: i.value,
                                               layer.get_shape())
                size = height * width * number
                feats = tf.reshape(layer, (-1, number))
                gram = tf.matmul(tf.transpose(feats), feats) / size
                style_gram = style_features[i][style_layer]
                style_losses.append(2 * tf.nn.l2_loss(gram - style_gram) /
                                    style_gram.size)
            style_loss += style_weight * style_blend_weights[i] * reduce(
                tf.add, style_losses)

        # total variation denoising
        tv_y_size = _tensor_size(image[:, 1:, :, :])
        tv_x_size = _tensor_size(image[:, :, 1:, :])
        tv_loss = tv_weight * 2 * (
            (tf.nn.l2_loss(image[:, 1:, :, :] -
                           image[:, :shape[1] - 1, :, :]) / tv_y_size) +
            (tf.nn.l2_loss(image[:, :, 1:, :] -
                           image[:, :, :shape[2] - 1, :]) / tv_x_size))

        # overall loss
        loss = content_loss + style_loss + tv_loss

        # optimizer setup
        train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)

        def print_progress(i, last=False):
            if print_iterations is not None:
                if (i is not None and i % print_iterations == 0) or last:
                    stderr.write('  content loss: %g\n' % content_loss.eval())
                    stderr.write('    style loss: %g\n' % style_loss.eval())
                    stderr.write('       tv loss: %g\n' % tv_loss.eval())
                    stderr.write('    total loss: %g\n' % loss.eval())

        # optimization
        best_loss = float('inf')
        best = None
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            for i in range(iterations):
                print_progress(i)
                stderr.write('Iteration %d/%d\n' % (i + 1, iterations))
                train_step.run()
                if (checkpoint_iterations is not None and
                        i % checkpoint_iterations == 0) or i == iterations - 1:
                    this_loss = loss.eval()
                    if this_loss < best_loss:
                        best_loss = this_loss
                        best = image.eval()
                    print_progress(None, i == iterations - 1)
                    if (i % 100 == 0) and print_image_iterations:
                        temp_image = vgg.unprocess(best.reshape(shape[1:]),
                                                   mean_pixel)
                        temp_output = 'iteration_' + str(i) + '.jpg'
                        imsave(temp_output, temp_image)
        return vgg.unprocess(best.reshape(shape[1:]), mean_pixel)
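# A NumPy restatement (illustrative only, tv_weight omitted) of the
# total-variation term built in the graphs above: squared differences between
# vertically and horizontally adjacent pixels, each normalized by the size of
# the shifted tensor; note tf.nn.l2_loss(x) = sum(x**2) / 2.
import numpy as np

img = np.random.rand(1, 8, 8, 3)
y_diff = img[:, 1:, :, :] - img[:, :-1, :, :]
x_diff = img[:, :, 1:, :] - img[:, :, :-1, :]
tv = 2 * ((np.sum(y_diff**2) / 2) / y_diff.size +
          (np.sum(x_diff**2) / 2) / x_diff.size)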
def stylize(network, initial, initial_noiseblend, content, styles,
            preserve_colors, iterations, content_weight, content_weight_blend,
            style_weight, style_layer_weight_exp, style_blend_weights,
            tv_weight, learning_rate, beta1, beta2, epsilon, pooling,
            print_iterations=None, checkpoint_iterations=None):
    """
    Stylize images.

    This function yields tuples (iteration, image); `iteration` is None
    if this is the final image (the last iteration). Other tuples are yielded
    every `checkpoint_iterations` iterations.

    :rtype: iterator[tuple[int|None,image]]
    """
    shape = (1,) + content.shape
    style_shapes = [(1,) + style.shape for style in styles]
    content_features = {}
    style_features = [{} for _ in styles]

    vgg_weights, vgg_mean_pixel = vgg.load_net(network)

    layer_weight = 1.0
    style_layers_weights = {}
    for style_layer in STYLE_LAYERS:
        style_layers_weights[style_layer] = layer_weight
        layer_weight *= style_layer_weight_exp

    # normalize style layer weights
    layer_weights_sum = 0
    for style_layer in STYLE_LAYERS:
        layer_weights_sum += style_layers_weights[style_layer]
    for style_layer in STYLE_LAYERS:
        style_layers_weights[style_layer] /= layer_weights_sum

    # compute content features in feedforward mode
    g = tf.Graph()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = '0'
    with g.as_default(), g.device('/cpu:0'), tf.Session(config=config) as sess:
        image = tf.placeholder('float', shape=shape)
        net = vgg.net_preloaded(vgg_weights, image, pooling)
        content_pre = np.array([vgg.preprocess(content, vgg_mean_pixel)])
        for layer in CONTENT_LAYERS:
            content_features[layer] = net[layer].eval(
                feed_dict={image: content_pre})

    # compute style features in feedforward mode
    for i in range(len(styles)):
        g = tf.Graph()
        with g.as_default(), g.device('/cpu:0'), tf.Session(
                config=config) as sess:
            image = tf.placeholder('float', shape=style_shapes[i])
            net = vgg.net_preloaded(vgg_weights, image, pooling)
            style_pre = np.array([vgg.preprocess(styles[i], vgg_mean_pixel)])
            for layer in STYLE_LAYERS:
                features = net[layer].eval(feed_dict={image: style_pre})
                features = np.reshape(features, (-1, features.shape[3]))
                gram = np.matmul(features.T, features) / features.size
                style_features[i][layer] = gram

    initial_content_noise_coeff = 1.0 - initial_noiseblend

    # make stylized image using backpropagation
    with tf.Graph().as_default():
        if initial is None:
            noise = np.random.normal(size=shape, scale=np.std(content) * 0.1)
            initial = tf.random_normal(shape) * 0.256
        else:
            initial = np.array([vgg.preprocess(initial, vgg_mean_pixel)])
            initial = initial.astype('float32')
            noise = np.random.normal(size=shape, scale=np.std(content) * 0.1)
            initial = (initial * initial_content_noise_coeff +
                       (tf.random_normal(shape) * 0.256) *
                       (1.0 - initial_content_noise_coeff))
        image = tf.Variable(initial)
        net = vgg.net_preloaded(vgg_weights, image, pooling)

        # content loss
        content_layers_weights = {}
        content_layers_weights['relu4_2'] = content_weight_blend
        content_layers_weights['relu5_2'] = 1.0 - content_weight_blend

        content_loss = 0
        content_losses = []
        for content_layer in CONTENT_LAYERS:
            '''
            Compute the content loss

            Variables:
            content_weight: scalar constant we multiply the content_loss by
            net[content_layer]: features of the current image,
                Tensor with shape [1, height, width, channels]
            content_features[content_layer]: features of the content image,
                Tensor with shape [1, height, width, channels]
            '''
            l_content = content_weight * tf.reduce_sum(
                (net[content_layer] - content_features[content_layer])**2)
            content_losses.append(content_layers_weights[content_layer] *
                                  l_content)
        content_loss += reduce(tf.add, content_losses)

        # style loss
        style_loss = 0
        for i in range(len(styles)):
            style_losses = []
            for style_layer in STYLE_LAYERS:
                layer = net[style_layer]
                _, height, width, channels = map(lambda i: i.value,
                                                 layer.get_shape())
                size = height * width * channels
                '''
                Compute the Gram matrix of the layer

                Variables:
                layer: features of the current image at style_layer,
                    Tensor with shape [1, height, width, channels]
                gram: computed Gram matrix with shape [channels, channels]
                '''
                feats = tf.reshape(layer, (-1, channels))
                gram = tf.matmul(tf.transpose(feats), feats)
                gram /= size
                '''
                Compute the style loss

                Variables:
                style_layers_weights[style_layer]: scalar constant we multiply
                    the style loss by
                gram: computed Gram matrix with shape [channels, channels]
                style_gram: Gram matrix of the style image at style_layer,
                    with shape [channels, channels]
                '''
                style_gram = style_features[i][style_layer]
                l_style = style_layers_weights[style_layer] * tf.reduce_sum(
                    (gram - style_gram)**2)
                style_losses.append(l_style)
            style_loss += style_weight * style_blend_weights[i] * reduce(
                tf.add, style_losses)

        # total variation denoising
        '''
        Compute the TV loss

        Variables:
        tv_weight: scalar giving the weight to use for the TV loss
        image: tensor of shape (1, H, W, 3) holding the current image
        '''
        tv_loss = tv_weight * (
            tf.reduce_sum((image[:, 1:, :, :] - image[:, :-1, :, :])**2) +
            tf.reduce_sum((image[:, :, 1:, :] - image[:, :, :-1, :])**2))

        # overall loss
        loss = content_loss + style_loss + tv_loss

        # optimizer setup
        train_step = tf.train.AdamOptimizer(learning_rate, beta1, beta2,
                                            epsilon).minimize(loss)

        def print_progress():
            stderr.write('  content loss: %g\n' % content_loss.eval())
            stderr.write('    style loss: %g\n' % style_loss.eval())
            stderr.write('       tv loss: %g\n' % tv_loss.eval())
            stderr.write('    total loss: %g\n' % loss.eval())

        # optimization
        best_loss = float('inf')
        best = None
        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            stderr.write('Optimization started...\n')
            if (print_iterations and print_iterations != 0):
                print_progress()
            for i in range(iterations):
                stderr.write('Iteration %4d/%4d\n' % (i + 1, iterations))
                train_step.run()
                last_step = (i == iterations - 1)
                if last_step or (print_iterations and
                                 i % print_iterations == 0):
                    print_progress()

                if (checkpoint_iterations and
                        i % checkpoint_iterations == 0) or last_step:
                    this_loss = loss.eval()
                    if this_loss < best_loss:
                        best_loss = this_loss
                        best = image.eval()
                    img_out = vgg.unprocess(best.reshape(shape[1:]),
                                            vgg_mean_pixel)
                    if preserve_colors:
                        original_image = np.clip(content, 0, 255)
                        styled_image = np.clip(img_out, 0, 255)

                        # Luminosity transfer steps:
                        # 1. Convert stylized RGB -> grayscale according to
                        #    Rec.601 luma (0.299, 0.587, 0.114)
                        # 2. Convert stylized grayscale into YUV (YCbCr)
                        # 3. Convert original image into YUV (YCbCr)
                        # 4. Recombine (stylizedYUV.Y, originalYUV.U,
                        #    originalYUV.V)
                        # 5. Convert recombined image from YUV back to RGB

                        # 1
                        styled_grayscale = rgb2gray(styled_image)
                        styled_grayscale_rgb = gray2rgb(styled_grayscale)
                        # 2
                        styled_grayscale_yuv = np.array(
                            Image.fromarray(styled_grayscale_rgb.astype(
                                np.uint8)).convert('YCbCr'))
                        # 3
                        original_yuv = np.array(
                            Image.fromarray(original_image.astype(
                                np.uint8)).convert('YCbCr'))
                        # 4
                        w, h, _ = original_image.shape
                        combined_yuv = np.empty((w, h, 3), dtype=np.uint8)
                        combined_yuv[..., 0] = styled_grayscale_yuv[..., 0]
                        combined_yuv[..., 1] = original_yuv[..., 1]
                        combined_yuv[..., 2] = original_yuv[..., 2]
                        # 5
                        img_out = np.array(
                            Image.fromarray(combined_yuv,
                                            'YCbCr').convert('RGB'))

                    yield ((None if last_step else i), img_out)
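# A standalone version of the luminosity-transfer branch above, runnable on
# any pair of same-sized RGB uint8 arrays (random placeholders here): it keeps
# the stylized luma (Y) and the original chroma (Cb, Cr).
import numpy as np
from PIL import Image

original = (np.random.rand(64, 64, 3) * 255).astype(np.uint8)
styled = (np.random.rand(64, 64, 3) * 255).astype(np.uint8)

styled_yuv = np.array(Image.fromarray(styled).convert('YCbCr'))
original_yuv = np.array(Image.fromarray(original).convert('YCbCr'))

combined = np.empty_like(original_yuv)
combined[..., 0] = styled_yuv[..., 0]      # stylized luminance
combined[..., 1:] = original_yuv[..., 1:]  # original chrominance
result = np.array(Image.fromarray(combined, 'YCbCr').convert('RGB'))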
def stylize(network, initial, initial_noiseblend, content, styles,
            preserve_colors, iterations, content_weight, content_weight_blend,
            style_weight, style_layer_weight_exp, style_blend_weights,
            tv_weight, learning_rate, beta1, beta2, epsilon, pooling,
            print_iterations=None, checkpoint_iterations=None):
    """
    Stylize images.

    This function yields tuples (iteration, image); `iteration` is None
    if this is the final image (the last iteration). Other tuples are yielded
    every `checkpoint_iterations` iterations.

    :rtype: iterator[tuple[int|None,image]]
    """
    shape = (1,) + content.shape
    style_shapes = [(1,) + style.shape for style in styles]
    content_features = {}
    style_features = [{} for _ in styles]

    # LOAD WEIGHTS AND AVG PIXEL HERE

    layer_weight = 1.0
    style_layers_weights = {}
    for style_layer in STYLE_LAYERS:
        style_layers_weights[style_layer] = layer_weight
        layer_weight *= style_layer_weight_exp

    # normalize style layer weights
    layer_weights_sum = 0
    for style_layer in STYLE_LAYERS:
        layer_weights_sum += style_layers_weights[style_layer]
    for style_layer in STYLE_LAYERS:
        style_layers_weights[style_layer] /= layer_weights_sum

    # compute content features in feedforward mode
    # SET UP GRAPH
    # RUN SESSION ON CPU
    # IMAGE PLACEHOLDER
    # LOAD VGG
    content_pre = np.array([vgg.preprocess(content, vgg_mean_pixel)])
    for layer in CONTENT_LAYERS:
        # evaluate features
        content_features[layer] = net[layer].eval(
            feed_dict={image: content_pre})

    # compute style features in feedforward mode
    for i in range(len(styles)):
        # CREATE GRAPH FOR EVERY STYLE
        # RUN SESSION ON CPU
        # IMAGE PLACEHOLDER
        # LOAD VGG
        style_pre = np.array([vgg.preprocess(styles[i], vgg_mean_pixel)])
        for layer in STYLE_LAYERS:
            # evaluate features
            features = net[layer].eval(feed_dict={image: style_pre})
            # create gram matrix
            features = np.reshape(features, (-1, features.shape[3]))
            # CREATE GRAM MATRIX
            style_features[i][layer] = gram

    initial_content_noise_coeff = 1.0 - initial_noiseblend

    # make stylized image using backpropagation
    # USE DEFAULT GRAPH
    if initial is None:
        noise = np.random.normal(size=shape, scale=np.std(content) * 0.1)
        initial = tf.random_normal(shape) * 0.256
    else:
        initial = np.array([vgg.preprocess(initial, vgg_mean_pixel)])
        initial = initial.astype('float32')
        noise = np.random.normal(size=shape, scale=np.std(content) * 0.1)
        initial = (initial * initial_content_noise_coeff +
                   (tf.random_normal(shape) * 0.256) *
                   (1.0 - initial_content_noise_coeff))
    # CREATE IMAGE VARIABLE FROM INITIAL
    # LOAD NET

    # content loss
    content_layers_weights = {}
    content_layers_weights['relu4_2'] = content_weight_blend
    content_layers_weights['relu5_2'] = 1.0 - content_weight_blend

    content_loss = 0
    content_losses = []
    for content_layer in CONTENT_LAYERS:
        content_losses.append(
            content_layers_weights[content_layer] * content_weight *
            (2 * tf.nn.l2_loss(net[content_layer] -
                               content_features[content_layer]) /
             content_features[content_layer].size))
    # ADD CONTENT LOSSES FOR EACH LAYER TOGETHER - USE REDUCE

    # style loss
    style_loss = 0
    for i in range(len(styles)):
        style_losses = []
        for style_layer in STYLE_LAYERS:
            layer = net[style_layer]
            _, height, width, number = map(lambda i: i.value,
                                           layer.get_shape())
            size = height * width * number
            feats = tf.reshape(layer, (-1, number))
            # CREATE GRAM MATRIX
            style_gram = style_features[i][style_layer]
            style_losses.append(style_layers_weights[style_layer] * 2 *
                                tf.nn.l2_loss(gram - style_gram) /
                                style_gram.size)
        style_loss += style_weight * style_blend_weights[i] * reduce(
            tf.add, style_losses)

    # total variation denoising
    tv_y_size = _tensor_size(image[:, 1:, :, :])
    tv_x_size = _tensor_size(image[:, :, 1:, :])
    tv_loss = tv_weight * 2 * (
        (tf.nn.l2_loss(image[:, 1:, :, :] -
                       image[:, :shape[1] - 1, :, :]) / tv_y_size) +
        (tf.nn.l2_loss(image[:, :, 1:, :] -
                       image[:, :, :shape[2] - 1, :]) / tv_x_size))

    # overall loss
    # LOSS FUNCTION LINE HERE

    # optimizer setup
    # TF.TRAIN LINE HERE - USE ADAM

    def print_progress():
        stderr.write('  content loss: %g\n' % content_loss.eval())
        stderr.write('    style loss: %g\n' % style_loss.eval())
        stderr.write('       tv loss: %g\n' % tv_loss.eval())
        stderr.write('    total loss: %g\n' % loss.eval())

    # optimization (backprop)
    best_loss = float('inf')
    best = None
    # START SESSION
    # INITIALIZE VARIABLES TO BEGIN OPTIMIZATION
    stderr.write('Optimization started...\n')
    if (print_iterations and print_iterations != 0):
        print_progress()
    iteration_times = []
    start = time.time()
    for i in range(iterations):
        iteration_start = time.time()
        if i > 0:
            elapsed = time.time() - start
            # take the average of the last few steps to estimate
            # the time per iteration
            remaining = np.mean(iteration_times[-10:]) * (iterations - i)
            stderr.write('Iteration %4d/%4d (%s elapsed, %s remaining)\n' %
                         (i + 1, iterations, hms(elapsed), hms(remaining)))
        else:
            stderr.write('Iteration %4d/%4d\n' % (i + 1, iterations))
        train_step.run()
        last_step = (i == iterations - 1)
        if last_step or (print_iterations and i % print_iterations == 0):
            print_progress()

        # backprop - replacing loss
        if (checkpoint_iterations and
                i % checkpoint_iterations == 0) or last_step:
            # CHECK LOSS HERE
            img_out = vgg.unprocess(best.reshape(shape[1:]), vgg_mean_pixel)
            if preserve_colors:
                original_image = np.clip(content, 0, 255)
                styled_image = np.clip(img_out, 0, 255)

                # Luminosity transfer steps:
                # 1. Convert stylized RGB -> grayscale according to Rec.601
                #    luma (0.299, 0.587, 0.114)
                # 2. Convert stylized grayscale into YUV (YCbCr)
                # 3. Convert original image into YUV (YCbCr)
                # 4. Recombine (stylizedYUV.Y, originalYUV.U, originalYUV.V)
                # 5. Convert recombined image from YUV back to RGB

                # 1
                styled_grayscale = rgb2gray(styled_image)
                styled_grayscale_rgb = gray2rgb(styled_grayscale)
                # 2
                styled_grayscale_yuv = np.array(
                    Image.fromarray(
                        styled_grayscale_rgb.astype(np.uint8)).convert('YCbCr'))
                # 3
                original_yuv = np.array(
                    Image.fromarray(
                        original_image.astype(np.uint8)).convert('YCbCr'))
                # 4
                w, h, _ = original_image.shape
                combined_yuv = np.empty((w, h, 3), dtype=np.uint8)
                combined_yuv[..., 0] = styled_grayscale_yuv[..., 0]
                combined_yuv[..., 1] = original_yuv[..., 1]
                combined_yuv[..., 2] = original_yuv[..., 2]
                # 5
                img_out = np.array(
                    Image.fromarray(combined_yuv, 'YCbCr').convert('RGB'))

            yield ((None if last_step else i), img_out)
        iteration_end = time.time()
        iteration_times.append(iteration_end - iteration_start)
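# One possible expansion (a sketch, not the official solution) of the
# "CREATE GRAM MATRIX" placeholders in the skeleton above, mirroring the
# complete implementations elsewhere in this file; feats has shape
# (H*W, number), so the resulting Gram matrix is (number, number).
gram = tf.matmul(tf.transpose(feats), feats) / size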
def stylize(network, initial, content, styles, iterations, content_weight,
            style_weight, style_blend_weights, tv_weight, learning_rate,
            print_iterations=None, checkpoint_iterations=None):
    # input.shape = (n_image, height, width, channel)
    content_shape = (1,) + content.shape
    # style_shapes = [all style shapes]
    style_shapes = [(1,) + style.shape for style in styles]
    content_features = {}
    style_features = [{} for _ in styles]  # for multiple style image inputs

    # compute content features
    g = tf.Graph()
    with g.as_default(), g.device('/cpu:0'), tf.Session() as sess:
        content_pl = tf.placeholder('float', shape=content_shape)
        # compute feedforward activations;
        # `activation` is the network's response to the placeholder
        activation, mean_pixel = vgg.net(network, content_pl)
        # preprocess the input
        content_preprocessed = np.array([vgg.preprocess(content, mean_pixel)])
        # extract content features by feeding the preprocessed input to VGG;
        # we only extract content features from one layer
        content_features[CONTENT_LAYER] = activation[CONTENT_LAYER].eval(
            feed_dict={content_pl: content_preprocessed})

    # compute style features
    # the loop below handles multiple style image inputs
    for i in range(len(styles)):
        g = tf.Graph()
        with g.as_default(), g.device('/cpu:0'), tf.Session() as sess:
            # different style images have different shapes
            style_pl = tf.placeholder('float', shape=style_shapes[i])
            # question: why do we use the mean value from the content pass?
            activation, _ = vgg.net(network, style_pl)
            style_preprocessed = np.array(
                [vgg.preprocess(styles[i], mean_pixel)])
            # since we compute styles at multiple layers, we loop over them
            for layer in STYLE_LAYERS:
                # extract the style features from one layer
                _features = activation[layer].eval(
                    feed_dict={style_pl: style_preprocessed})
                # we will compute the Gram matrix, so reshape the output to
                # make the inner product easier to compute
                # question: why should we reshape? what is the original shape?
                # what does -1 mean?
                _features = _features.reshape((-1, _features.shape[3]))
                # compute the Gram matrix as style features
                # question: why divide it by _features.size?
                gram = np.matmul(_features.T, _features) / _features.size
                # the first index is the n-th style image input
                style_features[i][layer] = gram

    # compute back-prop
    with tf.Graph().as_default():
        # initial = None means this iteration is our first iteration,
        # so we need to generate a white-noise image
        if initial is None:
            # the noise turned out to be not used at all
            white_noise_image = np.random.normal(size=content_shape,
                                                 scale=np.std(content) * .1)
            initial = tf.random_normal(content_shape) * .256
        # if we already have an image in training,
        # we keep using this image for further modification
        else:
            initial_preprocessed = np.array(
                [vgg.preprocess(initial, mean_pixel)])
            initial = initial_preprocessed.astype('float32')
        # we make this initial input a trainable variable
        image = tf.Variable(initial)
        activation, _ = vgg.net(network, image)

        # compute content loss
        image_content_features = activation[CONTENT_LAYER]
        target_content_features = content_features[CONTENT_LAYER]
        # question: why divide it by target.size -- can we eliminate that?
        # the content weight is included here rather than at the end
        content_loss = (content_weight * .5 * 1 /
                        target_content_features.size *
                        tf.nn.l2_loss(image_content_features -
                                      target_content_features))

        # compute style loss,
        # looping to sum the style loss over multiple style image inputs
        style_loss_for_all_styles = 0
        for i in range(len(styles)):
            style_losses = []  # the per-layer losses
            # loop to sum the style loss over multiple style layers
            for style_layer in STYLE_LAYERS:
                layer_activation = activation[style_layer]
                _, height, width, channel = map(
                    lambda i: i.value, layer_activation.get_shape())
                layer_size = height * width * channel
                feats = tf.reshape(layer_activation, (-1, channel))
                # it doesn't have to divide by size
                image_style_gram = tf.matmul(tf.transpose(feats),
                                             feats) / layer_size
                target_style_gram = style_features[i][style_layer]
                layer_style_loss = 2 / target_style_gram.size * tf.nn.l2_loss(
                    image_style_gram - target_style_gram)
                style_losses.append(layer_style_loss)
            style_loss_for_all_styles += (style_weight *
                                          style_blend_weights[i] *
                                          reduce(tf.add, style_losses))

        # total variation denoising:
        # this loss is added as a regularizer so that the output image
        # does not deviate too much from the content image at each pixel
        tv_y_size = _tensor_size(image[:, 1:, :, :])
        tv_x_size = _tensor_size(image[:, :, 1:, :])
        tv_loss = tv_weight * 2 * (
            (tf.nn.l2_loss(image[:, 1:, :, :] -
                           image[:, :content_shape[1] - 1, :, :]) /
             tv_y_size) +
            (tf.nn.l2_loss(image[:, :, 1:, :] -
                           image[:, :, :content_shape[2] - 1, :]) /
             tv_x_size))

        # overall loss
        loss = content_loss + style_loss_for_all_styles + tv_loss

        # optimizer
        train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)

        def print_progress(i, last=False):
            stderr.write('Iteration %d/%d\n' % (i + 1, iterations))
            if last or (print_iterations and i % print_iterations == 0):
                stderr.write('  content loss: %g\n' % content_loss.eval())
                stderr.write('    style loss: %g\n' %
                             style_loss_for_all_styles.eval())
                stderr.write('       tv loss: %g\n' % tv_loss.eval())
                stderr.write('    total loss: %g\n' % loss.eval())

        # optimization
        best_loss = float('inf')  # all losses will be lower than the initial
        best = None
        total_initial_time = datetime.now().replace(microsecond=0)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            initial_time = datetime.now().replace(microsecond=0)
            for i in range(iterations):
                now_time = datetime.now().replace(microsecond=0)
                last_step = (i == iterations - 1)
                print_progress(i, last=last_step)
                stderr.write('  Training Time %s  Elapsed Time %s\n' %
                             (str(now_time - initial_time),
                              str(now_time - total_initial_time)))
                initial_time = now_time
                train_step.run()
                # when checkpoint_iterations is set and the iteration index
                # fulfills it, or when this is the last step
                if (checkpoint_iterations and
                        i % checkpoint_iterations == 0) or last_step:
                    this_loss = loss.eval()
                    if this_loss < best_loss:
                        best_loss = this_loss
                        # image was a tf.Variable;
                        # eval() turns it into a numpy array
                        best = image.eval()
                    # yield makes this function a generator;
                    # content_shape[1:] keeps (height, width, channel) and
                    # drops shape[0], the number of images
                    yield ((None if last_step else i),
                           vgg.unprocess(best.reshape(content_shape[1:]),
                                         mean_pixel))
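# Answering the inline size-normalization question above with a toy check
# (random values): dividing F^T F by the feature tensor's size keeps the Gram
# magnitude roughly independent of spatial resolution, so style losses from
# layers of different sizes stay comparable.
import numpy as np

for hw in (8, 16):  # two spatial resolutions, same channel count
    f = np.random.rand(hw * hw, 4)
    gram = f.T @ f / f.size
    print(hw, gram.mean())  # similar scale despite 4x more pixels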
def train(content_targets, style_target, content_weight, style_weight,
          tv_weight, vgg_path, epochs=2, print_iterations=1000,
          batch_size=4, learning_rate=1e-3, save_path='model/style.ckpt'):
    # drop the trailing training images that do not fill a whole batch
    mod = len(content_targets) % batch_size
    if mod > 0:
        content_targets = content_targets[:-mod]

    style_features = {}
    # training image size: 320x320x3, in TensorFlow's NHWC format
    batch_shape = (batch_size, 320, 320, 3)
    style_shape = (1, ) + style_target.shape

    # load the pre-trained VGGNet model
    weights, mean_pixel = vgg.load_net(vgg_path)

    with tf.Graph().as_default(), tf.Session() as sess:
        style_image = tf.placeholder(tf.float32, shape=style_shape, name='style_image')
        # not a mistake: the mean pixel is subtracted from the placeholder image
        style_image_pre = vgg.preprocess(style_image, mean_pixel)
        net = vgg.net(weights, style_image_pre)
        # wrap the style image into an array (adds the batch dimension)
        style_pre = np.array([style_target])
        for layer in STYLE_LAYER:
            # fetch the computed activations of this layer
            features = net[layer].eval(feed_dict={style_image: style_pre})
            # reshape so each column corresponds to one filter of this layer (see the paper)
            features = np.reshape(features, (-1, features.shape[3]))
            # Gram Matrix: A'A (see the paper)
            gram = np.matmul(features.T, features) / features.size
            style_features[layer] = gram

    with tf.Graph().as_default(), tf.Session() as sess:
        x_content = tf.placeholder(tf.float32, shape=batch_shape, name='x_content')
        x_pre = vgg.preprocess(x_content, mean_pixel)

        content_features = {}
        content_net = vgg.net(weights, x_pre)
        # as above, extract the required layer
        content_features[CONTENT_LAYER] = content_net[CONTENT_LAYER]

        # run the residual (image transform) network
        preds = residual.net(x_content / 255.0)
        preds_pre = vgg.preprocess(preds, mean_pixel)
        net = vgg.net(weights, preds_pre)

        # account for every example in the batch
        content_size = _tensor_size(content_features[CONTENT_LAYER]) * batch_size
        assert _tensor_size(content_features[CONTENT_LAYER]) == _tensor_size(net[CONTENT_LAYER])
        # content loss: difference between features computed with and without the residual network
        content_loss = content_weight * (
            2 * tf.nn.l2_loss(net[CONTENT_LAYER] - content_features[CONTENT_LAYER]) / content_size)

        # style loss: difference between the transformed images and the style image
        style_losses = []
        for style_layer in STYLE_LAYER:
            layer = net[style_layer]
            bs, height, width, filters = map(lambda i: i.value, layer.get_shape())
            size = height * width * filters
            feats = tf.reshape(layer, (bs, height * width, filters))
            feats_T = tf.transpose(feats, perm=[0, 2, 1])
            # Gram Matrix: A'A (see the paper)
            grams = tf.matmul(feats_T, feats) / size
            style_gram = style_features[style_layer]
            style_losses.append(2 * tf.nn.l2_loss(grams - style_gram) / style_gram.size)
        style_loss = style_weight * functools.reduce(tf.add, style_losses) / batch_size

        # image denoising: Total Variation
        tv_y_size = _tensor_size(preds[:, 1:, :, :])
        tv_x_size = _tensor_size(preds[:, :, 1:, :])
        y_tv = tf.nn.l2_loss(preds[:, 1:, :, :] - preds[:, :batch_shape[1] - 1, :, :])
        x_tv = tf.nn.l2_loss(preds[:, :, 1:, :] - preds[:, :, :batch_shape[2] - 1, :])
        tv_loss = tv_weight * 2 * (x_tv / tv_x_size + y_tv / tv_y_size) / batch_size

        # the final loss function
        loss = content_loss + style_loss + tv_loss

        # start the training process
        train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)
        sess.run(tf.global_variables_initializer())
        for epoch in range(epochs):
            num_examples = len(content_targets)
            iterations = 0
            start_time = time.time()
            # each epoch makes one pass over all images in the training set
            while iterations * batch_size < num_examples:
                curr = iterations * batch_size
                step = curr + batch_size
                X_batch = np.zeros(batch_shape, dtype=np.float32)
                for j, img_p in enumerate(content_targets[curr:step]):
                    X_batch[j] = helper.read_img(img_p, (320, 320, 3)).astype(np.float32)
                iterations += 1
                # make sure each batch is fully populated
                assert X_batch.shape[0] == batch_size
                feed_dict = {x_content: X_batch}
                # run one training step
                train_step.run(feed_dict=feed_dict)
                # print training progress every few iterations
                is_print_iter = int(iterations) % print_iterations == 0
                # check whether this is the last iteration of the last epoch
                is_last = epoch == epochs - 1 and iterations * batch_size >= num_examples
                # print the progress information
                should_print = is_print_iter or is_last
                if should_print:
                    current_time = time.time()
                    delta_time = current_time - start_time
                    start_time = current_time
                    to_get = [style_loss, content_loss, tv_loss, loss, preds]
                    test_feed_dict = {x_content: X_batch}
                    tup = sess.run(to_get, feed_dict=test_feed_dict)
                    _style_loss, _content_loss, _tv_loss, _loss, _preds = tup
                    losses = (_style_loss, _content_loss, _tv_loss, _loss)
                    saver = tf.train.Saver()
                    res = saver.save(sess, save_path)
                    yield (_preds, losses, iterations, epoch, delta_time)
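# Usage sketch for train() above: it is a generator that yields
# (preds, losses, iterations, epoch, delta_time) at every print/checkpoint
# step, so a driver loop like the following is assumed (the paths and weight
# values are illustrative, not from the original code).
import glob

content_paths = glob.glob('train2014/*.jpg')  # assumed training-set location
style_img = helper.read_img('style.jpg', (320, 320, 3))
for preds_out, losses, it, epoch, dt in train(
        content_paths, style_img, content_weight=7.5, style_weight=100.0,
        tv_weight=200.0, vgg_path='imagenet-vgg-verydeep-19.mat'):
    style_l, content_l, tv_l, total_l = losses
    print('epoch %d iter %d (%.1fs): total loss %g' % (epoch, it, dt, total_l))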
def optimize(content_targets, style_target, content_weight, style_weight,
             tv_weight, vgg_path, epochs=2, print_iterations=1000,
             batch_size=4, checkpoint_dir='saver/fns.ckpt',
             summary_dir='summary/', learning_rate=1e-3):
    """
    Calculate the total loss and optimize the network.

    Args:
        content_targets: The content images.
        style_target: The style image.
        content_weight: Weight for the content loss.
        style_weight: Weight for the style loss.
        tv_weight: Weight for the total variation loss.
        vgg_path: Path of the VGG network.
        epochs: Number of epochs for training. Default: 2.
        print_iterations: How often to print the training loss. Default: 1000.
        batch_size: Default: 4.
        checkpoint_dir: Path to save the checkpoint.
        summary_dir: Path to save summaries.
        learning_rate: Default: 1e-3.
    Yields:
        The prediction, losses, iteration and epoch.
    """
    mod = len(content_targets) % batch_size
    if mod > 0:
        print("Train set has been trimmed slightly..")
        content_targets = content_targets[:-mod]  # discard the remainder

    batch_shape = (batch_size, 256, 256, 3)

    # precompute style features
    style_features = _style_features(style_target, vgg_path)

    X_content = tf.placeholder(tf.float32, shape=batch_shape, name="X_content")
    X_pre = vgg.preprocess(X_content)

    # compute content features
    content_features = {}
    content_net = vgg.net(vgg_path, X_pre)
    content_features[CONTENT_LAYER] = content_net[CONTENT_LAYER]

    # the content image is the input for both the transform network and the
    # loss network
    preds = transform.net(X_content / 255.0)
    preds_pre = vgg.preprocess(preds)
    net = vgg.net(vgg_path, preds_pre)

    # compute losses
    content_loss = _content_loss(content_weight, net, content_features, batch_size)
    style_loss = _style_loss(style_weight, net, style_features)
    tv_loss = _tv_loss(tv_weight, preds, batch_shape)
    loss = content_loss + style_loss + tv_loss

    train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    # summaries for TensorBoard
    tf.summary.scalar("content loss", content_loss)
    tf.summary.scalar("style loss", style_loss)
    tf.summary.scalar("tv loss", tv_loss)
    tf.summary.scalar("total loss", loss)
    summary_op = tf.summary.merge_all()
    writer = tf.summary.FileWriter(summary_dir, graph=tf.get_default_graph())

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(epochs):
            num_examples = len(content_targets)
            iterations = 0
            while iterations * batch_size < num_examples:
                curr = iterations * batch_size
                step = curr + batch_size
                X_batch = np.zeros(batch_shape, dtype=np.float32)
                for j, img_p in enumerate(content_targets[curr:step]):
                    X_batch[j] = get_img(img_p, (256, 256, 3)).astype(np.float32)  # resize content image
                iterations += 1
                assert X_batch.shape[0] == batch_size
                feed_dict = {X_content: X_batch}
                # train_step.run(feed_dict=feed_dict)
                summary, _ = sess.run([summary_op, train_step], feed_dict=feed_dict)

                is_print_iter = int(iterations) % print_iterations == 0
                is_last = epoch == epochs - 1 and iterations * batch_size >= num_examples
                should_print = is_print_iter or is_last
                if should_print:
                    to_get = [style_loss, content_loss, tv_loss, loss, preds]
                    test_feed_dict = {X_content: X_batch}
                    tup = sess.run(to_get, feed_dict=test_feed_dict)
                    style_loss_p, content_loss_p, tv_loss_p, loss_p, preds_p = tup
                    losses = (style_loss_p, content_loss_p, tv_loss_p, loss_p)
                    saver = tf.train.Saver(max_to_keep=5)
                    res = saver.save(sess, checkpoint_dir, iterations)
                    yield (preds_p, losses, iterations, epoch)
                if int(iterations) % 20 == 0:
                    writer.add_summary(summary)
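# The `_style_features` helper called above is not shown in this excerpt.  A
# sketch consistent with how the other snippets precompute style Gram matrices
# might look like this (an assumption, not the original helper):
def _style_features(style_target, vgg_path):
    style_features = {}
    style_shape = (1,) + style_target.shape
    with tf.Graph().as_default(), tf.Session():
        style_image = tf.placeholder(tf.float32, shape=style_shape, name='style_image')
        style_net = vgg.net(vgg_path, vgg.preprocess(style_image))
        style_pre = np.array([style_target])
        for layer in STYLE_LAYERS:
            features = style_net[layer].eval(feed_dict={style_image: style_pre})
            features = np.reshape(features, (-1, features.shape[3]))
            # size-normalized Gram matrix, as in the training loops above
            style_features[layer] = np.matmul(features.T, features) / features.size
    return style_features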
def stylize(network, initial, initial_noiseblend, content, styles, preserve_colors, iterations, content_weight, content_weight_blend, style_weight, style_layer_weight_exp, style_blend_weights, tv_weight, learning_rate, beta1, beta2, epsilon, pooling, print_iterations=None, checkpoint_iterations=None): """ Stylize images. This function yields tuples (iteration, image); `iteration` is None if this is the final image (the last iteration). Other tuples are yielded every `checkpoint_iterations` iterations. :rtype: iterator[tuple[int|None,image]] """ shape = (1,) + content.shape style_shapes = [(1,) + style.shape for style in styles] content_features = {} style_features = [{} for _ in styles] vgg_weights, vgg_mean_pixel = vgg.load_net(network) layer_weight = 1.0 style_layers_weights = {} for style_layer in STYLE_LAYERS: style_layers_weights[style_layer] = layer_weight layer_weight *= style_layer_weight_exp # normalize style layer weights layer_weights_sum = 0 for style_layer in STYLE_LAYERS: layer_weights_sum += style_layers_weights[style_layer] for style_layer in STYLE_LAYERS: style_layers_weights[style_layer] /= layer_weights_sum # compute content features in feedforward mode g = tf.Graph() with g.as_default(), g.device('/cpu:0'), tf.Session() as sess: image = tf.placeholder('float', shape=shape) net = vgg.net_preloaded(vgg_weights, image, pooling) content_pre = np.array([vgg.preprocess(content, vgg_mean_pixel)]) for layer in CONTENT_LAYERS: content_features[layer] = net[layer].eval(feed_dict={image: content_pre}) # compute style features in feedforward mode for i in range(len(styles)): g = tf.Graph() with g.as_default(), g.device('/cpu:0'), tf.Session() as sess: image = tf.placeholder('float', shape=style_shapes[i]) net = vgg.net_preloaded(vgg_weights, image, pooling) style_pre = np.array([vgg.preprocess(styles[i], vgg_mean_pixel)]) for layer in STYLE_LAYERS: features = net[layer].eval(feed_dict={image: style_pre}) features_bank = sk_image.extract_patches_2d(np.squeeze(features), (kernel_s, kernel_s)) style_features[i][layer] = [features_bank,features] initial_content_noise_coeff = 1.0 - initial_noiseblend # make stylized image using backpropogation with tf.Graph().as_default(): if initial is None: noise = np.random.normal(size=shape, scale=np.std(content) * 0.1) initial = tf.random_normal(shape) * 0.256 else: initial = np.array([vgg.preprocess(initial, vgg_mean_pixel)]) initial = initial.astype('float32') # noise = np.random.normal(size=shape, scale=np.std(content) * 0.1) initial = (initial) * initial_content_noise_coeff + (tf.random_normal(shape) * 0.256) * (1.0 - initial_content_noise_coeff) image = tf.Variable(initial) net = vgg.net_preloaded(vgg_weights, image, pooling) # content loss content_layers_weights = {} # content_layers_weights['relu4_2'] = content_weight_blend # content_layers_weights['relu5_2'] = 1.0 - content_weight_blend content_layers_weights['relu4_2'] = 0.5 content_layers_weights['relu5_2'] = 0.5 content_loss = 0 content_losses = [] for content_layer in CONTENT_LAYERS: content_losses.append(content_layers_weights[content_layer] * content_weight * (2 * tf.nn.l2_loss( net[content_layer] - content_features[content_layer]) / content_features[content_layer].size)) content_loss += reduce(tf.add, content_losses) # style loss style_loss = 0 for i in range(len(styles)): style_losses = [] for style_layer in STYLE_LAYERS: # Calculate normalized layer layer = tf.expand_dims(net[style_layer],axis=4) paddings = [[0, 0], [1,1], [1,1], [0, 0],[0,0]] layer_depth = 
layer.get_shape().as_list()[3] layer_pad = tf.pad(layer, paddings, "CONSTANT") layer_norm = tf.sqrt(tf.nn.conv3d(tf.pow(layer_pad,2),tf.ones((kernel_s,kernel_s,layer_depth,1,1),dtype=tf.float32),strides=[1, 1, 1, 1, 1],padding='VALID')) # Calculate normalized filter bank style_filters = np.transpose(style_features[i][style_layer][0],(1,2,3,0)) style_filters = np.expand_dims(style_filters,axis=3) style_filters_norm = np.sqrt(np.sum(np.power(style_filters,2),axis=(0,1,2))) style_filters_normalized = style_filters/style_filters_norm # Calculate normalized correlations layer_filtered = tf.nn.conv3d(layer_pad,style_filters_normalized,strides=[1, 1, 1, 1, 1],padding='VALID')/layer_norm # Find maximum response and index into the filters max_filter_response_idx = tf.squeeze(tf.argmax(layer_filtered,axis=4)) # max_filter_response_idx = tf.squeeze(tf.argmax(tf.abs(layer_filtered),axis=4)) max_filter_response_idx = tf.reshape(max_filter_response_idx,[-1]) max_filter_response_weight = tf.squeeze(tf.reduce_max(tf.abs(layer_filtered),axis=4)) max_filter_response_weight = tf.reshape(max_filter_response_weight,[-1]) max_filter_response_weight = max_filter_response_weight/tf.reduce_max(max_filter_response_weight) style_filters_tf = tf.transpose(tf.squeeze(tf.convert_to_tensor(style_filters, np.float32)),(3,0,1,2)) style_filters_tf_gathered = tf.gather(style_filters_tf,max_filter_response_idx) style_filters_tf_gathered = tf.reshape(style_filters_tf_gathered,(style_filters_tf_gathered.get_shape().as_list()[0], -1)) layer_patches = tf.extract_image_patches(tf.squeeze(layer_pad,axis=4), [1,kernel_s,kernel_s,1], [1,1,1,1], [1,1,1,1], padding="VALID") layer_size = tf.shape(layer_patches) layer_patches = tf.reshape(layer_patches,(-1, layer_size[3])) style_norm = tf.cast(layer_size[1]*layer_size[2]*layer_size[3],dtype=tf.float32) # gram1 = tf.matmul(tf.transpose(layer_patches), layer_patches) / style_norm # gram2 = tf.matmul(tf.transpose(style_filters_tf_gathered), style_filters_tf_gathered) / style_norm # style_losses.append(style_layers_weights[style_layer] * 2 * tf.nn.l2_loss(gram1- gram2)) loss_ = tf.reduce_mean(tf.reduce_mean(tf.pow(layer_patches-style_filters_tf_gathered, 2),axis=1)*tf.stop_gradient(max_filter_response_weight)) style_losses.append(style_layers_weights[style_layer] * 2 * loss_) style_loss += style_weight * style_blend_weights[i] * reduce(tf.add, style_losses) # total variation denoising tv_y_size = _tensor_size(image[:,1:,:,:]) tv_x_size = _tensor_size(image[:,:,1:,:]) tv_loss = tv_weight * 2 * ( (tf.nn.l2_loss(image[:,1:,:,:] - image[:,:shape[1]-1,:,:]) / tv_y_size) + (tf.nn.l2_loss(image[:,:,1:,:] - image[:,:,:shape[2]-1,:]) / tv_x_size)) # overall loss loss = content_loss + style_loss + tv_loss # optimizer setup train_step = tf.train.AdamOptimizer(learning_rate, beta1, beta2, epsilon).minimize(loss) def print_progress(): stderr.write(' content loss: %g\n' % content_loss.eval()) stderr.write(' style loss: %g\n' % style_loss.eval()) stderr.write(' tv loss: %g\n' % tv_loss.eval()) stderr.write(' total loss: %g\n' % loss.eval()) # optimization best_loss = float('inf') best = None with tf.Session() as sess: sess.run(tf.global_variables_initializer()) stderr.write('Optimization started...\n') if (print_iterations and print_iterations != 0): print_progress() for i in range(iterations): stderr.write('Iteration %4d/%4d\n' % (i + 1, iterations)) # print(str(max_filter_response_weight.eval())) # print(' ') train_step.run() last_step = (i == iterations - 1) if last_step or (print_iterations and i % 
print_iterations == 0): print_progress() if (checkpoint_iterations and i % checkpoint_iterations == 0) or last_step: this_loss = loss.eval() if this_loss < best_loss: best_loss = this_loss best = image.eval() img_out = vgg.unprocess(best.reshape(shape[1:]), vgg_mean_pixel) if preserve_colors and preserve_colors == True: original_image = np.clip(content, 0, 255) styled_image = np.clip(img_out, 0, 255) # Luminosity transfer steps: # 1. Convert stylized RGB->grayscale accoriding to Rec.601 luma (0.299, 0.587, 0.114) # 2. Convert stylized grayscale into YUV (YCbCr) # 3. Convert original image into YUV (YCbCr) # 4. Recombine (stylizedYUV.Y, originalYUV.U, originalYUV.V) # 5. Convert recombined image from YUV back to RGB # 1 styled_grayscale = rgb2gray(styled_image) styled_grayscale_rgb = gray2rgb(styled_grayscale) # 2 styled_grayscale_yuv = np.array(Image.fromarray(styled_grayscale_rgb.astype(np.uint8)).convert('YCbCr')) # 3 original_yuv = np.array(Image.fromarray(original_image.astype(np.uint8)).convert('YCbCr')) # 4 w, h, _ = original_image.shape combined_yuv = np.empty((w, h, 3), dtype=np.uint8) combined_yuv[..., 0] = styled_grayscale_yuv[..., 0] combined_yuv[..., 1] = original_yuv[..., 1] combined_yuv[..., 2] = original_yuv[..., 2] # 5 img_out = np.array(Image.fromarray(combined_yuv, 'YCbCr').convert('RGB')) yield ( (None if last_step else i), img_out )
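# The preserve_colors branch above recurs in several of these snippets.  The
# same five luminosity-transfer steps can be factored into a standalone
# helper; this is a sketch under the assumption that the grayscale conversion
# follows the Rec.601 luma weights named in the comments.
import numpy as np
from PIL import Image

def preserve_colors_yuv(content, styled):
    original_image = np.clip(content, 0, 255)
    styled_image = np.clip(styled, 0, 255)
    # 1. stylized RGB -> grayscale -> back to 3 channels
    styled_grayscale = np.dot(styled_image[..., :3], [0.299, 0.587, 0.114])
    styled_grayscale_rgb = np.stack([styled_grayscale] * 3, axis=-1)
    # 2./3. both images to YCbCr
    styled_yuv = np.array(Image.fromarray(styled_grayscale_rgb.astype(np.uint8)).convert('YCbCr'))
    original_yuv = np.array(Image.fromarray(original_image.astype(np.uint8)).convert('YCbCr'))
    # 4. luminance from the stylized image, chrominance from the original
    combined_yuv = np.empty(original_image.shape[:2] + (3,), dtype=np.uint8)
    combined_yuv[..., 0] = styled_yuv[..., 0]
    combined_yuv[..., 1] = original_yuv[..., 1]
    combined_yuv[..., 2] = original_yuv[..., 2]
    # 5. back to RGB
    return np.array(Image.fromarray(combined_yuv, 'YCbCr').convert('RGB'))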
G_sample)
# G_sample = generated dataset (fake)
# D_real & D_fake = unused (D_fake = probability G fools D)

""" Feature Loss """
# VGG
# content = imread('abbeyexample_copy.png')/256
# content = gray2rgb(rgb2gray(content))
shape = (1, 256, 256, 3)
pooling = 'avg'
CONTENT_LAYERS = ('relu4_2', 'relu5_2')
network = 'imagenet-vgg-verydeep-19.mat'
vgg_weights, vgg_mean_pixel = vgg.load_net(network)

# needs to be fed with (1, 256, 256, 3) objects
orig_image = tf.placeholder('float', shape=shape)
orig_content = vgg.preprocess(orig_image, vgg_mean_pixel)  # tensor (1, 256, 256, 3)

print('G_sample.shape', G_sample.shape)
G_sample_dim = G_sample
G_sample = tf.reshape(G_sample, (256, 256))
G_sample = tf.stack([G_sample, G_sample, G_sample], axis=2)  # tensor (256, 256, 3)
print('G_sample.shape', G_sample.shape)
gen_content = vgg.preprocess(G_sample, vgg_mean_pixel)
gen_content = tf.expand_dims(gen_content, 0)

orig_net = vgg.net_preloaded(vgg_weights, orig_content, pooling)
gen_net = vgg.net_preloaded(vgg_weights, gen_content, pooling)
# content_pre = np.array([vgg.preprocess(content, vgg_mean_pixel)])
# print('content_pre.shape', content_pre.shape)
def optimize(content_targets, style_target, content_weight, style_weight, tv_weight, vgg_path, epochs=2, print_iterations=1000, batch_size=4, save_path='saver/fns.ckpt', slow=False, learning_rate=1e-3, debug=False): if slow: batch_size = 1 mod = len(content_targets) % batch_size if mod > 0: print("Train set has been trimmed slightly..") content_targets = content_targets[:-mod] style_features = {} batch_shape = (batch_size,256,256,3) style_shape = (1,) + style_target.shape print(style_shape) # precompute style features with tf.Graph().as_default(), tf.device('/cpu:0'), tf.Session() as sess: style_image = tf.placeholder(tf.float32, shape=style_shape, name='style_image') style_image_pre = vgg.preprocess(style_image) net = vgg.net(vgg_path, style_image_pre) style_pre = np.array([style_target]) for layer in STYLE_LAYERS: features = net[layer].eval(feed_dict={style_image:style_pre}) features = np.reshape(features, (-1, features.shape[3])) gram = np.matmul(features.T, features) / features.size style_features[layer] = gram with tf.Graph().as_default(), tf.Session() as sess: X_content = tf.placeholder(tf.float32, shape=batch_shape, name="X_content") X_pre = vgg.preprocess(X_content) # precompute content features content_features = {} content_net = vgg.net(vgg_path, X_pre) content_features[CONTENT_LAYER] = content_net[CONTENT_LAYER] if slow: preds = tf.Variable( tf.random_normal(X_content.get_shape()) * 0.256 ) preds_pre = preds else: preds = transform.net(X_content/255.0) preds_pre = vgg.preprocess(preds) net = vgg.net(vgg_path, preds_pre) content_size = _tensor_size(content_features[CONTENT_LAYER])*batch_size assert _tensor_size(content_features[CONTENT_LAYER]) == _tensor_size(net[CONTENT_LAYER]) content_loss = content_weight * (2 * tf.nn.l2_loss( net[CONTENT_LAYER] - content_features[CONTENT_LAYER]) / content_size ) style_losses = [] for style_layer in STYLE_LAYERS: layer = net[style_layer] bs, height, width, filters = map(lambda i:i.value,layer.get_shape()) size = height * width * filters feats = tf.reshape(layer, (bs, height * width, filters)) feats_T = tf.transpose(feats, perm=[0,2,1]) grams = tf.matmul(feats_T, feats) / size style_gram = style_features[style_layer] style_losses.append(2 * tf.nn.l2_loss(grams - style_gram)/style_gram.size) style_loss = style_weight * functools.reduce(tf.add, style_losses) / batch_size # total variation denoising tv_y_size = _tensor_size(preds[:,1:,:,:]) tv_x_size = _tensor_size(preds[:,:,1:,:]) y_tv = tf.nn.l2_loss(preds[:,1:,:,:] - preds[:,:batch_shape[1]-1,:,:]) x_tv = tf.nn.l2_loss(preds[:,:,1:,:] - preds[:,:,:batch_shape[2]-1,:]) tv_loss = tv_weight*2*(x_tv/tv_x_size + y_tv/tv_y_size)/batch_size loss = content_loss + style_loss + tv_loss # overall loss train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss) sess.run(tf.global_variables_initializer()) import random uid = random.randint(1, 100) print("UID: %s" % uid) for epoch in range(epochs): num_examples = len(content_targets) iterations = 0 while iterations * batch_size < num_examples: start_time = time.time() curr = iterations * batch_size step = curr + batch_size X_batch = np.zeros(batch_shape, dtype=np.float32) for j, img_p in enumerate(content_targets[curr:step]): X_batch[j] = get_img(img_p, (256,256,3)).astype(np.float32) iterations += 1 assert X_batch.shape[0] == batch_size feed_dict = { X_content:X_batch } train_step.run(feed_dict=feed_dict) end_time = time.time() delta_time = end_time - start_time if debug: print("UID: %s, batch time: %s" % (uid, delta_time)) is_print_iter = 
int(iterations) % print_iterations == 0 if slow: is_print_iter = epoch % print_iterations == 0 is_last = epoch == epochs - 1 and iterations * batch_size >= num_examples should_print = is_print_iter or is_last if should_print: to_get = [style_loss, content_loss, tv_loss, loss, preds] test_feed_dict = { X_content:X_batch } tup = sess.run(to_get, feed_dict = test_feed_dict) _style_loss,_content_loss,_tv_loss,_loss,_preds = tup losses = (_style_loss, _content_loss, _tv_loss, _loss) if slow: _preds = vgg.unprocess(_preds) else: saver = tf.train.Saver() res = saver.save(sess, save_path) yield(_preds, losses, iterations, epoch)
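# `_tensor_size` is used throughout these snippets but never defined in this
# excerpt.  A definition consistent with its usage (number of elements per
# example, excluding the batch dimension) is sketched below; it matches the
# TF1-era `Dimension.value` convention used elsewhere in the code.
from functools import reduce
from operator import mul

def _tensor_size(tensor):
    # product of all static dimensions except the leading batch dimension
    return reduce(mul, (d.value for d in tensor.get_shape()[1:]), 1)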
def stylize(network, initial, initial_noiseblend, content, styles, preserve_colors,
            iterations, content_weight, content_weight_blend, style_weight,
            style_layer_weight_exp, style_blend_weights, tv_weight, learning_rate,
            beta1, beta2, epsilon, pooling, print_iterations=None, checkpoint_iterations=None):
    """
    Stylize images.

    This function yields tuples (iteration, image); `iteration` is None
    if this is the final image (the last iteration). Other tuples are yielded
    every `checkpoint_iterations` iterations.

    :rtype: iterator[tuple[int|None,image]]
    """
    shape = (1,) + content.shape  # if content.shape = (356, 600, 3), shape = (1, 356, 600, 3)
    style_shapes = [(1,) + style.shape for style in styles]
    content_features = {}  # dict for the content feature maps
    style_features = [{} for _ in styles]  # dicts for the style feature maps

    vgg_weights, vgg_mean_pixel = vgg.load_net(network)  # load the pre-trained model to get weights and mean_pixel

    layer_weight = 1.0
    style_layers_weights = {}
    for style_layer in STYLE_LAYERS:
        style_layers_weights[style_layer] = layer_weight
        layer_weight *= style_layer_weight_exp
        # if style_layer_weight_exp is set, style_layers_weights grows
        # exponentially per layer; the default of 1 means no growth

    # normalize style layer weights
    layer_weights_sum = 0
    for style_layer in STYLE_LAYERS:
        layer_weights_sum += style_layers_weights[style_layer]
    for style_layer in STYLE_LAYERS:
        style_layers_weights[style_layer] /= layer_weights_sum
        # rescale style_layers_weights into proportions so they sum to 1

    # first create a placeholder for the image, then pass content_pre to it via
    # eval()'s feed_dict; running the net yields the content feature maps
    # compute content features in feedforward mode
    g = tf.Graph()
    with g.as_default(), g.device('/cpu:0'), tf.compat.v1.Session() as sess:  # compute content features
        image = tf.compat.v1.placeholder('float', shape=shape)
        net = vgg.net_preloaded(vgg_weights, image, pooling)  # the whole network is built here; net maps layers to feature tensors
        content_pre = np.array([vgg.preprocess(content, vgg_mean_pixel)])  # content - vgg_mean_pixel
        for layer in CONTENT_LAYERS:
            content_features[layer] = net[layer].eval(feed_dict={image: content_pre})  # store the content feature values
            # print(layer, content_features[layer].shape)

    # compute style features in feedforward mode
    for i in range(len(styles)):  # compute style features
        g = tf.Graph()
        with g.as_default(), g.device('/cpu:0'), tf.compat.v1.Session() as sess:
            image = tf.compat.v1.placeholder('float', shape=style_shapes[i])
            net = vgg.net_preloaded(vgg_weights, image, pooling)  # pooling defaults to MAX
            style_pre = np.array([vgg.preprocess(styles[i], vgg_mean_pixel)])  # styles[i] - vgg_mean_pixel
            for layer in STYLE_LAYERS:
                features = net[layer].eval(feed_dict={image: style_pre})
                features = np.reshape(features, (-1, features.shape[3]))  # reshape according to the number of channels
                gram = np.matmul(features.T, features) / features.size  # Gram matrix
                style_features[i][layer] = gram

    initial_content_noise_coeff = 1.0 - initial_noiseblend

    # make stylized image using backpropagation
    with tf.Graph().as_default():
        if initial is None:
            noise = np.random.normal(size=shape, scale=np.std(content) * 0.1)
            initial = tf.random_normal(shape) * 0.256  # initialize the image
        else:
            initial = np.array([vgg.preprocess(initial, vgg_mean_pixel)])
            initial = initial.astype('float32')
            noise = np.random.normal(size=shape, scale=np.std(content) * 0.1)
            initial = (initial) * initial_content_noise_coeff + (
                tf.random.normal(shape) * 0.256) * (1.0 - initial_content_noise_coeff)
        image = tf.Variable(initial)
        '''
        image = tf.Variable(initial) initializes a TensorFlow variable, which is
        the object we train. Note that what we train here is an image, not
        weights and biases.
        '''
        net = vgg.net_preloaded(vgg_weights, image, pooling)  # here net holds the feature maps of the generated image

        # content loss
        content_layers_weights = {}
        content_layers_weights['relu4_2'] = content_weight_blend  # content weight blend: conv4_2 * blend + conv5_2 * (1 - blend)
        content_layers_weights['relu5_2'] = 1.0 - content_weight_blend  # content_weight_blend defaults to 1, i.e. only the conv4_2 layer is used

        content_loss = 0
        content_losses = []
        for content_layer in CONTENT_LAYERS:
            content_losses.append(content_layers_weights[content_layer] * content_weight * (
                2 * tf.nn.l2_loss(net[content_layer] - content_features[content_layer]) /  # generated image - content image
                content_features[content_layer].size))  # tf.nn.l2_loss: output = sum(t ** 2) / 2
        content_loss += reduce(tf.add, content_losses)

        # style loss
        style_loss = 0
        '''
        Since multiple style images can be supplied, a for loop is used. As
        before, style_pre was fed to the image placeholder and the net was run
        to get the style feature maps; because style is the inner product of
        different filter responses, one extra step was added:
        gram = np.matmul(features.T, features) / features.size, which is the
        style feature.
        '''
        for i in range(len(styles)):
            style_losses = []
            for style_layer in STYLE_LAYERS:
                layer = net[style_layer]
                _, height, width, number = map(lambda i: i.value, layer.get_shape())
                size = height * width * number
                feats = tf.reshape(layer, (-1, number))
                gram = tf.matmul(tf.transpose(feats), feats) / size  # Gram matrix of the generated image
                style_gram = style_features[i][style_layer]
                style_losses.append(style_layers_weights[style_layer] * 2 *
                                    tf.nn.l2_loss(gram - style_gram) / style_gram.size)
            style_loss += style_weight * style_blend_weights[i] * reduce(tf.add, style_losses)

        # total variation denoising
        tv_y_size = _tensor_size(image[:, 1:, :, :])
        tv_x_size = _tensor_size(image[:, :, 1:, :])
        tv_loss = tv_weight * 2 * (
            (tf.nn.l2_loss(image[:, 1:, :, :] - image[:, :shape[1] - 1, :, :]) / tv_y_size) +
            (tf.nn.l2_loss(image[:, :, 1:, :] - image[:, :, :shape[2] - 1, :]) / tv_x_size))

        # overall loss
        '''
        The content loss and style loss defined above follow the formulas in
        the paper; total variation denoising is added on top, so the total
        loss = content_loss + style_loss + tv_loss.
        '''
        loss = content_loss + style_loss + tv_loss  # the total loss is the sum of the three losses

        # optimizer setup
        # create train_step with the Adam optimizer; the optimization target is
        # the loss above. Iterating train_step minimizes the loss, and the best
        # image found along the way is the result.
        train_step = tf.compat.v1.train.AdamOptimizer(learning_rate, beta1, beta2, epsilon).minimize(loss)

        def print_progress():
            stderr.write('  content loss: %g\n' % content_loss.eval())
            stderr.write('    style loss: %g\n' % style_loss.eval())
            stderr.write('       tv loss: %g\n' % tv_loss.eval())
            stderr.write('    total loss: %g\n' % loss.eval())

        # optimization
        best_loss = float('inf')
        best = None
        with tf.compat.v1.Session() as sess:
            sess.run(tf.compat.v1.global_variables_initializer())
            stderr.write('Optimization started...\n')
            if (print_iterations and print_iterations != 0):
                print_progress()
            for i in range(iterations):
                stderr.write('Iteration %4d/%4d\n' % (i + 1, iterations))
                train_step.run()

                last_step = (i == iterations - 1)
                if last_step or (print_iterations and i % print_iterations == 0):
                    print_progress()

                if (checkpoint_iterations and i % checkpoint_iterations == 0) or last_step:
                    this_loss = loss.eval()
                    if this_loss < best_loss:
                        best_loss = this_loss
                        best = image.eval()

                    img_out = vgg.unprocess(best.reshape(shape[1:]), vgg_mean_pixel)  # un-preprocess the image

                    if preserve_colors and preserve_colors == True:
                        original_image = np.clip(content, 0, 255)
                        styled_image = np.clip(img_out, 0, 255)

                        # Luminosity transfer steps:
                        # 1. Convert stylized RGB->grayscale according to Rec.601 luma (0.299, 0.587, 0.114)
                        # 2. Convert stylized grayscale into YUV (YCbCr)
                        # 3. Convert original image into YUV (YCbCr)
                        # 4. Recombine (stylizedYUV.Y, originalYUV.U, originalYUV.V)
                        # 5. Convert recombined image from YUV back to RGB

                        # 1
                        styled_grayscale = rgb2gray(styled_image)
                        styled_grayscale_rgb = gray2rgb(styled_grayscale)
                        # 2
                        styled_grayscale_yuv = np.array(Image.fromarray(styled_grayscale_rgb.astype(np.uint8)).convert('YCbCr'))
                        # 3
                        original_yuv = np.array(Image.fromarray(original_image.astype(np.uint8)).convert('YCbCr'))
                        # 4
                        w, h, _ = original_image.shape
                        combined_yuv = np.empty((w, h, 3), dtype=np.uint8)
                        combined_yuv[..., 0] = styled_grayscale_yuv[..., 0]
                        combined_yuv[..., 1] = original_yuv[..., 1]
                        combined_yuv[..., 2] = original_yuv[..., 2]
                        # 5
                        img_out = np.array(Image.fromarray(combined_yuv, 'YCbCr').convert('RGB'))

                    yield (  # like return, but for iteration
                        (None if last_step else i),
                        img_out)
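# A tiny NumPy check of the Gram computation used above: for a feature map of
# shape (1, H, W, C), reshaping to (H*W, C) and multiplying features.T by
# features yields a (C, C) matrix of filter co-activations.
import numpy as np

features_demo = np.random.rand(1, 4, 5, 3).astype(np.float32)   # (1, H=4, W=5, C=3)
flat = features_demo.reshape(-1, features_demo.shape[3])        # (20, 3)
gram_demo = np.matmul(flat.T, flat) / flat.size                 # (3, 3), size-normalized
assert gram_demo.shape == (3, 3)
assert np.allclose(gram_demo, gram_demo.T)  # Gram matrices are symmetric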
def main():
    content_path, style_path, width, style_scale = sys.argv[1:]
    width = int(width)
    style_scale = float(style_scale)
    content_image = imread(content_path)
    style_image = imread(style_path)
    if width > 0:
        new_shape = (int(math.floor(float(content_image.shape[0]) /
                                    content_image.shape[1] * width)), width)
        content_image = sm.imresize(content_image, new_shape)
    if style_scale > 0:
        style_image = sm.imresize(style_image, style_scale)

    shape = (1,) + content_image.shape
    style_shape = (1,) + style_image.shape
    content_features = {}
    style_features = {}

    g = tf.Graph()
    with g.as_default(), g.device('/cpu:0'), tf.Session() as sess:
        image = tf.placeholder('float', shape=shape)
        net, mean_pixel = vgg.net(VGG_PATH, image)
        content_pre = np.array([vgg.preprocess(content_image, mean_pixel)])
        content_features[CONTENT_LAYER] = net[CONTENT_LAYER].eval(
            feed_dict={image: content_pre})

    g = tf.Graph()
    with g.as_default(), g.device('/cpu:0'), tf.Session() as sess:
        image = tf.placeholder('float', shape=style_shape)
        net, _ = vgg.net(VGG_PATH, image)
        style_pre = np.array([vgg.preprocess(style_image, mean_pixel)])
        for layer in STYLE_LAYERS:
            features = net[layer].eval(feed_dict={image: style_pre})
            features = np.reshape(features, (-1, features.shape[3]))
            grammatrix = np.matmul(features.T, features)
            style_features[layer] = grammatrix

    g = tf.Graph()
    with g.as_default():
        global_step = tf.Variable(0, trainable=False)
        noise = np.random.normal(size=shape, scale=np.std(content_image) * 0.1)
        content_pre = vgg.preprocess(content_image, mean_pixel)
        init = content_pre * (1 - NOISE_RATIO) + noise * NOISE_RATIO
        init = init.astype('float32')
        image = tf.Variable(init)
        net, _ = vgg.net(VGG_PATH, image)

        content_loss = tf.nn.l2_loss(net[CONTENT_LAYER] - content_features[CONTENT_LAYER])
        style_losses = []
        for i in STYLE_LAYERS:
            layer = net[i]
            _, height, width, number = map(lambda i: i.value, layer.get_shape())
            feats = tf.reshape(layer, (-1, number))
            gram = tf.matmul(tf.transpose(feats), feats)
            style_gram = style_features[i]
            style_losses.append(tf.nn.l2_loss(gram - style_gram) /
                                (4.0 * number ** 2 * (height * width) ** 2))
        style_loss = reduce(tf.add, style_losses) / len(style_losses)
        loss = ALPHA * content_loss + BETA * style_loss

        learning_rate = tf.train.exponential_decay(LEARNING_RATE_INITIAL, global_step,
                                                   LEARNING_DECAY_STEPS, LEARNING_DECAY_BASE,
                                                   staircase=True)
        train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss, global_step=global_step)

        with tf.Session() as sess:
            sess.run(tf.initialize_all_variables())
            for i in range(100000):
                print('i = %d' % i)
                imsave('%05d.jpg' % i, vgg.unprocess(
                    image.eval().reshape(shape[1:]), mean_pixel))
                train_step.run()
VGG_PATH) # convert the tf.nn.conv2d to slim format of vgg decoder = Decoder(mode='test', weights_path=DECODER_PATH) content_input = tf.placeholder(tf.float32, shape=(1, None, None, 3), name='content_input') style_input = tf.placeholder(tf.float32, shape=(1, None, None, 3), name='style_input') # switch RGB to BGR content = tf.reverse(content_input, axis=[-1]) style = tf.reverse(style_input, axis=[-1]) # preprocess image content = vgg.preprocess(content) style = vgg.preprocess(style) encoder_content, encoder_content_points = vgg.vgg_19( content, reuse=False, final_endpoint="conv4_1") encoder_style, encoder_style_points = vgg.vgg_19( style, reuse=True, final_endpoint="conv4_1") # pass the encoded images to AdaIN target_features = AdaIN(encoder_content, encoder_style) # decode target features back to image with tf.variable_scope("decoder_target"): #alpha = 0.8 #target_features=(1-alpha)*encoder_content+alpha*target_features #content-style trade-off generated_img = decoder.decode(target_features)
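# The AdaIN helper used above is referenced but not defined in this snippet.
# A common TF1-style implementation of adaptive instance normalization (an
# assumption, not necessarily this repo's version): align the channel-wise
# mean/std of the content features with those of the style features.
def AdaIN(content_features, style_features, epsilon=1e-5):
    # per-image, per-channel statistics over the spatial axes
    c_mean, c_var = tf.nn.moments(content_features, axes=[1, 2], keep_dims=True)
    s_mean, s_var = tf.nn.moments(style_features, axes=[1, 2], keep_dims=True)
    c_std = tf.sqrt(c_var + epsilon)
    s_std = tf.sqrt(s_var + epsilon)
    # normalize the content features, then rescale/shift with the style statistics
    return s_std * (content_features - c_mean) / c_std + s_mean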
def stylize(network, initial, initial_noiseblend, content, styles, preserve_colors, iterations, content_weight, content_weight_blend, style_weight, style_layer_weight_exp, style_blend_weights, tv_weight, learning_rate, beta1, beta2, epsilon, pooling, print_iterations=None, checkpoint_iterations=None): """ Stylize images. This function yields tuples (iteration, image); `iteration` is None if this is the final image (the last iteration). Otherwise tuples are yielded every `checkpoint_iterations` iterations. :rtype: iterator[tuple[int|None,image]] """ # The shape information in the comment is based on the content image 1-content.jpg with shape (533, 400, 3) # and 1-style.jpg (316, 400, 3) # This should be changed with different images. shape = (1,) + content.shape # (1, 533, 400, 3) style_shapes = [(1,) + style.shape for style in styles] # (1, 316, 400, 3) content_features = {} style_features = [{} for _ in styles] vgg_weights, vgg_mean_pixel = vgg.load_net(network) # Load the VGG-19 model. layer_weight = 1.0 style_layers_weights = {} for style_layer in STYLE_LAYERS: style_layers_weights[style_layer] = layer_weight # {'relu1_1': 1.0, 'relu2_1': 1.0, 'relu3_1': 1.0, 'relu4_1': 1.0, 'relu5_1': 1.0} layer_weight *= style_layer_weight_exp # 1.0 # VGG19 layers: # 'conv1_1', 'relu1_1', 'conv1_2', 'relu1_2', 'pool1', # 'conv2_1', 'relu2_1', 'conv2_2', 'relu2_2', 'pool2', # 'conv3_1', 'relu3_1', 'conv3_2', 'relu3_2', 'conv3_3', 'relu3_3', 'conv3_4', 'relu3_4', 'pool3', # 'conv4_1', 'relu4_1', 'conv4_2', 'relu4_2', 'conv4_3', 'relu4_3', 'conv4_4', 'relu4_4', 'pool4', # 'conv5_1', 'relu5_1', 'conv5_2', 'relu5_2', 'conv5_3', 'relu5_3', 'conv5_4', 'relu5_4' # normalize style layer weights layer_weights_sum = 0 for style_layer in STYLE_LAYERS: # ('relu1_1', 'relu2_1', 'relu3_1', 'relu4_1', 'relu5_1') layer_weights_sum += style_layers_weights[style_layer] # 5.0 for style_layer in STYLE_LAYERS: style_layers_weights[style_layer] /= layer_weights_sum # {'relu1_1': 0.2, 'relu2_1': 0.2, 'relu3_1': 0.2, 'relu4_1': 0.2, 'relu5_1': 0.2} # compute content features in feedforward mode g = tf.Graph() with g.as_default(), g.device('/cpu:0'), tf.Session() as sess: image = tf.placeholder('float', shape=shape) net = vgg.net_preloaded(vgg_weights, image, pooling) # {'conv1_1': Tensor..., relu1_1: Tensor...} content_pre = np.array([vgg.preprocess(content, vgg_mean_pixel)]) # (1, 533, 400, 3) subtract with the mean pixel for layer in CONTENT_LAYERS: # (relu4_2, relu5_2) content_features[layer] = net[layer].eval(feed_dict={image: content_pre}) # Find the feature values for (relu4_2, relu5_2) # compute style features in feed forward mode for i in range(len(styles)): g = tf.Graph() with g.as_default(), g.device('/cpu:0'), tf.Session() as sess: image = tf.placeholder('float', shape=style_shapes[i]) # (1, 316, 400, 3) net = vgg.net_preloaded(vgg_weights, image, pooling) style_pre = np.array([vgg.preprocess(styles[i], vgg_mean_pixel)]) for layer in STYLE_LAYERS: # # ('relu1_1', 'relu2_1', 'relu3_1', 'relu4_1', 'relu5_1') features = net[layer].eval(feed_dict={image: style_pre}) # For relu1_1 layer (1, 316, 400, 64) features = np.reshape(features, (-1, features.shape[3])) # (126400, 64) gram = np.matmul(features.T, features) / features.size # (64, 64) Gram matrix - measure the dependency of features. 
style_features[i][layer] = gram initial_content_noise_coeff = 1.0 - initial_noiseblend # 0 # make stylized image using backpropogation with tf.Graph().as_default(): if initial is None: noise = np.random.normal(size=shape, scale=np.std(content) * 0.1) # Generate a random image with SD the same as the content image. initial = tf.random_normal(shape) * 0.256 else: initial = np.array([vgg.preprocess(initial, vgg_mean_pixel)]) initial = initial.astype('float32') noise = np.random.normal(size=shape, scale=np.std(content) * 0.1) initial = (initial) * initial_content_noise_coeff + (tf.random_normal(shape) * 0.256) * (1.0 - initial_content_noise_coeff) image = tf.Variable(initial) net = vgg.net_preloaded(vgg_weights, image, pooling) # content loss content_layers_weights = {} content_layers_weights['relu4_2'] = content_weight_blend content_layers_weights['relu5_2'] = 1.0 - content_weight_blend content_loss = 0 content_losses = [] for content_layer in CONTENT_LAYERS: # {'relu5_2'} # Use MSE as content losses content_losses.append(content_layers_weights[content_layer] * content_weight * (2 * tf.nn.l2_loss( net[content_layer] - content_features[content_layer]) / content_features[content_layer].size)) content_loss += reduce(tf.add, content_losses) # style loss style_loss = 0 for i in range(len(styles)): style_losses = [] for style_layer in STYLE_LAYERS: layer = net[style_layer] # For relu1_1: (1, 533, 400, 64) _, height, width, number = map(lambda i: i.value, layer.get_shape()) size = height * width * number feats = tf.reshape(layer, (-1, number)) # (213200, 64) gram = tf.matmul(tf.transpose(feats), feats) / size # Gram matrix for the features in relu1_1 for the result image. style_gram = style_features[i][style_layer] # Gram matrix for the style # Style loss is the MSE for the difference of the 2 Gram matrix style_losses.append(style_layers_weights[style_layer] * 2 * tf.nn.l2_loss(gram - style_gram) / style_gram.size) style_loss += style_weight * style_blend_weights[i] * reduce(tf.add, style_losses) # Total variation denoising: Add cost to penalize neighboring pixel is very different. # This help to reduce noise. 
tv_y_size = _tensor_size(image[:,1:,:,:]) tv_x_size = _tensor_size(image[:,:,1:,:]) tv_loss = tv_weight * 2 * ( (tf.nn.l2_loss(image[:,1:,:,:] - image[:,:shape[1]-1,:,:]) / tv_y_size) + (tf.nn.l2_loss(image[:,:,1:,:] - image[:,:,:shape[2]-1,:]) / tv_x_size)) # overall loss loss = content_loss + style_loss + tv_loss # optimizer setup train_step = tf.train.AdamOptimizer(learning_rate, beta1, beta2, epsilon).minimize(loss) def print_progress(): stderr.write(' content loss: %g\n' % content_loss.eval()) stderr.write(' style loss: %g\n' % style_loss.eval()) stderr.write(' tv loss: %g\n' % tv_loss.eval()) stderr.write(' total loss: %g\n' % loss.eval()) # optimization best_loss = float('inf') best = None with tf.Session() as sess: sess.run(tf.global_variables_initializer()) stderr.write('Optimization started...\n') if (print_iterations and print_iterations != 0): print_progress() for i in range(iterations): stderr.write('Iteration %4d/%4d\n' % (i + 1, iterations)) train_step.run() last_step = (i == iterations - 1) if last_step or (print_iterations and i % print_iterations == 0): print_progress() if (checkpoint_iterations and i % checkpoint_iterations == 0) or last_step: this_loss = loss.eval() if this_loss < best_loss: best_loss = this_loss best = image.eval() img_out = vgg.unprocess(best.reshape(shape[1:]), vgg_mean_pixel) if preserve_colors and preserve_colors == True: original_image = np.clip(content, 0, 255) styled_image = np.clip(img_out, 0, 255) # Luminosity transfer steps: # 1. Convert stylized RGB->grayscale accoriding to Rec.601 luma (0.299, 0.587, 0.114) # 2. Convert stylized grayscale into YUV (YCbCr) # 3. Convert original image into YUV (YCbCr) # 4. Recombine (stylizedYUV.Y, originalYUV.U, originalYUV.V) # 5. Convert recombined image from YUV back to RGB # 1 styled_grayscale = rgb2gray(styled_image) styled_grayscale_rgb = gray2rgb(styled_grayscale) # 2 styled_grayscale_yuv = np.array(Image.fromarray(styled_grayscale_rgb.astype(np.uint8)).convert('YCbCr')) # 3 original_yuv = np.array(Image.fromarray(original_image.astype(np.uint8)).convert('YCbCr')) # 4 w, h, _ = original_image.shape combined_yuv = np.empty((w, h, 3), dtype=np.uint8) combined_yuv[..., 0] = styled_grayscale_yuv[..., 0] combined_yuv[..., 1] = original_yuv[..., 1] combined_yuv[..., 2] = original_yuv[..., 2] # 5 img_out = np.array(Image.fromarray(combined_yuv, 'YCbCr').convert('RGB')) yield ( (None if last_step else i), img_out )
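# A quick NumPy sanity check of the shape bookkeeping in the annotations
# above: a relu1_1-sized activation (1, 316, 400, 64) flattens to
# (126400, 64) and produces a (64, 64) Gram matrix.
import numpy as np

act = np.zeros((1, 316, 400, 64), dtype=np.float32)
flat_act = np.reshape(act, (-1, act.shape[3]))
assert flat_act.shape == (316 * 400, 64)          # (126400, 64)
gram_check = np.matmul(flat_act.T, flat_act) / flat_act.size
assert gram_check.shape == (64, 64)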
def stylize(network, initial, content, styles, iterations, content_weight,
            style_weight, style_blend_weights, tv_weight, learning_rate,
            print_iterations=None, checkpoint_iterations=None):
    shape = (1,) + content.shape
    style_shapes = [(1,) + style.shape for style in styles]
    content_features = {}
    style_features = [{} for _ in styles]

    # compute content features in feedforward mode
    g = tf.Graph()
    with g.as_default(), g.device('/cpu:0'), tf.Session() as sess:
        image = tf.placeholder('float', shape=shape)
        net, mean_pixel = vgg.net(network, image)
        content_pre = np.array([vgg.preprocess(content, mean_pixel)])
        content_features[CONTENT_LAYER] = net[CONTENT_LAYER].eval(
            feed_dict={image: content_pre})

    # compute style features in feedforward mode
    for i in range(len(styles)):
        g = tf.Graph()
        with g.as_default(), g.device('/cpu:0'), tf.Session() as sess:
            image = tf.placeholder('float', shape=style_shapes[i])
            net, _ = vgg.net(network, image)
            style_pre = np.array([vgg.preprocess(styles[i], mean_pixel)])
            for layer in STYLE_LAYERS:
                features = net[layer].eval(feed_dict={image: style_pre})
                print('Initial feature shape:', features.shape)
                features = np.reshape(features, (-1, features.shape[3]))
                # mask = np.zeros_like(features)
                # mask[:49664/2, :] = 1
                # print('Mask shape', mask.shape)
                print('Final features shape', features.shape)
                # features = features*mask
                gram = np.matmul(features.T, features) / features.size
                print('Gram matrix shape:', gram.shape)
                style_features[i][layer] = gram
                # sys.exit()

    # make stylized image using backpropagation
    with tf.Graph().as_default():
        if initial is None:
            noise = np.random.normal(size=shape, scale=np.std(content) * 0.1)
            initial = tf.random_normal(shape) * 0.256
        else:
            initial = np.array([vgg.preprocess(initial, mean_pixel)])
            initial = initial.astype('float32')
        image = tf.Variable(initial)
        net, _ = vgg.net(network, image)

        # content loss
        content_loss = content_weight * (2 * tf.nn.l2_loss(
            net[CONTENT_LAYER] - content_features[CONTENT_LAYER]) /
            content_features[CONTENT_LAYER].size)
        # style loss
        style_loss = 0
        for i in range(len(styles)):
            style_losses = []
            for style_layer in STYLE_LAYERS:
                layer = net[style_layer]
                _, height, width, number = map(lambda i: i.value, layer.get_shape())
                print('Height, width, number', height, width, number)
                size = height * width * number
                feats = tf.reshape(layer, (-1, number))
                # print(tf.shape(feats).as_list())
                # normal_flag is expected to be defined at module level;
                # 0 selects the masked style-loss path
                if normal_flag == 0:
                    mask = np.zeros((height * width, number), dtype=np.float32)
                    maskt = np.reshape(imread('bottle_mask.jpg').astype(np.float32), (height * width,))
                    maskt = maskt > 100
                    for d in range(number):
                        mask[:, d] = maskt
                    print('Mask shape', mask.shape)
                    # print(sum(sum(mask == 1)) + sum(sum(mask == 0)))
                    # mask[:height*width/2, :] = 1
                    if i == 0:
                        mask = tf.constant(mask)
                        feats = tf.multiply(feats, mask)
                        gram = tf.matmul(tf.transpose(feats), feats) / size
                        style_gram = style_features[i][style_layer]
                        style_losses.append(2 * tf.nn.l2_loss(gram - style_gram) / style_gram.size)
                    else:
                        # invert the mask; cast the boolean comparison back to
                        # float so it can be multiplied with feats
                        mask2 = tf.cast(mask < 1, tf.float32)
                        feats2 = tf.multiply(feats, mask2)
                        gram2 = tf.matmul(tf.transpose(feats2), feats2) / size
                        style_gram = style_features[i][style_layer]
                        style_losses.append(2 * tf.nn.l2_loss(gram2 - style_gram) / style_gram.size)
                else:
                    feats2 = feats
                    gram2 = tf.matmul(tf.transpose(feats2), feats2) / size
                    style_gram = style_features[i][style_layer]
                    style_losses.append(2 * tf.nn.l2_loss(gram2 - style_gram) / style_gram.size)
            style_loss += style_weight * style_blend_weights[i] * reduce(tf.add, style_losses)

        # total variation denoising
        tv_y_size = _tensor_size(image[:, 1:, :, :])
        tv_x_size = _tensor_size(image[:, :, 1:, :])
        tv_loss = tv_weight * 2 * (
            (tf.nn.l2_loss(image[:, 1:, :, :] - image[:, :shape[1] - 1, :, :]) / tv_y_size) +
            (tf.nn.l2_loss(image[:, :, 1:, :] - image[:, :, :shape[2] - 1, :]) / tv_x_size))
        # overall loss
        loss = content_loss + style_loss + tv_loss

        if normal_flag != 0:
            print("general mask :")
            mask = np.zeros((height * width, number), dtype=np.float32)
            maskt = np.reshape(imread('bottle_mask.jpg').astype(np.float32), (height * width,))
            maskt = maskt > 100
            # for d in range(3):
            #     mask[:, d] = maskt
            print('Mask shape', maskt.shape)
            maskt = maskt.reshape((height, width))
            maskt = np.array([maskt, maskt, maskt])
            maskt = maskt.transpose((1, 2, 0))
            mask = tf.constant(maskt, dtype=tf.float32)
            # feats = tf.multiply(feats, mask)

        def capper(a, b, mask):
            # gradient shape: (1, 468, 304, 3)
            print("orig shape", a)
            reshaped_in_grad = tf.reshape(a, [-1])
            print("reshaped grad", reshaped_in_grad)
            print("mask", mask)
            # zero out the gradient outside the mask
            g = tf.multiply(a, mask)
            # g = tf.reshape(g, (1, height, width, 3))
            return g, b

        # optimizer setup
        # train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)
        # Create an optimizer.
        train_step = tf.train.GradientDescentOptimizer(learning_rate)
        # Compute the gradients for a list of variables.
        grads_and_vars = train_step.compute_gradients(loss)
        # grads_and_vars is a list of (gradient, variable) tuples. Do whatever
        # you need to the 'gradient' part, for example cap (mask) them.
        capped_grads_and_vars = [(capper(gv[0], gv[1], mask)) for gv in grads_and_vars]
        # Ask the optimizer to apply the capped gradients.
        train_step = train_step.apply_gradients(capped_grads_and_vars)
        # opt_op = opt.minimize(cost, var_list=<list of variables>)

        def print_progress(i, last=False):
            if print_iterations is not None:
                if i is not None and i % print_iterations == 0 or last:
                    print('  content loss: %g' % content_loss.eval(), file=stderr)
                    print('    style loss: %g' % style_loss.eval(), file=stderr)
                    print('       tv loss: %g' % tv_loss.eval(), file=stderr)
                    print('    total loss: %g' % loss.eval(), file=stderr)

        # optimization
        best_loss = float('inf')
        best = None
        with tf.Session() as sess:
            sess.run(tf.initialize_all_variables())
            for i in range(iterations):
                print_progress(i)
                print('Iteration %d/%d' % (i + 1, iterations), file=stderr)
                train_step.run()
                # print("runningstep: ", i, running_step)
                if (checkpoint_iterations is not None and
                        i % checkpoint_iterations == 0) or i == iterations - 1:
                    this_loss = loss.eval()
                    if this_loss < best_loss:
                        best_loss = this_loss
                        best = image.eval()
                    print_progress(None, i == iterations - 1)
                if i % 10 == 0 and best is not None:
                    tmp_img = vgg.unprocess(best.reshape(shape[1:]), mean_pixel)
                    imsave("iter" + str(i) + ".jpg", tmp_img)
        return vgg.unprocess(best.reshape(shape[1:]), mean_pixel)
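# The masked-gradient idea above in isolation: compute_gradients /
# apply_gradients let you edit gradients before they are applied, so
# multiplying the image gradient by a 0/1 mask freezes the masked-out
# elements. A self-contained TF1-style sketch with a toy variable:
import numpy as np
import tensorflow as tf

x = tf.Variable(np.ones((4,), dtype=np.float32))
loss_demo = tf.reduce_sum(tf.square(x))
mask_demo = tf.constant([1.0, 0.0, 1.0, 0.0])  # only update elements 0 and 2

opt = tf.train.GradientDescentOptimizer(0.1)
masked_grads_and_vars = [(tf.multiply(g, mask_demo), v)
                         for g, v in opt.compute_gradients(loss_demo)]
step = opt.apply_gradients(masked_grads_and_vars)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(step)
    print(sess.run(x))  # masked elements stay at 1.0: [0.8, 1.0, 0.8, 1.0]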
def stylize(network, initial, initial_noiseblend, content, styles, preserve_colors, iterations, content_weight, content_weight_blend, style_weight, style_layer_weight_exp, style_blend_weights, tv_weight, learning_rate, beta1, beta2, epsilon, pooling, print_iterations=None, checkpoint_iterations=None): """ Stylize images. This function yields tuples (iteration, image, loss_vals) at every iteration. However `image` and `loss_vals` are None by default. Each `checkpoint_iterations`, `image` is not None. Each `print_iterations`, `loss_vals` is not None. `loss_vals` is a dict with loss values for the current iteration, e.g. ``{'content': 1.23, 'style': 4.56, 'tv': 7.89, 'total': 13.68}``. :rtype: iterator[tuple[int,image]] """ shape = (1, ) + content.shape style_shapes = [(1, ) + style.shape for style in styles] content_features = {} style_features = [{} for _ in styles] vgg_weights, vgg_mean_pixel = vgg.load_net(network) layer_weight = 1.0 style_layers_weights = {} for style_layer in STYLE_LAYERS: style_layers_weights[style_layer] = layer_weight layer_weight *= style_layer_weight_exp # normalize style layer weights layer_weights_sum = 0 for style_layer in STYLE_LAYERS: layer_weights_sum += style_layers_weights[style_layer] for style_layer in STYLE_LAYERS: style_layers_weights[style_layer] /= layer_weights_sum # compute content features in feedforward mode g = tf.Graph() with g.as_default(), g.device('/cpu:0'), tf.Session() as sess: image = tf.placeholder('float', shape=shape) net = vgg.net_preloaded(vgg_weights, image, pooling) content_pre = np.array([vgg.preprocess(content, vgg_mean_pixel)]) for layer in CONTENT_LAYERS: content_features[layer] = net[layer].eval( feed_dict={image: content_pre}) # compute style features in feedforward mode for i in range(len(styles)): g = tf.Graph() with g.as_default(), g.device('/cpu:0'), tf.Session() as sess: image = tf.placeholder('float', shape=style_shapes[i]) net = vgg.net_preloaded(vgg_weights, image, pooling) style_pre = np.array([vgg.preprocess(styles[i], vgg_mean_pixel)]) for layer in STYLE_LAYERS: features = net[layer].eval(feed_dict={image: style_pre}) features = np.reshape(features, (-1, features.shape[3])) gram = np.matmul(features.T, features) / features.size style_features[i][layer] = gram initial_content_noise_coeff = 1.0 - initial_noiseblend # make stylized image using backpropogation with tf.Graph().as_default(): if initial is None: noise = np.random.normal(size=shape, scale=np.std(content) * 0.1) initial = tf.random_normal(shape) * 0.256 else: initial = np.array([vgg.preprocess(initial, vgg_mean_pixel)]) initial = initial.astype('float32') noise = np.random.normal(size=shape, scale=np.std(content) * 0.1) initial = (initial) * initial_content_noise_coeff + ( tf.random_normal(shape) * 0.256) * (1.0 - initial_content_noise_coeff) image = tf.Variable(initial) net = vgg.net_preloaded(vgg_weights, image, pooling) # content loss content_layers_weights = {} content_layers_weights['relu4_2'] = content_weight_blend content_layers_weights['relu5_2'] = 1.0 - content_weight_blend content_loss = 0 content_losses = [] for content_layer in CONTENT_LAYERS: content_losses.append( content_layers_weights[content_layer] * content_weight * (2 * tf.nn.l2_loss(net[content_layer] - content_features[content_layer]) / content_features[content_layer].size)) content_loss += reduce(tf.add, content_losses) # style loss style_loss = 0 for i in range(len(styles)): style_losses = [] for style_layer in STYLE_LAYERS: layer = net[style_layer] _, height, width, number = 
map(lambda i: i.value, layer.get_shape())
size = height * width * number
feats = tf.reshape(layer, (-1, number))
gram = tf.matmul(tf.transpose(feats), feats) / size
style_gram = style_features[i][style_layer]
style_losses.append(style_layers_weights[style_layer] * 2 *
                    tf.nn.l2_loss(gram - style_gram) / style_gram.size)
style_loss += style_weight * style_blend_weights[i] * reduce(tf.add, style_losses)

# total variation denoising
tv_y_size = _tensor_size(image[:, 1:, :, :])
tv_x_size = _tensor_size(image[:, :, 1:, :])
tv_loss = tv_weight * 2 * (
    (tf.nn.l2_loss(image[:, 1:, :, :] - image[:, :shape[1] - 1, :, :]) / tv_y_size) +
    (tf.nn.l2_loss(image[:, :, 1:, :] - image[:, :, :shape[2] - 1, :]) / tv_x_size))

# total loss
loss = content_loss + style_loss + tv_loss

# We use an OrderedDict so the loss types keep the order (content, style, tv,
# total) fixed by this construction; print_progress() and the column order of
# the saved loss_arrs in the main script rely on it.
#
# Subtle gotcha (tested with Python 3.5): the syntax
# OrderedDict(key1=val1, key2=val2, ...) does /not/ preserve this order,
# because the keyword arguments first pass through a plain dict, which is
# unordered before CPython 3.6 and only guaranteed ordered from 3.7 onward.
# We therefore pass in an already-ordered list of pairs.
loss_store = OrderedDict([('content', content_loss),
                          ('style', style_loss),
                          ('tv', tv_loss),
                          ('total', loss)])

# optimizer setup
train_step = tf.train.AdamOptimizer(learning_rate, beta1, beta2, epsilon).minimize(loss)

# optimization
best_loss = float('inf')
best = None
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print('Optimization started...')
    if (print_iterations and print_iterations != 0):
        print_progress(get_loss_vals(loss_store))
    iteration_times = []
    start = time.time()
    for i in range(iterations):
        iteration_start = time.time()
        if i > 0:
            elapsed = time.time() - start
            # average the last few steps to estimate the time per iteration
            remaining = np.mean(iteration_times[-10:]) * (iterations - i)
            print('Iteration %4d/%4d (%s elapsed, %s remaining)' % (
                i + 1, iterations, hms(elapsed), hms(remaining)))
        else:
            print('Iteration %4d/%4d' % (i + 1, iterations))
        train_step.run()

        last_step = (i == iterations - 1)
        if last_step or (print_iterations and i % print_iterations == 0):
            loss_vals = get_loss_vals(loss_store)
            print_progress(loss_vals)
        else:
            loss_vals = None

        if (checkpoint_iterations and i % checkpoint_iterations == 0) or last_step:
            this_loss = loss.eval()
            if this_loss < best_loss:
                best_loss = this_loss
                best = image.eval()

            img_out = vgg.unprocess(best.reshape(shape[1:]), vgg_mean_pixel)

            if preserve_colors and preserve_colors == True:
                original_image = np.clip(content, 0, 255)
                styled_image = np.clip(img_out, 0, 255)

                # Luminosity transfer steps:
                # 1. Convert stylized RGB->grayscale according to Rec.601 luma (0.299, 0.587, 0.114)
                # 2. Convert stylized grayscale into YUV (YCbCr)
                # 3. Convert original image into YUV (YCbCr)
                # 4. Recombine (stylizedYUV.Y, originalYUV.U, originalYUV.V)
                # 5. Convert recombined image from YUV back to RGB

                # 1
                styled_grayscale = rgb2gray(styled_image)
                styled_grayscale_rgb = gray2rgb(styled_grayscale)
                # 2
                styled_grayscale_yuv = np.array(
                    Image.fromarray(styled_grayscale_rgb.astype(np.uint8)).convert('YCbCr'))
                # 3
                original_yuv = np.array(
                    Image.fromarray(original_image.astype(np.uint8)).convert('YCbCr'))
                # 4
                w, h, _ = original_image.shape
                combined_yuv = np.empty((w, h, 3), dtype=np.uint8)
                combined_yuv[..., 0] = styled_grayscale_yuv[..., 0]
                combined_yuv[..., 1] = original_yuv[..., 1]
                combined_yuv[..., 2] = original_yuv[..., 2]
                # 5
                img_out = np.array(Image.fromarray(combined_yuv, 'YCbCr').convert('RGB'))

            # save an intermediate result every other checkpoint; the original
            # debug line referenced an undefined `img` and the constant
            # `iterations`, fixed here to `img_out` and the loop index `i`
            if i % 2 == 0:
                Image.fromarray(img_out.astype(np.uint8)).save(
                    'output' + str(i) + '.jpg', quality=95)
        else:
            img_out = None

        yield i + 1 if last_step else i, img_out, loss_vals

        iteration_end = time.time()
        iteration_times.append(iteration_end - iteration_start)
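# `get_loss_vals` and `print_progress` are called above but not defined in
# this excerpt. Given the OrderedDict loss_store, sketches consistent with
# that usage (assumptions, not the original helpers):
from collections import OrderedDict

def get_loss_vals(loss_store):
    # evaluate each loss tensor in the current default session
    return OrderedDict((key, val.eval()) for key, val in loss_store.items())

def print_progress(loss_vals):
    for key, val in loss_vals.items():
        print('  %s loss: %g' % (key, val))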
def inferenceImg(network, initial_img, initial_noiseblend, content, style, preserve_colors, iterations, content_weight, content_weight_blend, style_weight, style_layer_weight_exp, style_blend_weight, tv_weight, learning_rate, beta1, beta2, epsilon, pooling, print_iterations, checkpoint_iterations): content_shape = (1, ) + content.shape style_shape = (1, ) + style.shape content_features = {} style_features = {} vgg_weights, vgg_mean_pixel = vgg.load_net(network) layer_weight = 1.0 style_layers_weights = {} for style_layer in STYLE_LAYERS: style_layers_weights[style_layer] = layer_weight layer_weight = layer_weight * style_layer_weight_exp # normalize style layer weights layer_weights_sum = 0 for style_layer in STYLE_LAYERS: layer_weights_sum = layer_weights_sum + style_layers_weights[ style_layer] for style_layer in STYLE_LAYERS: style_layers_weights[style_layer] = style_layers_weights[ style_layer] / layer_weights_sum # compute content features in feedforward mode g1 = tf.Graph() with g1.as_default(), g1.device('/cpu:0'), tf.Session() as sess: contentImg = tf.placeholder('float', shape=content_shape) net = vgg.net_preloaded(vgg_weights, contentImg, pooling) content_pre = np.array([vgg.preprocess(content, vgg_mean_pixel)]) for layer in CONTENT_LAYERS: content_features[layer] = net[layer].eval( feed_dict={contentImg: content_pre}) # compute style features in feedforward mode g2 = tf.Graph() with g2.as_default(), g2.device('/cpu:0'), tf.Session() as sess: styleImg = tf.placeholder('float', shape=style_shape) net = vgg.net_preloaded(vgg_weights, styleImg, pooling) style_pre = np.array([vgg.preprocess(style, vgg_mean_pixel)]) for layer in STYLE_LAYERS: features = net[layer].eval(feed_dict={styleImg: style_pre}) features = np.reshape(features, (-1, features.shape[3])) gram = np.matmul(features.T, features) / features.size style_features[layer] = gram initial_content_noise_coeff = 1.0 - initial_noiseblend # make stylized image using backpropogation with tf.Graph().as_default(): noise = np.random.normal(size=content_shape, scale=np.std(content) * 0.1) initial = tf.random_normal(content_shape) * 0.256 inferenceImg = tf.Variable(initial) net = vgg.net_preloaded(vgg_weights, inferenceImg, pooling) # compute content loss content_layers_weights = {} content_layers_weights['relu4_2'] = content_weight_blend content_layers_weights['relu5_2'] = 1.0 - content_weight_blend content_loss = 0 content_losses = [] for content_layer in CONTENT_LAYERS: content_losses.append( content_layers_weights[content_layer] * content_weight * (2 * tf.nn.l2_loss(net[content_layer] - content_features[content_layer]) / content_features[content_layer].size)) content_loss += reduce(tf.add, content_losses) # compute style loss style_loss = 0 style_losses = [] for style_layer in STYLE_LAYERS: layer = net[style_layer] _, height, width, number = map(lambda i: i.value, layer.get_shape()) size = height * width * number feats = tf.reshape(layer, (-1, number)) gram = tf.matmul(tf.transpose(feats), feats) / size style_gram = style_features[style_layer] style_losses.append(style_layers_weights[style_layer] * 2 * tf.nn.l2_loss(gram - style_gram) / style_gram.size) style_loss += style_weight * style_blend_weight * reduce( tf.add, style_losses) # skip compute variation denoise, in order to shorten the running time # total variation denoising # tv_y_size = _tensor_size(inferenceImg[:, 1:, :, :]) # tv_x_size = _tensor_size(inferenceImg[:, :, 1:, :]) # tv_loss = tv_weight * 2 * ( # (tf.nn.l2_loss(inferenceImg[:, 1:, :, :] - inferenceImg[:, 
:content_shape[1] - 1, :, :]) / # tv_y_size) + # (tf.nn.l2_loss(inferenceImg[:, :, 1:, :] - inferenceImg[:, :, :content_shape[2] - 1, :]) / # tv_x_size)) tv_loss = 0 # overall loss loss = content_loss + style_loss + tv_loss # optimizer training train_step = tf.train.AdamOptimizer(learning_rate, beta1, beta2, epsilon).minimize(loss) def print_progress(): stderr.write(' content loss: %g\n' % content_loss.eval()) stderr.write(' style loss: %g\n' % style_loss.eval()) stderr.write(' total loss: %g\n' % loss.eval()) best_loss = float('inf') best = None with tf.Session() as sess: sess.run(tf.global_variables_initializer()) stderr.write('Optimization started...\n') if (print_iterations and print_iterations != 0): print_progress() for i in range(iterations): train_step.run() last_step = (i == iterations - 1) if last_step or (print_iterations and i % print_iterations == 0): stderr.write('Iteration %4d/%4d\n' % (i + 1, iterations)) print_progress() if (checkpoint_iterations and i % checkpoint_iterations == 0) or last_step: this_loss = loss.eval() if this_loss < best_loss: best_loss = this_loss best = inferenceImg.eval() img_out = vgg.unprocess(best.reshape(content_shape[1:]), vgg_mean_pixel) if preserve_colors: original_image = np.clip(content, 0, 255) styled_image = np.clip(img_out, 0, 255) # Luminosity transfer steps: # 1. Convert stylized RGB->grayscale according to Rec.601 luma (0.299, 0.587, 0.114) # 2. Convert stylized grayscale into YUV (YCbCr) # 3. Convert original image into YUV (YCbCr) # 4. Recombine (stylizedYUV.Y, originalYUV.U, originalYUV.V) # 5. Convert recombined image from YUV back to RGB # 1 styled_grayscale = rgb2gray(styled_image) styled_grayscale_rgb = gray2rgb(styled_grayscale) # 2 styled_grayscale_yuv = np.array( Image.fromarray( styled_grayscale_rgb.astype( np.uint8)).convert('YCbCr')) # 3 original_yuv = np.array( Image.fromarray(original_image.astype( np.uint8)).convert('YCbCr')) # 4 h, w, _ = original_image.shape combined_yuv = np.empty((h, w, 3), dtype=np.uint8) combined_yuv[..., 0] = styled_grayscale_yuv[..., 0] combined_yuv[..., 1] = original_yuv[..., 1] combined_yuv[..., 2] = original_yuv[..., 2] # 5 img_out = np.array( Image.fromarray(combined_yuv, 'YCbCr').convert('RGB')) yield ((None if last_step else i), img_out)
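# A hypothetical driver for inferenceImg; every argument value below is a
# placeholder, not a default taken from this code. content_img and style_img
# are assumed to be preloaded numpy image arrays.
for step, img in inferenceImg(network='imagenet-vgg-verydeep-19.mat',
                              initial_img=None, initial_noiseblend=1.0,
                              content=content_img, style=style_img,
                              preserve_colors=False, iterations=500,
                              content_weight=5e0, content_weight_blend=1.0,
                              style_weight=5e2, style_layer_weight_exp=1.0,
                              style_blend_weight=1.0, tv_weight=1e2,
                              learning_rate=1e1, beta1=0.9, beta2=0.999,
                              epsilon=1e-8, pooling='max',
                              print_iterations=None, checkpoint_iterations=100):
    if step is None:  # final iteration
        Image.fromarray(img.astype(np.uint8)).save('final.jpg')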
def optimize(content_targets, style_target, content_weight, style_weight, tv_weight, vgg_path): mod = len(content_targets) % batch_size if (mod > 0): print("Train set has been trimmed slightly..") content_targets = content_targets[:-mod] batch_shape = (batch_size, 256, 256, 3) style_shape = (1, *style_target.shape) print('batch shape:', batch_shape) print('style shape:', style_shape) with tf.Graph().as_default(), tf.Session() as sess: # Declare placeholders we'll feed into the graph style_image = tf.placeholder(tf.float32, shape=style_shape, name='style_image') X_content = tf.placeholder(tf.float32, shape=batch_shape, name='X_content') # Precompute content features start_time = time.time() content_features = {} X_content_pre = vgg.preprocess(X_content) content_net = vgg.net(vgg_path, X_content_pre) content_features[CONTENT_LAYER] = content_net[CONTENT_LAYER] end_time = time.time() delta_time = end_time - start_time print('precompute content features time:', delta_time) # Precompute style features start_time = time.time() style_features = {} style_pre = np.array([style_target]) # feed style_image_pre = vgg.preprocess(style_image) style_net = vgg.net(vgg_path, style_image_pre) for layer in STYLE_LAYERS: features = style_net[layer].eval( feed_dict={style_image: style_pre}) features = np.reshape(features, (-1, features.shape[3])) gram = np.matmul(features.T, features) / features.size style_features[layer] = gram end_time = time.time() delta_time = end_time - start_time print('precompute style features time:', delta_time) # Build prediction net preds = transform.net(X_content / 255.0) preds_pre = vgg.preprocess(preds) preds_net = vgg.net(vgg_path, preds_pre) # Compute content loss ? start_time = time.time() content_size = _tensor_size( content_features[CONTENT_LAYER]) * batch_size assert _tensor_size(content_features[CONTENT_LAYER]) == _tensor_size( preds_net[CONTENT_LAYER]) content_loss = content_weight * ( 2 * tf.nn.l2_loss(preds_net[CONTENT_LAYER] - content_features[CONTENT_LAYER]) / content_size) end_time = time.time() delta_time = end_time - start_time print('compute content loss time:', delta_time) # Compute style loss ? start_time = time.time() style_losses = [] for style_layer in STYLE_LAYERS: layer = preds_net[style_layer] bs, height, width, filters = map(lambda i: i.value, layer.get_shape()) size = height * width * filters feats = tf.reshape(layer, (bs, height * width, filters)) feats_T = tf.transpose(feats, perm=[0, 2, 1]) grams = tf.matmul(feats_T, feats) / size style_gram = style_features[style_layer] style_losses.append(2 * tf.nn.l2_loss(grams - style_gram) / style_gram.size) style_loss = style_weight * functools.reduce(tf.add, style_losses) / batch_size end_time = time.time() delta_time = end_time - start_time print('compute style loss time:', delta_time) # Total variation denoising ? 
start_time = time.time() tv_y_size = _tensor_size(preds[:, 1:, :, :]) tv_x_size = _tensor_size(preds[:, :, 1:, :]) y_tv = tf.nn.l2_loss(preds[:, 1:, :, :] - preds[:, :batch_shape[1] - 1, :, :]) x_tv = tf.nn.l2_loss(preds[:, :, 1:, :] - preds[:, :, :batch_shape[2] - 1, :]) tv_loss = tv_weight * 2 * (x_tv / tv_x_size + y_tv / tv_y_size) / batch_size end_time = time.time() delta_time = end_time - start_time print('total variation denoising time:', delta_time) # Overall loss start_time = time.time() all_loss = content_loss + style_loss + tv_loss end_time = time.time() delta_time = end_time - start_time print('compute overall loss time:', delta_time) # Build train train = tf.train.AdamOptimizer(learning_rate).minimize(all_loss) sess.run(tf.global_variables_initializer()) print('Start training...') start_time = time.time() num_examples = len(content_targets) n_batches = num_examples // batch_size iterations = n_batches * epochs # For writing training checkpoints. saver = tf.train.Saver() for epoch in range(epochs): for batch in range(n_batches): iteration = epoch * n_batches + batch + 1 # curr = iteration * batch_size # step = curr + batch_size curr = batch * batch_size step = curr + batch_size X_batch = np.zeros(batch_shape, dtype=np.float32) # feed for i, img_p in enumerate(content_targets[curr:step]): X_batch[i] = get_img(img_p, (256, 256, 3)).astype(np.float32) assert X_batch.shape[0] == batch_size sess.run(train, feed_dict={X_content: X_batch}) to_get = [style_loss, content_loss, tv_loss, all_loss, preds] if (iteration % display_every_n == 0): tup = sess.run(to_get, feed_dict={X_content: X_batch}) _style_loss, _content_loss, _tv_loss, _all_loss, _preds = tup losses = (_style_loss, _content_loss, _tv_loss, _all_loss) print( 'Iteration {}/{} - style loss: {:.4f}, content loss: {:.4f}, tv loss: {:.4f}, all loss: {:.4f}' .format(iteration, iterations, *losses)) if (iteration % save_every_n == 0) or (iteration == iterations): _all_loss = sess.run(all_loss, feed_dict={X_content: X_batch}) ckpt = saver.save( sess, os.path.join(FLAGS.checkpoint_dir, "ckpt_i{}".format(iteration))) print('Epoch {}/{}, Iteration: {}/{}, loss: {}'.format( epoch, epochs, iteration, iterations, _all_loss)) yield (epoch, iteration, ckpt) end_time = time.time() delta_time = end_time - start_time print('Done! Train total time:', delta_time)
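# _tensor_size is used throughout these snippets but defined in none of them.
# A sketch consistent with its call sites (and with the later comment that it
# "only counts from [1:]"): the product of all non-batch dimensions.
from functools import reduce

def _tensor_size(tensor):
    # product of all dimensions except the batch dimension
    return reduce(lambda a, b: a * b,
                  (d.value for d in tensor.get_shape()[1:]), 1)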
def stylize(network, content, style, initial, initial_noiseblend, content_weight=5e0, content_layer_num=9, style_weight=5e2, style_layer_weight=(0.2, 0.2, 0.2, 0.2, 0.2), tv_weight=1e2, learning_rate=1e1, beta1=0.9, beta2=0.999, epsilon=1e-8, preserve_colors=False, pooling='max', iterations=1000, print_iterations=None, checkpoint_iterations=None): """ Stylize images. Returns (images, best): the list of checkpoint images and the lowest-loss image found during optimization. """ shape = (1, ) + content.shape content_features = {} style_features = {} style_layers_weights = {} content_layer = CONTENT_LAYERS[content_layer_num] for i, style_layer in enumerate(STYLE_LAYERS): style_layers_weights[style_layer] = style_layer_weight[i] vgg_weights, vgg_mean_pixel = vgg.load_net(network) image = tf.placeholder(tf.float32, shape=shape) net = vgg.net_preloaded(vgg_weights, image, pooling) content_pre = np.array([vgg.preprocess(content, vgg_mean_pixel)]) style_pre = np.array([vgg.preprocess(style, vgg_mean_pixel)]) # compute content features, style features in feedforward mode with tf.Session() as sess: content_features[content_layer] = sess.run( net[content_layer], feed_dict={image: content_pre}) for layer in STYLE_LAYERS: features = sess.run(net[layer], feed_dict={image: style_pre}) features = np.reshape(features, (-1, features.shape[3])) gram = np.matmul(features.T, features) / features.size style_features[layer] = gram # make stylized image using backpropagation if initial is None: noise = np.random.normal(size=shape, scale=np.std(content) * 0.1) initial = tf.random_normal(shape) * 0.256 else: initial = np.array([vgg.preprocess(initial, vgg_mean_pixel)]) initial = initial.astype(np.float32) noise = np.random.normal(size=shape, scale=np.std(content) * 0.1) initial = initial * (1 - initial_noiseblend) + ( tf.random_normal(shape) * 0.256) * initial_noiseblend image = tf.Variable(initial) net = vgg.net_preloaded(vgg_weights, image, pooling) # content loss content_loss = content_weight * 2 * tf.nn.l2_loss( net[content_layer] - content_features[content_layer]) / content_features[content_layer].size # style loss style_loss = 0 for style_layer in STYLE_LAYERS: layer = net[style_layer] _, height, width, number = map(lambda i: i.value, layer.get_shape()) size = height * width * number feats = tf.reshape(layer, (-1, number)) gram = tf.matmul(tf.transpose(feats), feats) / size style_gram = style_features[style_layer] style_loss += style_weight * style_layers_weights[ style_layer] * 2 * tf.nn.l2_loss(gram - style_gram) / style_gram.size # total variation denoising tv_y_size = _tensor_size(image[:, 1:, :, :]) tv_x_size = _tensor_size(image[:, :, 1:, :]) tv_loss = tv_weight * 2 * ( (tf.nn.l2_loss(image[:, 1:, :, :] - image[:, :shape[1] - 1, :, :]) / tv_y_size) + (tf.nn.l2_loss(image[:, :, 1:, :] - image[:, :, :shape[2] - 1, :]) / tv_x_size)) # overall loss loss = content_loss + style_loss + tv_loss # optimizer setup train_step = tf.train.AdamOptimizer(learning_rate, beta1, beta2, epsilon).minimize(loss) def print_progress(): print(' content loss: %g' % content_loss.eval()) print(' style loss: %g' % style_loss.eval()) print(' tv loss: %g' % tv_loss.eval()) print(' total loss: %g' % loss.eval()) # optimization best_loss = float('inf') best = None images = [] with tf.Session() as sess: sess.run(tf.global_variables_initializer()) print('Optimization started...') if (print_iterations and print_iterations != 0): print_progress() for i in range(iterations): train_step.run() last_step = (i == iterations - 1) if last_step or (print_iterations and i % print_iterations == 0): print('Iteration %4d/%4d' % (i + 1, iterations)) print_progress() if (checkpoint_iterations and i % checkpoint_iterations == 0) or last_step: this_loss = loss.eval() styled_image = np.clip( vgg.unprocess(image.eval().reshape(shape[1:]), vgg_mean_pixel), 0, 255) if this_loss < best_loss: best_loss = this_loss best = styled_image if preserve_colors: original_image = np.clip(content, 0, 255) # Luminosity transfer steps: # 1. Convert stylized RGB->grayscale according to Rec.601 luma (0.299, 0.587, 0.114) # 2. Convert stylized grayscale into YUV (YCbCr) # 3. Convert original image into YUV (YCbCr) # 4. Recombine (stylizedYUV.Y, originalYUV.U, originalYUV.V) # 5. Convert recombined image from YUV back to RGB # 1 styled_grayscale = rgb2gray(styled_image) styled_grayscale_rgb = gray2rgb(styled_grayscale) # 2 styled_grayscale_yuv = np.array( Image.fromarray(styled_grayscale_rgb.astype( np.uint8)).convert('YCbCr')) # 3 original_yuv = np.array( Image.fromarray(original_image.astype( np.uint8)).convert('YCbCr')) # 4 h, w, _ = original_image.shape combined_yuv = np.empty((h, w, 3), dtype=np.uint8) combined_yuv[..., 0] = styled_grayscale_yuv[..., 0] combined_yuv[..., 1] = original_yuv[..., 1] combined_yuv[..., 2] = original_yuv[..., 2] # 5 styled_image = np.array( Image.fromarray(combined_yuv, 'YCbCr').convert('RGB')) plt.figure(figsize=(8, 8)) plt.imshow(styled_image.astype(np.uint8)) plt.axis('off') plt.show() images.append(styled_image.astype(np.uint8)) return images, best
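# For reference, the Gram computation repeated in these snippets reduces to
# flattening activations to (H*W, C) and forming F^T F normalized by the
# element count; a standalone NumPy illustration with fake activations:
import numpy as np

acts = np.random.rand(1, 4, 4, 3).astype(np.float32)  # fake (N, H, W, C) activations
F = acts.reshape(-1, acts.shape[3])                   # (H*W, C)
gram = np.matmul(F.T, F) / F.size                     # (C, C), same normalization as above
assert gram.shape == (3, 3)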
style_batch = np.zeros([batch_size, 128, 128]) for i in range(batch_size): style_batch[i] = ss ss = np.reshape(ss, [-1, 128, 128, 1]) style_features = {} # precompute style features with tf.Graph().as_default(), tf.device('/cpu:0'), tf.Session() as sess: style_image = tf.placeholder(tf.float32, shape=[None, 128, 128, 1], name='style_image') style_image_pre = vgg.preprocess(style_image) net = vgg.net(vgg_path, style_image_pre) for layer in STYLE_LAYERS: features = net[layer].eval(feed_dict={style_image: ss}) features = np.reshape(features, (-1, features.shape[3])) gram = np.matmul(features.T, features) / features.size style_features[layer] = gram ''' stage 1: generate depth images from joints distribution ''' gen = Generator img_gen = gen.generator(X_in_label) img_gen_trans = tf.reshape(img_gen, [-1, 128, 128]) # loss_generator = tf.reduce_mean(tf.abs(img_gen_trans - X_in_image)) loss_generator = tf.reduce_mean(
def main(): content_path, style_path, width, style_scale = sys.argv[1:] width = int(width) style_scale = float(style_scale) content_image = imread(content_path) style_image = imread(style_path) if width > 0: new_shape = (int(math.floor(float(content_image.shape[0]) / content_image.shape[1] * width)), width) content_image = sm.imresize(content_image, new_shape) if style_scale > 0: style_image = sm.imresize(style_image, style_scale) shape = (1,) + content_image.shape style_shape = (1,) + style_image.shape content_features = {} style_features = {} g = tf.Graph() with g.as_default(), g.device('/cpu:0'), tf.Session() as sess: image = tf.placeholder('float', shape=shape) net, mean_pixel = vgg.net(VGG_PATH, image) content_pre = np.array([vgg.preprocess(content_image, mean_pixel)]) content_features[CONTENT_LAYER] = net[CONTENT_LAYER].eval( feed_dict={image: content_pre}) g = tf.Graph() with g.as_default(), g.device('/cpu:0'), tf.Session() as sess: image = tf.placeholder('float', shape=style_shape) net, _ = vgg.net(VGG_PATH, image) style_pre = np.array([vgg.preprocess(style_image, mean_pixel)]) for layer in STYLE_LAYERS: features = net[layer].eval(feed_dict={image: style_pre}) features = np.reshape(features, (-1, features.shape[3])) gram = np.matmul(features.T, features) / (features.size) style_features[layer] = gram with tf.Graph().as_default(): noise = np.random.normal(size=shape, scale=np.std(content_image) * 0.1) init = tf.random_normal(shape) * 256 / 1000 image = tf.Variable(init) net, _ = vgg.net(VGG_PATH, image) content_loss = tf.nn.l2_loss( net[CONTENT_LAYER] - content_features[CONTENT_LAYER]) style_losses = [] for i in STYLE_LAYERS: layer = net[i] _, height, width, number = map(lambda i: i.value, layer.get_shape()) size = height * width * number feats = tf.reshape(layer, (-1, number)) gram = tf.matmul(tf.transpose(feats), feats) / (size) style_gram = style_features[i] style_losses.append(tf.nn.l2_loss(gram - style_gram)) style_loss = reduce(tf.add, style_losses) / len(style_losses) tv_loss = (tf.nn.l2_loss(image[:,1:,:,:] - image[:,:shape[1]-1,:,:]) + tf.nn.l2_loss(image[:,:,1:,:] - image[:,:,:shape[2]-1,:])) loss = ALPHA * content_loss + BETA * style_loss + TV_WEIGHT * tv_loss train_step = tf.train.AdamOptimizer(LEARNING_RATE).minimize(loss) with tf.Session() as sess: sess.run(tf.initialize_all_variables()) for i in range(100000): print('i = %d' % i) if i % 10 == 0: print('\tcontent_loss = %15.0f' % content_loss.eval()) print('\tstyle_loss = %15.0f' % style_loss.eval()) print('\ttv_loss = %15.0f' % tv_loss.eval()) print('\tloss = %15.0f' % loss.eval()) imsave('%05d.jpg' % i, vgg.unprocess( image.eval().reshape(shape[1:]), mean_pixel)) train_step.run()
def main(): '''Search for similar images Search the style directory for images that closely resemble each image in the content directory. Save those images in an output directory folder corresponding to each content image, renamed as their matching rank number. ''' parser = build_parser() options = parser.parse_args() content_files = os.listdir(options.content_dir) content_images = [ read_img(os.path.join(options.content_dir, f)) for f in content_files ] # n_content by n_style matrix and list to store the best style images n_content = len(content_files) n_total = n_content * options.n_style best_style_score = np.full((n_content, options.n_style), np.inf) best_style_file = np.array([['' for i in range(options.n_style)] for h in range(n_content)], dtype=object) vgg_weights, vgg_mean_pixel = vgg.load_net(options.network) content_features = [{} for _ in content_images] for i, c in enumerate(content_images): with tf.Graph().as_default(), tf.Session() as sess: image = tf.placeholder('float', shape=(1, ) + c.shape) net = vgg.net_preloaded(vgg_weights, image, 'max') content_pre = np.array([vgg.preprocess(c, vgg_mean_pixel)]) for layer in CONTENT_LAYERS: content_features[i][layer] = net[layer].eval( feed_dict={image: content_pre}) final_style_score, final_style_file = search_dir( content_features, vgg_weights, vgg_mean_pixel, best_style_score, best_style_file, options.style_dir, options.recurse, options.n_search) if np.any(np.isinf(final_style_score)): inf_total = np.sum(np.isinf(final_style_score)) print('%d out of %d style images not found.' % (inf_total, n_total), 'Try rerunning with a smaller n-style.') raise RuntimeError('style search returned incomplete results') sorted_files = final_style_file[np.indices( (n_content, options.n_style))[0], final_style_score.argsort()] format_str = '{0:0>%d}.{1}' % np.ceil(np.log10(n_total)) os.mkdir(options.output_dir) for i, f in enumerate(content_files): fname = ''.join(f.split('.')[:-1]) print('Copying style files for %s' % fname) os.mkdir(os.path.join(options.output_dir, fname)) for j in range(options.n_style): print(sorted_files[i, j]) img_ext = sorted_files[i, j].split('.')[-1] shutil.copy( sorted_files[i, j], os.path.join(options.output_dir, fname, format_str.format(j, img_ext)))
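# The rank filename pattern above zero-pads each rank to the digit width of
# n_total; for example:
import numpy as np

n_total = 120
format_str = '{0:0>%d}.{1}' % np.ceil(np.log10(n_total))  # -> '{0:0>3}.{1}'
print(format_str.format(7, 'jpg'))                        # -> '007.jpg'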
def optimize(content_targets, style_target, content_weight, style_weight, tv_weight, vgg_path, epochs=2, print_iterations=1000, batch_size=4, save_path='saver/fns.ckpt', slow=False, learning_rate=1e-3, device='/cpu:0', debug=False, total_iterations=-1, base_model_path=None): if slow: batch_size = 1 mod = len(content_targets) % batch_size if mod > 0: print("Train set has been trimmed slightly..") content_targets = content_targets[:-mod] style_features = {} batch_shape = (batch_size, 256, 256, 3) style_shape = (1, ) + style_target.shape print(style_shape) # precompute style features print("Precomputing style features") sys.stdout.flush() with tf.Graph().as_default(), tf.device(device), tf.Session( config=tf.ConfigProto(allow_soft_placement=True)) as sess: style_image = tf.placeholder(tf.float32, shape=style_shape, name='style_image') style_image_pre = vgg.preprocess(style_image) net = vgg.net(vgg_path, style_image_pre) style_pre = np.array([style_target]) for layer in STYLE_LAYERS: features = net[layer].eval(feed_dict={style_image: style_pre}) features = np.reshape(features, (-1, features.shape[3])) gram = np.matmul(features.T, features) / features.size style_features[layer] = gram with tf.Graph().as_default(), tf.Session() as sess: X_content = tf.placeholder(tf.float32, shape=batch_shape, name="X_content") X_pre = vgg.preprocess(X_content) print("Precomputing content features") sys.stdout.flush() # precompute content features content_features = {} content_net = vgg.net(vgg_path, X_pre) content_features[CONTENT_LAYER] = content_net[CONTENT_LAYER] if slow: preds = tf.Variable( tf.random_normal(X_content.get_shape()) * 0.256) preds_pre = preds else: preds = transform.net(X_content / 255.0) preds_pre = vgg.preprocess(preds) print("Building VGG net") sys.stdout.flush() net = vgg.net(vgg_path, preds_pre) content_size = _tensor_size( content_features[CONTENT_LAYER]) * batch_size assert _tensor_size(content_features[CONTENT_LAYER]) == _tensor_size( net[CONTENT_LAYER]) content_loss = content_weight * ( 2 * tf.nn.l2_loss(net[CONTENT_LAYER] - content_features[CONTENT_LAYER]) / content_size) style_losses = [] for style_layer in STYLE_LAYERS: layer = net[style_layer] bs, height, width, filters = map(lambda i: i.value, layer.get_shape()) size = height * width * filters feats = tf.reshape(layer, (bs, height * width, filters)) feats_T = tf.transpose(feats, perm=[0, 2, 1]) # see https://github.com/tensorflow/tensorflow/issues/6560 grams = tf.matmul(feats_T, feats) / size style_gram = style_features[style_layer] style_losses.append(2 * tf.nn.l2_loss(grams - style_gram) / style_gram.size) style_loss = style_weight * reduce(tf.add, style_losses) / batch_size # total variation denoising tv_y_size = _tensor_size(preds[:, 1:, :, :]) tv_x_size = _tensor_size(preds[:, :, 1:, :]) y_tv = tf.nn.l2_loss(preds[:, 1:, :, :] - preds[:, :batch_shape[1] - 1, :, :]) x_tv = tf.nn.l2_loss(preds[:, :, 1:, :] - preds[:, :, :batch_shape[2] - 1, :]) tv_loss = tv_weight * 2 * (x_tv / tv_x_size + y_tv / tv_y_size) / batch_size loss = content_loss + style_loss + tv_loss # overall loss train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss) sess.run(tf.initialize_all_variables()) # If base model file is present, load that in to the session if base_model_path: saver = tf.train.Saver() if os.path.isdir(base_model_path): ckpt = tf.train.get_checkpoint_state(base_model_path) if ckpt and ckpt.model_checkpoint_path: saver.restore(sess, ckpt.model_checkpoint_path) else: raise Exception("No checkpoint found...") else: 
saver.restore(sess, base_model_path) import random uid = random.randint(1, 100) print("UID: %s" % uid) sys.stdout.flush() for epoch in range(epochs): num_examples = len(content_targets) print("number of examples: %s" % num_examples) sys.stdout.flush() iterations = 0 while iterations * batch_size < num_examples: print("Current iteration : %s" % iterations) sys.stdout.flush() start_time = time.time() curr = iterations * batch_size step = curr + batch_size X_batch = np.zeros(batch_shape, dtype=np.float32) for j, img_p in enumerate(content_targets[curr:step]): X_batch[j] = get_img(img_p, (256, 256, 3)).astype(np.float32) iterations += 1 assert X_batch.shape[0] == batch_size feed_dict = {X_content: X_batch} train_step.run(feed_dict=feed_dict) end_time = time.time() delta_time = end_time - start_time if debug: print("UID: %s, batch time: %s" % (uid, delta_time)) is_print_iter = int(iterations) % print_iterations == 0 if slow: is_print_iter = epoch % print_iterations == 0 is_last = False if epoch == epochs - 1 and iterations * batch_size >= num_examples: is_last = True if total_iterations > 0 and iterations >= total_iterations: is_last = True should_print = is_print_iter or is_last if should_print: to_get = [style_loss, content_loss, tv_loss, loss, preds] test_feed_dict = {X_content: X_batch} tup = sess.run(to_get, feed_dict=test_feed_dict) _style_loss, _content_loss, _tv_loss, _loss, _preds = tup losses = (_style_loss, _content_loss, _tv_loss, _loss) if slow: _preds = vgg.unprocess(_preds) else: saver = tf.train.Saver() res = saver.save(sess, save_path) yield (_preds, losses, iterations, epoch) if is_last: break
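# get_img is assumed rather than defined in this section; calls such as
# get_img(img_p, (256, 256, 3)) suggest a load-and-resize helper along these
# lines (PIL-based; the exact resampling behavior is an assumption):
import numpy as np
from PIL import Image

def get_img(src, img_size=None):
    img = Image.open(src).convert('RGB')
    if img_size is not None:
        img = img.resize((img_size[1], img_size[0]))  # PIL takes (width, height)
    return np.asarray(img, dtype=np.float32)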
def optimize(content_targets, style_target, content_weight, style_weight, tv_weight, vgg_path, epochs=2, print_iterations=1000, batch_size=4, save_path='saver/fns.ckpt', slow=False, learning_rate=1e-3, debug=False, max_sample=4000): if slow: batch_size = 1 mod = len(content_targets) % batch_size if mod > 0: print("Train set has been trimmed slightly..") content_targets = content_targets[:-mod] if len(content_targets) > max_sample: content_targets = content_targets[:max_sample] style_features = {} batch_shape = (batch_size,256,256,3) style_shape = (1,) + style_target.shape print(style_shape) # precompute style features with tf.Graph().as_default(), tf.device('/cpu:0'), tf.Session() as sess: style_image = tf.placeholder(tf.float32, shape=style_shape, name='style_image') style_image_pre = vgg.preprocess(style_image) net = vgg.net(vgg_path, style_image_pre) style_pre = np.array([style_target]) for layer in STYLE_LAYERS: features = net[layer].eval(feed_dict={style_image:style_pre}) features = np.reshape(features, (-1, features.shape[3])) gram = np.matmul(features.T, features) / features.size style_features[layer] = gram with tf.Graph().as_default(), tf.Session() as sess: X_content = tf.placeholder(tf.float32, shape=batch_shape, name="X_content") X_pre = vgg.preprocess(X_content) # precompute content features content_features = {} content_net = vgg.net(vgg_path, X_pre) content_features[CONTENT_LAYER] = content_net[CONTENT_LAYER] if slow: preds = tf.Variable( tf.random_normal(X_content.get_shape()) * 0.256 ) preds_pre = preds else: preds = transform.net(X_content/255.0) preds_pre = vgg.preprocess(preds) net = vgg.net(vgg_path, preds_pre) content_size = _tensor_size(content_features[CONTENT_LAYER])*batch_size assert _tensor_size(content_features[CONTENT_LAYER]) == _tensor_size(net[CONTENT_LAYER]) content_loss = content_weight * (2 * tf.nn.l2_loss( net[CONTENT_LAYER] - content_features[CONTENT_LAYER]) / content_size ) style_losses = [] for style_layer in STYLE_LAYERS: layer = net[style_layer] bs, height, width, filters = map(lambda i:i.value,layer.get_shape()) size = height * width * filters feats = tf.reshape(layer, (bs, height * width, filters)) feats_T = tf.transpose(feats, perm=[0,2,1]) grams = tf.matmul(feats_T, feats) / size style_gram = style_features[style_layer] style_losses.append(2 * tf.nn.l2_loss(grams - style_gram)/style_gram.size) style_loss = style_weight * functools.reduce(tf.add, style_losses) / batch_size # total variation denoising tv_y_size = _tensor_size(preds[:,1:,:,:]) tv_x_size = _tensor_size(preds[:,:,1:,:]) y_tv = tf.nn.l2_loss(preds[:,1:,:,:] - preds[:,:batch_shape[1]-1,:,:]) x_tv = tf.nn.l2_loss(preds[:,:,1:,:] - preds[:,:,:batch_shape[2]-1,:]) tv_loss = tv_weight*2*(x_tv/tv_x_size + y_tv/tv_y_size)/batch_size loss = content_loss + style_loss + tv_loss # overall loss train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss) sess.run(tf.initialize_all_variables()) import random uid = random.randint(1, 100) print("UID: %s" % uid) for epoch in range(epochs): num_examples = len(content_targets) iterations = 0 while iterations * batch_size < num_examples: start_time = time.time() curr = iterations * batch_size step = curr + batch_size X_batch = np.zeros(batch_shape, dtype=np.float32) for j, img_p in enumerate(content_targets[curr:step]): X_batch[j] = get_img(img_p, (256,256,3)).astype(np.float32) iterations += 1 assert X_batch.shape[0] == batch_size feed_dict = { X_content:X_batch } train_step.run(feed_dict=feed_dict) end_time = time.time() delta_time = end_time - 
start_time if debug: print("UID: %s, batch time: %s" % (uid, delta_time)) is_print_iter = int(iterations) % print_iterations == 0 if slow: is_print_iter = epoch % print_iterations == 0 is_last = epoch == epochs - 1 and iterations * batch_size >= num_examples should_print = is_print_iter or is_last if should_print: to_get = [style_loss, content_loss, tv_loss, loss, preds] test_feed_dict = { X_content:X_batch } tup = sess.run(to_get, feed_dict = test_feed_dict) _style_loss,_content_loss,_tv_loss,_loss,_preds = tup losses = (_style_loss, _content_loss, _tv_loss, _loss) if slow: _preds = vgg.unprocess(_preds) else: saver = tf.train.Saver() res = saver.save(sess, save_path) yield(_preds, losses, iterations, epoch)
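# A hypothetical caller for the generator above; the paths, weights, and the
# list_files helper are placeholders, not part of this code.
content_files = list_files('train2014')  # hypothetical helper returning image paths
for preds, losses, iterations, epoch in optimize(
        content_files, style_img, content_weight=7.5e0, style_weight=1e2,
        tv_weight=2e2, vgg_path='imagenet-vgg-verydeep-19.mat',
        epochs=2, print_iterations=1000, batch_size=4):
    style_l, content_l, tv_l, total_l = losses
    print('epoch %d, iteration %d, total loss %g' % (epoch, iterations, total_l))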
def main(): # This will print all array values in full np.set_printoptions(threshold=np.nan) parser = build_parser() options = parser.parse_args() if not os.path.isfile(options.network): parser.error( "Network %s does not exist. (Did you forget to download it?)" % options.network) # Load the vgg weights in advance vgg_weights, vgg_mean_pixel = vgg.load_net(options.network) content_image = imread(options.content) # Jacob: moved this here since the same image features will be used for each style image content_features = {} g = tf.Graph() shape = (1, ) + content_image.shape with g.as_default(), g.device('/cpu:0'), tf.Session() as sess: image = tf.placeholder('float', shape=shape) net = vgg.net_preloaded(vgg_weights, image, options.pooling) content_pre = np.array([vgg.preprocess(content_image, vgg_mean_pixel)]) for layer in CONTENT_LAYERS: content_features[layer] = net[layer].eval( feed_dict={image: content_pre}) print("READY") sys.stdout.flush() # Make sure Java can sense this output before Python blocks waiting for input count = 0 #for style in style_images: # loop through separate style inputs individually for line in sys.stdin: # Assumes a single line of input will be a json for one image style = jsonimread(line) width = options.width if width is not None: new_shape = (int( math.floor( float(content_image.shape[0]) / content_image.shape[1] * width)), width) content_image = scipy.misc.imresize(content_image, new_shape) target_shape = content_image.shape # This batch of code was in a loop for each style input before style_scale = STYLE_SCALE if options.style_scales is not None: style_scale = options.style_scales[0] style = scipy.misc.imresize( style, style_scale * target_shape[1] / style.shape[1]) # Removed code for blending between multiple styles style_blend_weights = [1.0] initial = options.initial if initial is not None: initial = scipy.misc.imresize(imread(initial), content_image.shape[:2]) # Initial guess is specified, but not noiseblend - no noise should be blended if options.initial_noiseblend is None: options.initial_noiseblend = 0.0 else: # Neither initial, nor noiseblend is provided, falling back to random generated initial guess if options.initial_noiseblend is None: options.initial_noiseblend = 1.0 if options.initial_noiseblend < 1.0: initial = content_image if options.checkpoint_output and "%s" not in options.checkpoint_output: parser.error("To save intermediate images, the checkpoint output " "parameter must contain `%s` (e.g.
`foo%s.jpg`)") for iteration, image in stylize( network=options.network, initial=initial, initial_noiseblend=options.initial_noiseblend, content=content_image, styles=[style ], # Changed this to be a list of only one style image preserve_colors=options.preserve_colors, iterations=options.iterations, content_weight=options.content_weight, content_weight_blend=options.content_weight_blend, style_weight=options.style_weight, style_layer_weight_exp=options.style_layer_weight_exp, style_blend_weights=style_blend_weights, tv_weight=options.tv_weight, learning_rate=options.learning_rate, beta1=options.beta1, beta2=options.beta2, epsilon=options.epsilon, pooling=options.pooling, print_iterations=options.print_iterations, checkpoint_iterations=options.checkpoint_iterations, # These vgg settings are now loaded only once vgg_weights=vgg_weights, vgg_mean_pixel=vgg_mean_pixel, content_features=content_features): output_file = None combined_rgb = image if iteration is not None: if options.checkpoint_output: output_file = options.checkpoint_output % iteration else: # Change final output files to simply be numbered output_file = "%d.JPG" % count count = count + 1 if output_file: # No longer save image to file #imsave(output_file, combined_rgb) # Output json String print(json.dumps(combined_rgb.tolist())) sys.stdout.flush( ) # Make sure Java can sense this output before Python blocks waiting for input print("DONE")
def stylize(network, initial, initial_noiseblend, content, styles, luminance_transfer, iterations, content_weight, content_weight_blend, style_weight, style_layer_weight_exp, style_blend_weights, tv_weight, learning_rate, beta1, beta2, epsilon, pooling, print_iterations=None, checkpoint_iterations=None): """ This function yields tuples (iteration, image). `iteration` is None if this is the final image (the last iteration). Other tuples are yielded every `checkpoint_iterations` iterations. """ shape = (1, ) + content.shape style_shapes = [(1, ) + style.shape for style in styles] content_features = {} style_features = [{} for _ in styles] vgg_weights, vgg_mean_pixel = vgg.load_net(network) layer_weight = 1.0 style_layers_weights = {} for style_layer in STYLE_LAYERS: style_layers_weights[style_layer] = layer_weight layer_weight *= style_layer_weight_exp # normalize style layer weights layer_weights_sum = 0 for style_layer in STYLE_LAYERS: layer_weights_sum += style_layers_weights[style_layer] for style_layer in STYLE_LAYERS: style_layers_weights[style_layer] /= layer_weights_sum # compute content features in feedforward mode g = tf.Graph() with g.as_default(), g.device('/cpu:0'), tf.Session() as sess: image = tf.placeholder('float', shape=shape) net = vgg.net_preloaded(vgg_weights, image, pooling) content_pre = np.array([vgg.preprocess(content, vgg_mean_pixel)]) for layer in CONTENT_LAYERS: content_features[layer] = net[layer].eval( feed_dict={image: content_pre}) # compute style features in feedforward mode for i in range(len(styles)): g = tf.Graph() with g.as_default(), g.device('/cpu:0'), tf.Session() as sess: image = tf.placeholder('float', shape=style_shapes[i]) net = vgg.net_preloaded(vgg_weights, image, pooling) style_pre = np.array([vgg.preprocess(styles[i], vgg_mean_pixel)]) for layer in STYLE_LAYERS: features = net[layer].eval(feed_dict={image: style_pre}) features = np.reshape(features, (-1, features.shape[3])) gram = np.matmul(features.T, features) / features.size style_features[i][layer] = gram initial_content_noise_coeff = 1.0 - initial_noiseblend # make stylized image using backpropogation with tf.Graph().as_default(): if initial is None: noise = np.random.normal(size=shape, scale=np.std(content) * 0.1) initial = tf.random_normal(shape) * 0.256 else: initial = np.array([vgg.preprocess(initial, vgg_mean_pixel)]) initial = initial.astype('float32') noise = np.random.normal(size=shape, scale=np.std(content) * 0.1) initial = (initial) * initial_content_noise_coeff + ( tf.random_normal(shape) * 0.256) * (1.0 - initial_content_noise_coeff) image = tf.Variable(initial) net = vgg.net_preloaded(vgg_weights, image, pooling) # content loss content_layers_weights = { 'relu4_2': content_weight_blend, 'relu5_2': 1.0 - content_weight_blend } content_loss = 0 content_losses = [] for content_layer in CONTENT_LAYERS: content_losses.append( content_layers_weights[content_layer] * content_weight * (2 * tf.nn.l2_loss(net[content_layer] - content_features[content_layer]) / content_features[content_layer].size)) content_loss += reduce(tf.add, content_losses) # style loss style_loss = 0 for i in range(len(styles)): style_losses = [] for style_layer in STYLE_LAYERS: layer = net[style_layer] _, height, width, number = map(lambda i: i.value, layer.get_shape()) size = height * width * number feats = tf.reshape(layer, (-1, number)) gram = tf.matmul(tf.transpose(feats), feats) / size style_gram = style_features[i][style_layer] style_losses.append(style_layers_weights[style_layer] * 2 * tf.nn.l2_loss(gram - 
style_gram) / style_gram.size) style_loss += style_weight * style_blend_weights[i] * reduce( tf.add, style_losses) # total variation denoising tv_y_size = _tensor_size(image[:, 1:, :, :]) tv_x_size = _tensor_size(image[:, :, 1:, :]) tv_loss = tv_weight * 2 * ( (tf.nn.l2_loss(image[:, 1:, :, :] - image[:, :shape[1] - 1, :, :]) / tv_y_size) + (tf.nn.l2_loss(image[:, :, 1:, :] - image[:, :, :shape[2] - 1, :]) / tv_x_size)) # overall loss loss = content_loss + style_loss + tv_loss # optimizer setup train_step = tf.train.AdamOptimizer(learning_rate, beta1, beta2, epsilon).minimize(loss) def print_progress(): stderr.write(' content loss: %g\n' % content_loss.eval()) stderr.write(' style loss: %g\n' % style_loss.eval()) stderr.write(' tv loss: %g\n' % tv_loss.eval()) stderr.write(' total loss: %g\n' % loss.eval()) # optimization best_loss = float('inf') best = None with tf.Session() as sess: sess.run(tf.global_variables_initializer()) stderr.write('Optimization started...\n') if (print_iterations and print_iterations != 0): print_progress() for i in tqdm(range(iterations)): #stderr.write('Iteration %4d/%4d\n' % (i + 1, iterations)) train_step.run() last_step = (i == iterations - 1) if last_step or (print_iterations and i % print_iterations == 0): print_progress() if (checkpoint_iterations and i % checkpoint_iterations == 0) or last_step: this_loss = loss.eval() if this_loss < best_loss: best_loss = this_loss best = image.eval() img_out = vgg.unprocess(best.reshape(shape[1:]), vgg_mean_pixel) if luminance_transfer: original_image = np.clip(content, 0, 255) styled_image = np.clip(img_out, 0, 255) # Luminosity transfer steps: # 1. Convert stylized image into YUV (YCbCr) # 2. Convert original image into YUV (YCbCr) # 3. Recombine (stylizedYUV.Y, originalYUV.U, originalYUV.V) # 4. Convert recombined image from YUV back to RGB # 1 styled_yuv = np.array( Image.fromarray(styled_image.astype( np.uint8)).convert('YCbCr')) # 2 original_yuv = np.array( Image.fromarray(original_image.astype( np.uint8)).convert('YCbCr')) # 3 h, w, _ = original_image.shape combined_yuv = np.empty((h, w, 3), dtype=np.uint8) combined_yuv[..., 0] = styled_yuv[..., 0] combined_yuv[..., 1] = original_yuv[..., 1] combined_yuv[..., 2] = original_yuv[..., 2] # 4 img_out = np.array( Image.fromarray(combined_yuv, 'YCbCr').convert('RGB')) yield ((None if last_step else i), img_out)
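# The same four steps, factored into a reusable function for clarity (a
# sketch; PIL handles the YCbCr conversions exactly as in the inline code):
import numpy as np
from PIL import Image

def transfer_luminance(stylized, original):
    # keep Y from the stylized image, Cb/Cr from the original
    styled_yuv = np.array(Image.fromarray(stylized.astype(np.uint8)).convert('YCbCr'))
    orig_yuv = np.array(Image.fromarray(original.astype(np.uint8)).convert('YCbCr'))
    combined = np.dstack((styled_yuv[..., 0], orig_yuv[..., 1], orig_yuv[..., 2]))
    return np.array(Image.fromarray(combined, 'YCbCr').convert('RGB'))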
def main(): global options, device # Get the ENV context script_dir = os.path.dirname(__file__) env = os.environ.copy() # Set the input folder input_dir = os.path.expanduser(options.input_dir) if options.input_dir \ else os.path.join(script_dir, '..', 'data') vgg_path = os.path.join(input_dir, 'vgg', 'imagenet-vgg-verydeep-19.mat') coco_dir = os.path.join(input_dir, 'train') if not os.path.isdir(input_dir): fail('Failed to find the input folder at ' + input_dir) if not os.path.isfile(vgg_path): error('Failed to find the VGG model file at ' + vgg_path) fail( 'Please download it from http://www.vlfeat.org/matconvnet/models/beta16/imagenet-vgg-verydeep-19.mat' ) if not os.path.isdir(coco_dir): error('Failed to find the COCO 2014 training images in ' + coco_dir) fail( 'Please download it from http://images.cocodataset.org/zips/train2014.zip' ) # Set the output folder output_dir = os.path.expanduser(options.output_dir) if options.output_dir \ else env.get('OUTPUT_DIR', os.path.join(script_dir, '..', 'output')) model_dir = os.path.join(output_dir, 'checkpoint') export_dir = os.path.join(output_dir, 'savedmodel') if os.path.isdir(output_dir): if not os.path.isdir(model_dir): info('Creating a folder to store checkpoint at ' + model_dir) os.makedirs(model_dir) if os.path.isdir(export_dir): info('Deleting the folder containing SavedModel at ' + export_dir) shutil.rmtree(export_dir) else: info('Creating a folder to store checkpoint at ' + model_dir) os.makedirs(model_dir) # Set the TensorBoard folder log_dir = os.path.expanduser(options.log_dir) if options.log_dir \ else env.get('LOG_DIR', os.path.join(script_dir, '..', 'log')) if not os.path.isdir(log_dir): info('Creating a folder to store TensorBoard events at ' + log_dir) os.makedirs(log_dir) # Set the style image path style_path = os.path.expanduser(options.style_image) if os.path.isfile(options.style_image) \ else os.path.join(input_dir, 'style_images', options.style_image) style_name = os.path.basename(os.path.splitext(style_path)[0]) ckpt_path = os.path.join(model_dir, style_name + '.ckpt') if not os.path.isfile(style_path): fail('Failed to find the style image at ' + style_path) # Set hyperparameters batch_size = options.batch_size epochs = options.epoch lr = options.lr lambda_tv = options.lambda_tv lambda_feat = options.lambda_feat lambda_style = options.lambda_style # Print parsed arguments info('--------- Training parameters -------->') info('Style image path: ' + style_path) info('VGG model path: ' + vgg_path) info('Training image dir: ' + coco_dir) info('Checkpoint path: ' + ckpt_path) info('TensorBoard log dir: ' + log_dir) info('Training device: ' + device) info('Batch size: %d' % batch_size) info('Epoch count: %d' % epochs) info('Learning rate: ' + str(lr)) info('Lambda tv: ' + str(lambda_tv)) info('Lambda feat: ' + str(lambda_feat)) info('Lambda style: ' + str(lambda_style)) info('<-------- Training parameters ---------') # COCO images to train content_targets = list_jpgs(coco_dir) if len(content_targets) % batch_size != 0: content_targets = content_targets[:-(len(content_targets) % batch_size)] info('Total training data size: %d' % len(content_targets)) # Image shape image_shape = (224, 224, 3) batch_shape = (batch_size, ) + image_shape # Style target style_target = read_img(style_path) style_shape = (1, ) + style_target.shape with tf.device(device), tf.Session() as sess: # Compute Gram matrix of style target style_image = tf.placeholder(tf.float32, shape=style_shape, name='style_image') vggstyletarget = vgg.net(vgg_path,
vgg.preprocess(style_image)) style_vgg = vgg.get_style_vgg(vggstyletarget, style_image, np.array([style_target])) # Content target feature content_vgg = {} inputs = tf.placeholder(tf.float32, shape=batch_shape, name='inputs') content_net = vgg.net(vgg_path, vgg.preprocess(inputs)) content_vgg['relu4_2'] = content_net['relu4_2'] # Feature after transformation outputs = stylenet.net(inputs / 255.0) vggoutputs = vgg.net(vgg_path, vgg.preprocess(outputs)) # Compute feature loss loss_f = options.lambda_feat * vgg.total_content_loss( vggoutputs, content_vgg, batch_size) # Compute style loss loss_s = options.lambda_style * vgg.total_style_loss( vggoutputs, style_vgg, batch_size) # Total variation denoising loss_tv = options.lambda_tv * vgg.total_variation_regularization( outputs, batch_size, batch_shape) # Total loss total_loss = loss_f + loss_s + loss_tv train_step = tf.train.AdamOptimizer(options.lr).minimize(total_loss) # Create summary tf.summary.scalar('loss', total_loss) merged = tf.summary.merge_all() # Used to save model saver = tf.train.Saver() builder = tf.saved_model.builder.SavedModelBuilder(export_dir) with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: # Restore checkpoint if available sess.run(tf.global_variables_initializer()) ckpt = tf.train.get_checkpoint_state(model_dir) if ckpt and ckpt.model_checkpoint_path: info('Restoring from ' + ckpt.model_checkpoint_path) saver.restore(sess, ckpt.model_checkpoint_path) # Write the graph writer = tf.summary.FileWriter(log_dir, sess.graph) # Start to train total_step = 0 for epoch in range(epochs): info('epoch: %d' % epoch) step = 0 while step * batch_size < len(content_targets): time_start = time.time() # Load one batch batch = np.zeros(batch_shape, dtype=np.float32) for i, img in enumerate( content_targets[step * batch_size:(step + 1) * batch_size]): batch[i] = read_img(img, image_shape).astype( np.float32) # (224,224,3) # Proceed one step step += 1 total_step += 1 _, loss, summary = sess.run([train_step, total_loss, merged], feed_dict={inputs: batch}) time_elapse = time.time() - time_start if total_step % 5 == 0: info('[step {}] elapse time: {} loss: {}'.format( total_step, time_elapse, loss)) writer.add_summary(summary, total_step) # Write checkpoint if total_step % 2000 == 0: info('Saving checkpoint to ' + ckpt_path) saver.save(sess, ckpt_path, global_step=total_step) info('Exporting SavedModel to ' + export_dir) serving_signatures = { 'Transfer': #tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: tf.saved_model.signature_def_utils.predict_signature_def( { tf.saved_model.signature_constants.PREDICT_INPUTS: inputs }, { tf.saved_model.signature_constants.PREDICT_OUTPUTS: outputs } ) } builder.add_meta_graph_and_variables( sess, [tf.saved_model.tag_constants.SERVING], signature_def_map=serving_signatures, clear_devices=True) builder.save()
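# A sketch of serving the exported model with the TF1 loader API; export_dir
# is the folder written by builder.save() above, and the zero batch is only a
# stand-in (the inputs placeholder is bound to batch_shape, so the fed batch
# must match batch_size).
import numpy as np
import tensorflow as tf

with tf.Session(graph=tf.Graph()) as sess:
    meta = tf.saved_model.loader.load(
        sess, [tf.saved_model.tag_constants.SERVING], export_dir)
    sig = meta.signature_def['Transfer']
    in_name = sig.inputs[tf.saved_model.signature_constants.PREDICT_INPUTS].name
    out_name = sig.outputs[tf.saved_model.signature_constants.PREDICT_OUTPUTS].name
    batch = np.zeros((4, 224, 224, 3), dtype=np.float32)
    styled = sess.run(out_name, feed_dict={in_name: batch})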
def stylize(network, initial, initial_noiseblend, content, styles, preserve_colors, iterations, content_weight, content_weight_blend, style_weight, style_layer_weight_exp, style_blend_weights, tv_weight, learning_rate, beta1, beta2, epsilon, pooling, print_iterations=None, checkpoint_iterations=None): """ Stylize images. This function yields tuples (iteration, image); `iteration` is None if this is the final image (the last iteration). Other tuples are yielded every `checkpoint_iterations` iterations. :rtype: iterator[tuple[int|None,image]] """ shape = (1,) + content.shape style_shapes = [(1,) + style.shape for style in styles] content_features = {} style_features = [{} for _ in styles] vgg_weights, vgg_mean_pixel = vgg.load_net(network) layer_weight = 1.0 style_layers_weights = {} for style_layer in STYLE_LAYERS: style_layers_weights[style_layer] = layer_weight layer_weight *= style_layer_weight_exp # normalize style layer weights layer_weights_sum = 0 for style_layer in STYLE_LAYERS: layer_weights_sum += style_layers_weights[style_layer] for style_layer in STYLE_LAYERS: style_layers_weights[style_layer] /= layer_weights_sum # compute content features in feedforward mode g = tf.Graph() with g.as_default(), g.device('/cpu:0'), tf.Session() as sess: image = tf.placeholder('float', shape=shape) net = vgg.net_preloaded(vgg_weights, image, pooling) content_pre = np.array([vgg.preprocess(content, vgg_mean_pixel)]) for layer in CONTENT_LAYERS: content_features[layer] = net[layer].eval(feed_dict={image: content_pre}) # compute style features in feedforward mode for i in range(len(styles)): g = tf.Graph() with g.as_default(), g.device('/cpu:0'), tf.Session() as sess: image = tf.placeholder('float', shape=style_shapes[i]) net = vgg.net_preloaded(vgg_weights, image, pooling) style_pre = np.array([vgg.preprocess(styles[i], vgg_mean_pixel)]) for layer in STYLE_LAYERS: features = net[layer].eval(feed_dict={image: style_pre}) features = np.reshape(features, (-1, features.shape[3])) gram = np.matmul(features.T, features) / features.size style_features[i][layer] = gram initial_content_noise_coeff = 1.0 - initial_noiseblend # make stylized image using backpropogation with tf.Graph().as_default(): if initial is None: noise = np.random.normal(size=shape, scale=np.std(content) * 0.1) initial = tf.random_normal(shape) * 0.256 else: initial = np.array([vgg.preprocess(initial, vgg_mean_pixel)]) initial = initial.astype('float32') noise = np.random.normal(size=shape, scale=np.std(content) * 0.1) initial = (initial) * initial_content_noise_coeff + (tf.random_normal(shape) * 0.256) * (1.0 - initial_content_noise_coeff) image = tf.Variable(initial) net = vgg.net_preloaded(vgg_weights, image, pooling) # content loss content_layers_weights = {} content_layers_weights['relu4_2'] = content_weight_blend content_layers_weights['relu5_2'] = 1.0 - content_weight_blend content_loss = 0 content_losses = [] for content_layer in CONTENT_LAYERS: content_losses.append(content_layers_weights[content_layer] * content_weight * (2 * tf.nn.l2_loss( net[content_layer] - content_features[content_layer]) / content_features[content_layer].size)) content_loss += reduce(tf.add, content_losses) # style loss style_loss = 0 for i in range(len(styles)): style_losses = [] for style_layer in STYLE_LAYERS: layer = net[style_layer] _, height, width, number = map(lambda i: i.value, layer.get_shape()) size = height * width * number feats = tf.reshape(layer, (-1, number)) gram = tf.matmul(tf.transpose(feats), feats) / size style_gram = 
style_features[i][style_layer] style_losses.append(style_layers_weights[style_layer] * 2 * tf.nn.l2_loss(gram - style_gram) / style_gram.size) style_loss += style_weight * style_blend_weights[i] * reduce(tf.add, style_losses) # total variation denoising tv_y_size = _tensor_size(image[:,1:,:,:]) tv_x_size = _tensor_size(image[:,:,1:,:]) tv_loss = tv_weight * 2 * ( (tf.nn.l2_loss(image[:,1:,:,:] - image[:,:shape[1]-1,:,:]) / tv_y_size) + (tf.nn.l2_loss(image[:,:,1:,:] - image[:,:,:shape[2]-1,:]) / tv_x_size)) # overall loss loss = content_loss + style_loss + tv_loss # optimizer setup train_step = tf.train.AdamOptimizer(learning_rate, beta1, beta2, epsilon).minimize(loss) def print_progress(): stderr.write(' content loss: %g\n' % content_loss.eval()) stderr.write(' style loss: %g\n' % style_loss.eval()) stderr.write(' tv loss: %g\n' % tv_loss.eval()) stderr.write(' total loss: %g\n' % loss.eval()) # optimization best_loss = float('inf') best = None with tf.Session() as sess: sess.run(tf.global_variables_initializer()) stderr.write('Optimization started...\n') if (print_iterations and print_iterations != 0): print_progress() iteration_times = [] start = time.time() for i in range(iterations): iteration_start = time.time() if i > 0: elapsed = time.time() - start # take average of last couple steps to get time per iteration remaining = np.mean(iteration_times[-10:]) * (iterations - i) stderr.write('Iteration %4d/%4d (%s elapsed, %s remaining)\n' % ( i + 1, iterations, hms(elapsed), hms(remaining) )) else: stderr.write('Iteration %4d/%4d\n' % (i + 1, iterations)) train_step.run() last_step = (i == iterations - 1) if last_step or (print_iterations and i % print_iterations == 0): print_progress() if (checkpoint_iterations and i % checkpoint_iterations == 0) or last_step: this_loss = loss.eval() if this_loss < best_loss: best_loss = this_loss best = image.eval() img_out = vgg.unprocess(best.reshape(shape[1:]), vgg_mean_pixel) if preserve_colors: original_image = np.clip(content, 0, 255) styled_image = np.clip(img_out, 0, 255) # Luminosity transfer steps: # 1. Convert stylized RGB->grayscale according to Rec.601 luma (0.299, 0.587, 0.114) # 2. Convert stylized grayscale into YUV (YCbCr) # 3. Convert original image into YUV (YCbCr) # 4. Recombine (stylizedYUV.Y, originalYUV.U, originalYUV.V) # 5. Convert recombined image from YUV back to RGB # 1 styled_grayscale = rgb2gray(styled_image) styled_grayscale_rgb = gray2rgb(styled_grayscale) # 2 styled_grayscale_yuv = np.array(Image.fromarray(styled_grayscale_rgb.astype(np.uint8)).convert('YCbCr')) # 3 original_yuv = np.array(Image.fromarray(original_image.astype(np.uint8)).convert('YCbCr')) # 4 h, w, _ = original_image.shape combined_yuv = np.empty((h, w, 3), dtype=np.uint8) combined_yuv[..., 0] = styled_grayscale_yuv[..., 0] combined_yuv[..., 1] = original_yuv[..., 1] combined_yuv[..., 2] = original_yuv[..., 2] # 5 img_out = np.array(Image.fromarray(combined_yuv, 'YCbCr').convert('RGB')) yield ( (None if last_step else i), img_out ) iteration_end = time.time() iteration_times.append(iteration_end - iteration_start)
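# hms is used in the progress line above but not defined in this section; a
# plausible helper matching that usage:
def hms(seconds):
    seconds = int(seconds)
    hours = seconds // 3600
    minutes = (seconds % 3600) // 60
    seconds = seconds % 60
    if hours > 0:
        return '%d hr %d min' % (hours, minutes)
    elif minutes > 0:
        return '%d min %d sec' % (minutes, seconds)
    return '%d sec' % seconds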
def optimize(content_targets, style_target, content_weight, style_weight, tv_weight, vgg_path, use_IN, epochs=2, print_iterations=1000, batch_size=4, save_path='checkpoints/fast_style_transfer.ckpt', slow=False, learning_rate=1e-3, debug=False): if slow: batch_size = 1 # content_target is a list of files, 4-D size, so this is about the batch size here. # If using only one content image, then mod here is 0. mod = len(content_targets) % batch_size if mod > 0: print("Train set has been trimmed slightly...") content_targets = content_targets[:-mod] # training image get to be 256 x 256 because of get_img resize, # it then get into tensorflow graph from Adam optimizer feed_dict. batch_shape = (batch_size, 256, 256, 3) style_shape = (1,) + style_target.shape # add 1 in the front for batch size, 4-D. print(f"batch_shape of the content image is: {batch_shape}") print(f"style_shape of the style image is: {style_shape}") ### Graph Construction ### # vgg won't be trained, because in vgg.py the weights are loaded through that matlab file. # computed vgg style features in gram matrices # tf.device('/cpu:0') config = v1.ConfigProto() config.gpu_options.allow_growth = True style_features = {} with tf.Graph().as_default(), v1.Session(config=config) as sess: style_image = v1.placeholder(tf.float32, shape=style_shape, name='style_image') # 4-D placeholder for feed_dict vgg_style_net = vgg.net(vgg_path, vgg.preprocess(style_image)) # extract feature volume np_style_target = np.array([style_target]) # a 3-D numpy array for feed_dict's input for layer in STYLE_LAYERS: # vgg_style_net[layer] is a tf.Tensor returned by tf.nn.relu, # eval at that layer, by running forward to that vgg layer or entire network. features = vgg_style_net[layer].eval(feed_dict={style_image:np_style_target}) # extract a fVol value features = np.reshape(features, (-1, features.shape[3])) # (N*H*W, C) gram = np.matmul(features.T, features) / features.size style_features[layer] = gram # computed vgg content feature map and both losses with tf.Graph().as_default(), v1.Session(config=config) as sess: X_content = v1.placeholder(tf.float32, shape=batch_shape, name="X_content") # 4-D vgg_content_net = vgg.net(vgg_path, vgg.preprocess(X_content)) # run ground truth image through the pre-trained model # noisy prediction image runs through feed forward conv net, then # run through vgg to extract feature volume predicitons if slow: preds = tf.Variable( tf.random.normal(X_content.get_shape()) * 0.256 ) preds_pre = preds else: preds = transform.net(X_content/255.0, use_IN) # run through the style feed forward network. why need to normalize pixel to 0-1? net = vgg.net(vgg_path, vgg.preprocess(preds)) # run generated image through the pre-trained model # _tensor_size is a reduce function only count from [1:], # so it doesn't have batch_size information. 
content_size = _tensor_size(vgg_content_net[CONTENT_LAYER]) * batch_size vgg_content_net_size = _tensor_size(vgg_content_net[CONTENT_LAYER]) vgg_transform_content_net_size = _tensor_size(net[CONTENT_LAYER]) # print(f"vgg_content_net_size is {vgg_content_net_size}") # print(vgg_content_net[CONTENT_LAYER]) # print(f"vgg_transform_content_net_size is {vgg_transform_content_net_size}") # print(net[CONTENT_LAYER]) assert vgg_content_net_size == vgg_transform_content_net_size # define loss functions # content loss content_l2_loss = 2 * tf.nn.l2_loss(net[CONTENT_LAYER] - vgg_content_net[CONTENT_LAYER]) content_loss = content_weight * (content_l2_loss / content_size) # style loss style_l2_losses = [] for style_layer in STYLE_LAYERS: layer = net[style_layer] N, H, W, C = map(lambda i : i, layer.get_shape()) feats = tf.reshape(layer, (N, H*W, C)) # N, HW, C feats_T = tf.transpose(feats, perm=[0, 2, 1]) # N, C, HW pred_gram = tf.matmul(feats_T, feats) / (H * W * C) true_gram = style_features[style_layer] # numpy array style_l2_loss = 2 * tf.nn.l2_loss(pred_gram - true_gram) style_l2_losses.append(style_l2_loss / true_gram.size) style_loss = style_weight * functools.reduce(tf.add, style_l2_losses) / batch_size # total variation denoising regularization loss # test if not needed in NN conv case and mirror padding # tv_y_size = _tensor_size(preds[:,1:,:,:]) # tv_x_size = _tensor_size(preds[:,:,1:,:]) # # N, H, W, C # y_tv = 2 * tf.nn.l2_loss(preds[:, 1:, :, :] - preds[:, :batch_shape[1]-1, :, :]) # H, down - up # x_tv = 2 * tf.nn.l2_loss(preds[:, :, 1:, :] - preds[:, :, :batch_shape[2]-1, :]) # W, right - left # tv_loss = tv_weight * (x_tv/tv_x_size + y_tv/tv_y_size) / batch_size # total loss # total_loss = content_loss + style_loss + tv_loss total_loss = content_loss + style_loss # train the feed forward net, and save weights to a checkpoint. import random uid = random.randint(1, 100) print("This random UID is: %s" % uid) optimizer = v1.train.AdamOptimizer(learning_rate).minimize(total_loss) sess.run(v1.global_variables_initializer()) for epoch in range(epochs): # epoch loop iterations = 0 num_examples = len(content_targets) # COCO train2014 ~20000 images while iterations * batch_size < num_examples: # batch loop # start training a batch start_time = time.time() X_batch = np.zeros(batch_shape, dtype=np.float32) start = iterations * batch_size end = iterations * batch_size + batch_size for i, img_p in enumerate(content_targets[start:end]): # img_p is a coco images X_batch[i] = get_img(img_p, (256,256,3)).astype(np.float32) # resize to 256 x 256 optimizer.run(feed_dict={X_content:X_batch}) end_time = time.time() # end training a batch # update training information iterations += 1 is_print_iter = int(iterations) % print_iterations == 0 is_last_train = epoch == epochs - 1 and iterations * batch_size >= num_examples if slow: is_print_iter = epoch % print_iterations == 0 if debug: print("UID: %s, batch training time: %s" % (uid, end_time - start_time)) # monitor the training losses if is_print_iter or is_last_train: _style_loss, _content_loss, _total_loss, _preds = \ sess.run([style_loss, content_loss, total_loss, preds], feed_dict={X_content:X_batch}) losses = (_style_loss, _content_loss, _total_loss) generated_image = _preds if slow: generated_image = vgg.unprocess(generated_image) else: res = v1.train.Saver().save(sess, save_path) print("yield") yield(generated_image, losses, iterations, epoch)
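# Sanity check (NumPy) that the batched Gram above matches a per-image F^T F:
import numpy as np

x = np.random.rand(2, 4, 4, 3).astype(np.float32)    # N, H, W, C
feats = x.reshape(2, -1, 3)                          # N, H*W, C
grams = np.einsum('nic,nid->ncd', feats, feats) / (4 * 4 * 3)
ref = np.stack([f.T @ f for f in feats]) / (4 * 4 * 3)
assert np.allclose(grams, ref)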
def stylize(network, initial, content, styles, iterations,
            content_weight, style_weight, style_blend_weights, tv_weight,
            learning_rate, print_iterations=None, checkpoint_iterations=None):
    """
    Stylize images.

    This function yields tuples (iteration, image); `iteration` is None
    if this is the final image (the last iteration). Other tuples are yielded
    every `checkpoint_iterations` iterations.

    :rtype: iterator[tuple[int|None,image]]
    """
    shape = (1,) + content.shape
    style_shapes = [(1,) + style.shape for style in styles]
    content_features = {}
    style_features = [{} for _ in styles]

    # compute content features in feedforward mode
    g = tf.Graph()
    with g.as_default(), g.device('/cpu:0'), tf.Session() as sess:
        image = tf.placeholder('float', shape=shape)
        net, mean_pixel = vgg.net(network, image)
        content_pre = np.array([vgg.preprocess(content, mean_pixel)])
        content_features[CONTENT_LAYER] = net[CONTENT_LAYER].eval(
                feed_dict={image: content_pre})

    # compute style features in feedforward mode
    for i in range(len(styles)):
        g = tf.Graph()
        with g.as_default(), g.device('/cpu:0'), tf.Session() as sess:
            image = tf.placeholder('float', shape=style_shapes[i])
            net, _ = vgg.net(network, image)
            style_pre = np.array([vgg.preprocess(styles[i], mean_pixel)])
            for layer in STYLE_LAYERS:
                features = net[layer].eval(feed_dict={image: style_pre})
                features = np.reshape(features, (-1, features.shape[3]))
                gram = np.matmul(features.T, features) / features.size
                style_features[i][layer] = gram

    # make stylized image using backpropagation
    with tf.Graph().as_default():
        if initial is None:
            noise = np.random.normal(size=shape, scale=np.std(content) * 0.1)
            initial = tf.random_normal(shape) * 0.256
        else:
            initial = np.array([vgg.preprocess(initial, mean_pixel)])
            initial = initial.astype('float32')
        image = tf.Variable(initial)
        net, _ = vgg.net(network, image)

        # content loss
        content_loss = content_weight * (2 * tf.nn.l2_loss(
                net[CONTENT_LAYER] - content_features[CONTENT_LAYER]) /
                content_features[CONTENT_LAYER].size)
        # style loss
        style_loss = 0
        for i in range(len(styles)):
            style_losses = []
            for style_layer in STYLE_LAYERS:
                layer = net[style_layer]
                _, height, width, number = map(lambda i: i.value, layer.get_shape())
                size = height * width * number
                feats = tf.reshape(layer, (-1, number))
                gram = tf.matmul(tf.transpose(feats), feats) / size
                style_gram = style_features[i][style_layer]
                style_losses.append(2 * tf.nn.l2_loss(gram - style_gram) / style_gram.size)
            style_loss += style_weight * style_blend_weights[i] * reduce(tf.add, style_losses)
        # total variation denoising
        tv_y_size = _tensor_size(image[:, 1:, :, :])
        tv_x_size = _tensor_size(image[:, :, 1:, :])
        tv_loss = tv_weight * 2 * (
                (tf.nn.l2_loss(image[:, 1:, :, :] - image[:, :shape[1]-1, :, :]) /
                    tv_y_size) +
                (tf.nn.l2_loss(image[:, :, 1:, :] - image[:, :, :shape[2]-1, :]) /
                    tv_x_size))
        # overall loss
        loss = content_loss + style_loss + tv_loss

        # optimizer setup
        train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)

        # timenow and current_milli_time() are module-level helpers not shown here
        def print_progress(i, last=False):
            global timenow
            stderr.write('Iteration %d/%d, time: %dms\n' % (
                    i + 1, iterations, current_milli_time() - timenow))
            timenow = current_milli_time()
            if last or (print_iterations and i % print_iterations == 0):
                stderr.write('  content loss: %g\n' % content_loss.eval())
                stderr.write('    style loss: %g\n' % style_loss.eval())
                stderr.write('       tv loss: %g\n' % tv_loss.eval())
                stderr.write('    total loss: %g\n' % loss.eval())

        # optimization
        best_loss = float('inf')
        best = None
        with tf.Session() as sess:
            sess.run(tf.initialize_all_variables())
            for i in range(iterations):
                last_step = (i == iterations - 1)
                print_progress(i, last=last_step)
                train_step.run()

                if (checkpoint_iterations and i % checkpoint_iterations == 0) or last_step:
                    this_loss = loss.eval()
                    if this_loss < best_loss:
                        best_loss = this_loss
                        best = image.eval()
                    yield (
                        (None if last_step else i),
                        vgg.unprocess(best.reshape(shape[1:]), mean_pixel)
                    )
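# Every variant in this file reduces a VGG activation volume to the same
# second-order statistic: flatten the spatial axes into rows, then take the
# channel product F^T F normalized by the element count. A standalone NumPy
# sketch of that step, for reference (function name ours):
import numpy as np

def gram_matrix_np(features_nhwc):
    """features_nhwc: activations of shape (1, H, W, C) -> (C, C) Gram matrix."""
    f = np.reshape(features_nhwc, (-1, features_nhwc.shape[3]))  # (H*W, C)
    return np.matmul(f.T, f) / f.size                            # (C, C)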
def model_neural_style(pre_train_vgg_path, content_image, style_images,
                       content_weight=5e0, content_weight_blend=1.0,
                       style_weight=5e2, style_layer_weight_exp=1.0,
                       style_blend_weights=None, pooling='',
                       initial=None, initial_noiseblend=1.0,
                       tv_weight=1e2, learning_rate=1e1,
                       beta1=0.9, beta2=0.999, epsilon=1e-08,
                       print_iterations=None, iterations=500,
                       checkpoint_iterations=50, preserve_colors=None):
    # style_blend_weights is used in the style loss below but was missing
    # from the original signature; default to equal weighting per style image.
    if style_blend_weights is None:
        style_blend_weights = [1.0 / len(style_images)] * len(style_images)

    # input shape of the model
    shape = (1,) + content_image.shape
    style_images_shapes = [(1,) + style_image.shape
                           for style_image in style_images]
    content_features = {}
    style_features = [{} for _ in style_images]

    # load the weights of the pretrained vgg model
    vgg_weights, vgg_mean_pixel = vgg.load_weights(pre_train_vgg_path)

    layer_weight = 1.0
    style_layers_weights = {}
    for style_layer in STYLE_LAYERS:
        style_layers_weights[style_layer] = layer_weight
        layer_weight *= style_layer_weight_exp

    # normalize style layer weights
    layer_weights_sum = 0
    for style_layer in STYLE_LAYERS:
        layer_weights_sum += style_layers_weights[style_layer]
    for style_layer in STYLE_LAYERS:
        style_layers_weights[style_layer] /= layer_weights_sum

    # compute content features in feedforward mode
    g = tf.Graph()
    with g.as_default(), g.device('/cpu:0'), tf.Session() as sess:
        image = tf.placeholder('float', shape=shape)
        net = vgg.net_infer(vgg_weights, image, pooling)
        content_pre = np.array([vgg.preprocess(content_image, vgg_mean_pixel)])
        for layer in CONTENT_LAYERS:
            content_features[layer] = net[layer].eval(
                    feed_dict={image: content_pre})

    # # for debug: visualize every 8th content feature map
    # for layer in CONTENT_LAYERS:
    #     item = content_features[layer]
    #     item = item.reshape(item.shape[1], item.shape[2], item.shape[3])
    #     item_for_plot = [item[:, :, i] for i in range(item.shape[2])]
    #     tools.show_images(item_for_plot[::8], cols=8)

    # compute style features in feedforward mode
    for i in range(len(style_images)):
        g = tf.Graph()
        with g.as_default(), g.device('/cpu:0'), tf.Session() as sess:
            image = tf.placeholder('float', shape=style_images_shapes[i])
            net = vgg.net_infer(vgg_weights, image, pooling)
            style_pre = np.array(
                    [vgg.preprocess(style_images[i], vgg_mean_pixel)])
            for layer in STYLE_LAYERS:
                features = net[layer].eval(feed_dict={image: style_pre})
                features = np.reshape(features, (-1, features.shape[3]))
                gram = np.matmul(features.T, features) / features.size
                style_features[i][layer] = gram

    initial_content_noise_coeff = 1.0 - initial_noiseblend

    # make stylized image using backpropagation
    with tf.Graph().as_default():
        if initial is None:
            noise = np.random.normal(size=shape, scale=np.std(content_image) * 0.1)
            initial = tf.random_normal(shape) * 0.256
        else:
            initial = np.array([vgg.preprocess(initial, vgg_mean_pixel)])
            initial = initial.astype('float32')
            noise = np.random.normal(size=shape, scale=np.std(content_image) * 0.1)
            initial = (initial * initial_content_noise_coeff +
                       (tf.random_normal(shape) * 0.256) *
                       (1.0 - initial_content_noise_coeff))
        image = tf.Variable(initial)
        net = vgg.net_infer(vgg_weights, image, pooling)

        # content loss
        content_layers_weights = {}
        content_layers_weights['relu4_2'] = content_weight_blend
        content_layers_weights['relu5_2'] = 1.0 - content_weight_blend

        content_loss = 0
        content_losses = []
        for content_layer in CONTENT_LAYERS:
            content_losses.append(
                    content_layers_weights[content_layer] * content_weight *
                    (2 * tf.nn.l2_loss(net[content_layer] -
                                       content_features[content_layer]) /
                     content_features[content_layer].size))
        content_loss += reduce(tf.add, content_losses)

        # style loss
        style_loss = 0
        for i in range(len(style_images)):
            style_losses = []
            for style_layer in STYLE_LAYERS:
                layer = net[style_layer]
                _, height, width, number = map(lambda i: i.value, layer.get_shape())
                size = height * width * number
                feats = tf.reshape(layer, (-1, number))
                gram = tf.matmul(tf.transpose(feats), feats) / size
                style_gram = style_features[i][style_layer]
                style_losses.append(style_layers_weights[style_layer] * 2 *
                                    tf.nn.l2_loss(gram - style_gram) / style_gram.size)
            style_loss += style_weight * style_blend_weights[i] * reduce(
                    tf.add, style_losses)

        # total variation denoising
        tv_y_size = _tensor_size(image[:, 1:, :, :])
        tv_x_size = _tensor_size(image[:, :, 1:, :])
        tv_loss = tv_weight * 2 * (
                (tf.nn.l2_loss(image[:, 1:, :, :] - image[:, :shape[1] - 1, :, :]) /
                 tv_y_size) +
                (tf.nn.l2_loss(image[:, :, 1:, :] - image[:, :, :shape[2] - 1, :]) /
                 tv_x_size))

        # overall loss
        loss = content_loss + style_loss + tv_loss

        # optimizer setup
        train_step = tf.train.AdamOptimizer(learning_rate, beta1, beta2,
                                            epsilon).minimize(loss)

        def print_progress():
            stderr.write('  content loss: %g\n' % content_loss.eval())
            stderr.write('    style loss: %g\n' % style_loss.eval())
            stderr.write('       tv loss: %g\n' % tv_loss.eval())
            stderr.write('    total loss: %g\n' % loss.eval())

        # optimization
        best_loss = float('inf')
        best = None
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            stderr.write('Optimization started...\n')
            if print_iterations and print_iterations != 0:
                print_progress()
            for i in range(iterations):
                stderr.write('Iteration %4d/%4d\n' % (i + 1, iterations))
                train_step.run()

                last_step = (i == iterations - 1)
                if last_step or (print_iterations and i % print_iterations == 0):
                    print_progress()

                if (checkpoint_iterations and i % checkpoint_iterations == 0) or last_step:
                    this_loss = loss.eval()
                    if this_loss < best_loss:
                        best_loss = this_loss
                        best = image.eval()
                    img_out = vgg.unprocess(best.reshape(shape[1:]), vgg_mean_pixel)

                    if preserve_colors:
                        original_image = np.clip(content_image, 0, 255)
                        styled_image = np.clip(img_out, 0, 255)

                        # Luminosity transfer steps:
                        # 1. Convert stylized RGB -> grayscale according to Rec.601 luma (0.299, 0.587, 0.114)
                        # 2. Convert stylized grayscale into YUV (YCbCr)
                        # 3. Convert original image into YUV (YCbCr)
                        # 4. Recombine (stylizedYUV.Y, originalYUV.U, originalYUV.V)
                        # 5. Convert recombined image from YUV back to RGB

                        # 1
                        styled_grayscale = rgb2gray(styled_image)
                        styled_grayscale_rgb = gray2rgb(styled_grayscale)
                        # 2
                        styled_grayscale_yuv = np.array(
                                Image.fromarray(
                                        styled_grayscale_rgb.astype(np.uint8)).convert('YCbCr'))
                        # 3
                        original_yuv = np.array(
                                Image.fromarray(original_image.astype(np.uint8)).convert('YCbCr'))
                        # 4
                        w, h, _ = original_image.shape
                        combined_yuv = np.empty((w, h, 3), dtype=np.uint8)
                        combined_yuv[..., 0] = styled_grayscale_yuv[..., 0]
                        combined_yuv[..., 1] = original_yuv[..., 1]
                        combined_yuv[..., 2] = original_yuv[..., 2]
                        # 5
                        img_out = np.array(
                                Image.fromarray(combined_yuv, 'YCbCr').convert('RGB'))

                    yield ((None if last_step else i), img_out)
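# The preserve_colors branch above calls rgb2gray and gray2rgb helpers that
# are not defined in this file. A plausible minimal implementation, assuming
# the Rec.601 luma weights named in the comments above:
import numpy as np

def rgb2gray(rgb):
    # Rec.601 luma: 0.299 R + 0.587 G + 0.114 B
    return np.dot(rgb[..., :3], [0.299, 0.587, 0.114])

def gray2rgb(gray):
    # replicate the single luminance channel into three identical channels
    h, w = gray.shape
    rgb = np.empty((h, w, 3), dtype=np.float32)
    rgb[:, :, 0] = rgb[:, :, 1] = rgb[:, :, 2] = gray
    return rgb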
def stylize(network, initial, content, styles, iterations,
            content_weight, style_weight, style_blend_weights, tv_weight,
            learning_rate, print_iterations=None, checkpoint_iterations=None):
    # NOTE: this variant targets Python 2 and a pre-1.0 TensorFlow
    # (print statements, xrange, tf.mul).
    shape = (1,) + content.shape
    style_shapes = [(1,) + style.shape for style in styles]
    content_features = {}
    style_features = [{} for _ in styles]

    # compute content features in feedforward mode
    g = tf.Graph()
    with g.as_default(), g.device('/cpu:0'), tf.Session() as sess:
        image = tf.placeholder('float', shape=shape)
        net, mean_pixel = vgg.net(network, image)
        content_pre = np.array([vgg.preprocess(content, mean_pixel)])
        content_features[CONTENT_LAYER] = net[CONTENT_LAYER].eval(
                feed_dict={image: content_pre})

    # compute style features in feedforward mode
    for i in range(len(styles)):
        g = tf.Graph()
        with g.as_default(), g.device('/cpu:0'), tf.Session() as sess:
            image = tf.placeholder('float', shape=style_shapes[i])
            net, _ = vgg.net(network, image)
            style_pre = np.array([vgg.preprocess(styles[i], mean_pixel)])
            for layer in STYLE_LAYERS:
                features = net[layer].eval(feed_dict={image: style_pre})
                print 'Initial feature shape: ', features.shape
                features = np.reshape(features, (-1, features.shape[3]))
                # mask = np.zeros_like(features)
                # mask[:49664/2, :] = 1
                # print 'Mask shape', mask.shape
                print 'Final features shape', features.shape
                # features = features * mask
                gram = np.matmul(features.T, features) / features.size
                print 'Gram matrix shape: ', gram.shape
                style_features[i][layer] = gram
    # sys.exit()

    # make stylized image using backpropagation
    with tf.Graph().as_default():
        if initial is None:
            noise = np.random.normal(size=shape, scale=np.std(content) * 0.1)
            initial = tf.random_normal(shape) * 0.256
        else:
            initial = np.array([vgg.preprocess(initial, mean_pixel)])
            initial = initial.astype('float32')
        image = tf.Variable(initial)
        net, _ = vgg.net(network, image)

        # content loss
        content_loss = content_weight * (2 * tf.nn.l2_loss(
                net[CONTENT_LAYER] - content_features[CONTENT_LAYER]) /
                content_features[CONTENT_LAYER].size)
        # style loss
        style_loss = 0
        for i in range(len(styles)):
            style_losses = []
            for style_layer in STYLE_LAYERS:
                layer = net[style_layer]
                _, height, width, number = map(lambda i: i.value, layer.get_shape())
                print 'Height, width, number', height, width, number
                size = height * width * number
                feats = tf.reshape(layer, (-1, number))
                print 'Style features shape', style_features[i][style_layer].shape
                print style_layer
                if style_layer == 'relu2_1':
                    # relu2_1 has half the spatial resolution of the input, so
                    # downsample the mask 2x (max over 2x2 blocks) before
                    # thresholding and broadcasting it across channels.
                    mask = np.zeros((height * width, number), dtype=np.float32)
                    temp = imread('emma/emma_test_mask.jpg').astype(np.float32)
                    c = temp.reshape(height, 2, width, 2)
                    temp = c.max(axis=1).max(axis=2)
                    print 'Downsampled mask shape', temp.shape
                    maskt = np.reshape(temp, (height * width,))
                    maskt = maskt > 100
                    for d in xrange(number):
                        mask[:, d] = maskt
                    print 'Mask shape', mask.shape
                    # b = mask.reshape(height*width*2, 2, number/2, 2)
                    # mask = b.max(axis=1).max(axis=2)
                    # print 'New mask shape', mask.shape
                else:
                    mask = np.zeros((height * width, number), dtype=np.float32)
                    maskt = np.reshape(
                            imread('emma/emma_test_mask.jpg').astype(np.float32),
                            (height * width,))
                    maskt = maskt > 100
                    for d in xrange(number):
                        mask[:, d] = maskt
                    print 'Mask shape', mask.shape
                if i == 0:
                    mask = tf.constant(mask)
                    print 'Mask shape', map(lambda i: i.value, mask.get_shape())
                    feats = tf.mul(feats, mask)
                    gram = tf.matmul(tf.transpose(feats), feats) / size
                    style_gram = style_features[i][style_layer]
                    style_losses.append(2 * tf.nn.l2_loss(gram - style_gram) / style_gram.size)
                else:
                    # For subsequent styles, use the complement of the mask.
                    # (mask < 1) yields a boolean array; cast it back to
                    # float32 so tf.mul does not fail on a dtype mismatch.
                    mask2 = (mask < 1).astype(np.float32)
                    feats2 = tf.mul(feats, mask2)
                    gram2 = tf.matmul(tf.transpose(feats2), feats2) / size
                    style_gram = style_features[i][style_layer]
                    style_losses.append(2 * tf.nn.l2_loss(gram2 - style_gram) / style_gram.size)
            style_loss += style_weight * style_blend_weights[i] * reduce(tf.add, style_losses)

        # total variation denoising
        tv_y_size = _tensor_size(image[:, 1:, :, :])
        tv_x_size = _tensor_size(image[:, :, 1:, :])
        tv_loss = tv_weight * 2 * (
                (tf.nn.l2_loss(image[:, 1:, :, :] - image[:, :shape[1]-1, :, :]) /
                 tv_y_size) +
                (tf.nn.l2_loss(image[:, :, 1:, :] - image[:, :, :shape[2]-1, :]) /
                 tv_x_size))
        # overall loss
        loss = content_loss + style_loss + tv_loss

        # optimizer setup
        train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)

        def print_progress(i, last=False):
            if print_iterations is not None:
                if (i is not None and i % print_iterations == 0) or last:
                    print >> stderr, '  content loss: %g' % content_loss.eval()
                    print >> stderr, '    style loss: %g' % style_loss.eval()
                    print >> stderr, '       tv loss: %g' % tv_loss.eval()
                    print >> stderr, '    total loss: %g' % loss.eval()

        # optimization
        best_loss = float('inf')
        best = None
        with tf.Session() as sess:
            sess.run(tf.initialize_all_variables())
            for i in range(iterations):
                print_progress(i)
                print >> stderr, 'Iteration %d/%d' % (i + 1, iterations)
                train_step.run()

                if (checkpoint_iterations is not None and
                        i % checkpoint_iterations == 0) or i == iterations - 1:
                    this_loss = loss.eval()
                    if this_loss < best_loss:
                        best_loss = this_loss
                        best = image.eval()
                print_progress(None, i == iterations - 1)
                if i % 10 == 0 and best is not None:
                    tmp_img = vgg.unprocess(best.reshape(shape[1:]), mean_pixel)
                    imsave('iter' + str(i) + '.jpg', tmp_img)
            return vgg.unprocess(best.reshape(shape[1:]), mean_pixel)
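# The relu2_1 branch above shrinks the binary mask by a factor of two with a
# reshape-and-max trick before thresholding. A standalone sketch of that 2x2
# max-pool downsampling (the function name is ours, for illustration):
import numpy as np

def downsample_mask_2x(mask_2h_2w):
    """(2H, 2W) -> (H, W), taking the max over each 2x2 block."""
    h2, w2 = mask_2h_2w.shape
    blocks = mask_2h_2w.reshape(h2 // 2, 2, w2 // 2, 2)
    return blocks.max(axis=(1, 3))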
def stylize(network, initial, content, style, iterations,
            content_weight, style_weight, tv_weight, learning_rate,
            print_iter=None):
    shape = (1,) + content.shape
    style_shape = (1,) + style.shape
    content_features = {}
    style_features = {}

    g = tf.Graph()
    with g.as_default(), g.device('/cpu:0'), tf.Session() as sess:
        image = tf.placeholder('float', shape=shape)
        net, mean_pixel = vgg.net(network, image)
        content_pre = np.array([vgg.preprocess(content, mean_pixel)])
        content_features[CONTENT_LAYER] = net[CONTENT_LAYER].eval(
                feed_dict={image: content_pre})

    g = tf.Graph()
    with g.as_default(), g.device('/cpu:0'), tf.Session() as sess:
        image = tf.placeholder('float', shape=style_shape)
        net, _ = vgg.net(network, image)
        style_pre = np.array([vgg.preprocess(style, mean_pixel)])
        for layer in STYLE_LAYERS:
            features = net[layer].eval(feed_dict={image: style_pre})
            features = np.reshape(features, (-1, features.shape[3]))
            gram = np.matmul(features.T, features) / features.size
            style_features[layer] = gram

    with tf.Graph().as_default():
        if initial is None:
            noise = np.random.normal(size=shape, scale=np.std(content) * 0.1)
            initial = tf.random_normal(shape) * 256 / 1000
        else:
            initial = np.array([vgg.preprocess(initial, mean_pixel)])
            initial = initial.astype('float32')
        image = tf.Variable(initial)
        net, _ = vgg.net(network, image)

        content_loss = tf.nn.l2_loss(net[CONTENT_LAYER] - content_features[CONTENT_LAYER])
        style_losses = []
        for i in STYLE_LAYERS:
            layer = net[i]
            _, height, width, number = map(lambda i: i.value, layer.get_shape())
            size = height * width * number
            feats = tf.reshape(layer, (-1, number))
            gram = tf.matmul(tf.transpose(feats), feats) / size
            style_gram = style_features[i]
            style_losses.append(tf.nn.l2_loss(gram - style_gram))
        style_loss = reduce(tf.add, style_losses) / len(style_losses)
        tv_loss = (tf.nn.l2_loss(image[:, 1:, :, :] - image[:, :shape[1]-1, :, :]) +
                   tf.nn.l2_loss(image[:, :, 1:, :] - image[:, :, :shape[2]-1, :]))
        loss = (content_weight * content_loss +
                style_weight * style_loss +
                tv_weight * tv_loss)

        train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)

        with tf.Session() as sess:
            sess.run(tf.initialize_all_variables())
            for i in range(iterations):
                if print_iter is not None and i % print_iter == 0:
                    print '  content loss: %g' % content_loss.eval()
                    print '    style loss: %g' % style_loss.eval()
                    print '       tv loss: %g' % tv_loss.eval()
                    print '    total loss: %g' % loss.eval()
                print 'Iteration %d/%d' % (i + 1, iterations)
                train_step.run()
            return vgg.unprocess(image.eval().reshape(shape[1:]), mean_pixel)
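# Every variant above depends on a _tensor_size helper that is not shown in
# this file. Per the comment in `optimize`, it reduces over dims [1:] (the
# non-batch axes); a sketch under that assumption:
from functools import reduce
from operator import mul

def _tensor_size(tensor):
    # product of the static dimensions after the batch axis
    return reduce(mul, (int(d) for d in tensor.get_shape()[1:]), 1)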