Example #1
def inference(images,
              train_mode,
              pruned_vgg=False,
              pooling_type="max",
              activation_function="relu"):
    """Build the model up to where it may be used for inference.
    Args:
      images: Images placeholder, from inputs().
      train_mode: Boolean tensor (or Python bool) selecting training vs.
        inference behaviour.
      pruned_vgg: If True, build the pruned VGG-19 variant instead.
      pooling_type: Pooling used in the convolutional blocks ("max" by default).
      activation_function: Nonlinearity used throughout the network.
    Returns:
      softmax_linear: Output tensor with the computed logits.
    """
    import vgg
    v = vgg.Vgg19(int(images.shape[2]),
                  int(images.shape[1]),
                  activation_function=activation_function)
    if pruned_vgg:
        return v.build_pruned_vgg(images)
    return v.build(images, train_mode, pooling_type=pooling_type)
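A minimal usage sketch, assuming a vgg module that matches the calls above; the shapes and session handling here are illustrative, not from the source:

import numpy as np
import tensorflow as tf

images = tf.placeholder(tf.float32, [None, 224, 224, 3], name="images")
train_mode = tf.placeholder(tf.bool, name="train_mode")
logits = inference(images, train_mode, pooling_type="max")

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batch = np.zeros([8, 224, 224, 3], dtype=np.float32)  # dummy batch
    out = sess.run(logits, feed_dict={images: batch, train_mode: False})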
Example #2
    def __init__(self, opt):
        self.vocab_size = opt.vocab_size
        self.input_encoding_size = opt.input_encoding_size
        self.rnn_size = opt.rnn_size
        self.num_layers = opt.num_layers
        self.drop_prob_lm = opt.drop_prob_lm
        self.seq_length = opt.seq_length
        self.seq_per_img = opt.seq_per_img
        self.att_hid_size = opt.att_hid_size

        self.opt = opt

        # Variable indicating whether we are in training or evaluation mode
        self.training = tf.Variable(True, trainable=False, name="training")

        # Input variables
        self.images = tf.placeholder(tf.float32, [None, 224, 224, 3],
                                     name="images")
        self.labels = tf.placeholder(tf.int32, [None, self.seq_length + 2],
                                     name="labels")
        self.masks = tf.placeholder(tf.float32, [None, self.seq_length + 2],
                                    name="masks")

        # Build CNN
        if vars(self.opt).get('start_from', None):
            cnn_weight = None
        else:
            cnn_weight = vars(self.opt).get('cnn_weight', None)
        if self.opt.cnn_model == 'vgg16':
            self.cnn = vgg.Vgg16(cnn_weight)
        if self.opt.cnn_model == 'vgg19':
            self.cnn = vgg.Vgg19(cnn_weight)

        with tf.variable_scope("cnn"):
            self.cnn.build(self.images)

        if self.opt.cnn_model == 'vgg16':
            self.context = self.cnn.conv5_3
        if self.opt.cnn_model == 'vgg19':
            self.context = self.cnn.conv5_4
        self.fc7 = self.cnn.drop7
        self.cnn_training = self.cnn.training

        # Variables in the language model
        with tf.variable_scope("rnnlm"):
            # Word Embedding table
            self.Wemb = tf.Variable(tf.random_uniform(
                [self.vocab_size + 1, self.input_encoding_size], -0.1, 0.1),
                                    name='Wemb')

            # RNN cell
            if opt.rnn_type == 'rnn':
                self.cell_fn = cell_fn = tf.contrib.rnn.BasicRNNCell
            elif opt.rnn_type == 'gru':
                self.cell_fn = cell_fn = tf.contrib.rnn.GRUCell
            elif opt.rnn_type == 'lstm':
                self.cell_fn = cell_fn = tf.contrib.rnn.LSTMCell
            else:
                raise Exception("RNN type not supported: {}".format(
                    opt.rnn_type))

            # keep_prob is a function of the training flag
            self.keep_prob = tf.cond(
                self.training,
                lambda: tf.constant(1 - self.drop_prob_lm),
                lambda: tf.constant(1.0),
                name='keep_prob')

            # basic cell has dropout wrapper
            self.basic_cell = cell = tf.contrib.rnn.DropoutWrapper(
                cell_fn(self.rnn_size), 1.0, self.keep_prob)
            # cell is the final cell of each timestep; build a fresh wrapped
            # cell per layer so the layers do not share variables
            self.cell = tf.contrib.rnn.MultiRNNCell(
                [tf.contrib.rnn.DropoutWrapper(cell_fn(self.rnn_size), 1.0,
                                               self.keep_prob)
                 for _ in range(opt.num_layers)])
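For reference, a sketch of the options object this constructor reads; every value below is an illustrative placeholder, not taken from the source:

from argparse import Namespace

opt = Namespace(vocab_size=9487, input_encoding_size=512, rnn_size=512,
                num_layers=1, drop_prob_lm=0.5, seq_length=16, seq_per_img=5,
                att_hid_size=512, rnn_type='lstm', cnn_model='vgg19',
                cnn_weight='./vgg19.npy', start_from=None)
model = CaptionModel(opt)  # hypothetical enclosing class name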
Example #3
def simple_conv(x, k):
    """A simplified 2D convolution operation"""
    # lift the 2D array to NHWC layout: [1, height, width, 1]
    x = tf.expand_dims(tf.expand_dims(x, 0), -1)
    print(x.shape)
    y = tf.nn.depthwise_conv2d(x, k, [1, 1, 1, 1], padding='SAME')
    return y[0, :, :, 0]
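make_kernel is referenced below but missing from the excerpt; a plausible definition (this code appears to follow TensorFlow's PDE tutorial) is:

import numpy as np
import tensorflow as tf

def make_kernel(a):
    """Transform a 2D array into a depthwise convolution kernel."""
    a = np.asarray(a)
    a = a.reshape(list(a.shape) + [1, 1])  # [height, width, in_ch, multiplier]
    return tf.constant(a, dtype=tf.float32)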


def laplace(x):
    """Compute the 2D laplacian of an array"""
    laplace_k = make_kernel([[0.5, 1.0, 0.5],
                             [1.0, -6.0, 1.0],
                             [0.5, 1.0, 0.5]])
    return simple_conv(x, laplace_k)


# initialize the VGG network from pre-trained, downloaded weights
vgg = vgg.Vgg19()

# load the input images and resize the style image to the content image's size
content_image = np.asarray(PIL.Image.open("./34.jpg"), dtype=float)
img_height = content_image.shape[0]
img_width = content_image.shape[1]
style_image = np.asarray(PIL.Image.open("./33.jpg"))
style_image = tf.image.resize_images(style_image, size=[img_height, img_width])
b = np.zeros(shape=[1, img_height, img_width, 3])
b[0] = content_image
input_var = tf.clip_by_value(tf.Variable(b, trainable=True, dtype=tf.float32),
                             0.0, 255.0)

# now build the pre-trained VGG graph for style transfer
vgg.build(input_var)
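The excerpt stops once the graph is built; a minimal sketch of evaluating it (the session setup and the conv4_2 attribute are assumptions about the tensorflow-vgg interface):

# Hypothetical continuation; not part of the original snippet.
sess = tf.Session()
sess.run(tf.global_variables_initializer())
content_features = sess.run(vgg.conv4_2)  # assumed Vgg19 layer attribute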
Example #4
config = tf.ConfigProto()
GPUID = 0
os.environ["CUDA_VISIBLE_DEVICES"] = str(GPUID)

# Parameters
batch_size = 128
epochs = 80
lr = 1e-3

images = tf.placeholder(tf.float32, [batch_size, 224, 224, 3])
audios = tf.placeholder(tf.float32, [batch_size, 1024, 40])
true_out = tf.placeholder(tf.float32, [batch_size, 1000])
train_mode = tf.placeholder(tf.bool)

'''extract image and audio features'''
# VGG = vgg.Vgg19('./tensorflow-vgg/vgg19.npy')
VGG = vgg.Vgg19()
VGG.build(images, train_mode)
imageFeatures = VGG.conv6
audioFeatures = AudioFeature(audios)

''' design loss function (Rank Loss) '''
# S_p: similarity of matching image/audio pairs
Similarity_matrix = tf.einsum('eabc,ecd->eabd', imageFeatures,
                              tf.transpose(audioFeatures, [0, 2, 1]))
Similarity_p = tf.reduce_mean(Similarity_matrix, [1, 2, 3])
# S_j and S_c: similarities against mismatched (imposter) pairs
Similarity_j = tf.einsum('eabc,ecd->eabd', imageFeatures[::-1],
                         tf.transpose(audioFeatures, [0, 2, 1]))
Similarity_j = tf.reduce_mean(Similarity_j, [1, 2, 3])
Similarity_c = tf.einsum('eabc,ecd->eabd', imageFeatures,
                         tf.transpose(audioFeatures[::-1], [0, 2, 1]))
Similarity_c = tf.reduce_mean(Similarity_c, [1, 2, 3])

# presumed continuation: hinge terms for both imposter similarities
obj_loss = tf.reduce_mean(tf.maximum(Similarity_c - Similarity_p + 1, 0)
                          + tf.maximum(Similarity_j - Similarity_p + 1, 0))
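A hedged sketch of the training step this loss would feed; the optimizer choice is an assumption, only lr comes from the snippet:

# Hypothetical training step; not part of the original excerpt.
train_op = tf.train.AdamOptimizer(lr).minimize(obj_loss)
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    # each step would feed a real batch: {images: ..., audios: ...}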
Example #5
GPUID = 0
os.environ["CUDA_VISIBLE_DEVICES"] = str(GPUID)

# Parameters
batch_size = 64
epochs = 80
lr = 1e-4

images = tf.placeholder(tf.float32, [batch_size, 224, 224, 3])
audios = tf.placeholder(tf.float32, [batch_size, 1024, 40])
true_out = tf.placeholder(tf.float32, [batch_size, 1000])
train_mode = tf.placeholder(tf.bool)

'''extract image and audio features'''
VGG = vgg.Vgg19('./tensorflow-vgg/vgg19.npy')
# VGG = vgg.Vgg19()
VGG.build(images, train_mode)
imageFeatures = VGG.conv6
audioFeatures = AudioFeature(audios)

''' design loss function (Rank Loss) '''
# S_p: similarity of matching image/audio pairs
Similarity_matrix = tf.einsum('eabc,ecd->eabd', imageFeatures,
                              tf.transpose(audioFeatures, [0, 2, 1]))
Similarity_p = tf.reduce_mean(tf.reduce_max(Similarity_matrix, [1, 2]), 1)
# S_j: image imposters
Similarity_j = tf.einsum('eabc,ecd->eabd', imageFeatures[::-1],
                         tf.transpose(audioFeatures, [0, 2, 1]))
Similarity_j = tf.reduce_mean(tf.reduce_max(Similarity_j, [1, 2]), 1)
# S_c: audio imposters
Similarity_c = tf.einsum('eabc,ecd->eabd', imageFeatures,
                         tf.transpose(audioFeatures[::-1], [0, 2, 1]))
Similarity_c = tf.reduce_mean(tf.reduce_max(Similarity_c, [1, 2]), 1)
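The excerpt ends before the loss; presumably it combines these into the same margin-1 rank loss as Example #4. A hedged sketch:

# Hypothetical continuation mirroring Example #4; not in the original excerpt.
obj_loss = tf.reduce_mean(tf.maximum(Similarity_c - Similarity_p + 1, 0)
                          + tf.maximum(Similarity_j - Similarity_p + 1, 0))
train_op = tf.train.AdamOptimizer(lr).minimize(obj_loss)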
Example #6
    def __init__(self, opt):
        self.vocab_size = opt.vocab_size
        self.input_encoding_size = opt.input_encoding_size
        self.rnn_size = opt.rnn_size
        self.num_layers = opt.num_layers
        self.drop_prob_lm = opt.drop_prob_lm
        self.seq_length = opt.seq_length
        self.seq_per_img = opt.seq_per_img
        #self.batch_size = opt.batch_size

        self.opt = opt

        # Variable indicating whether we are in training or evaluation mode
        self.training = tf.Variable(True, trainable=False, name="training")

        # Input variables
        self.images = tf.placeholder(tf.float32, [None, 224, 224, 3],
                                     name="images")
        self.labels = tf.placeholder(tf.int32, [None, self.seq_length + 2],
                                     name="labels")
        self.masks = tf.placeholder(tf.float32, [None, self.seq_length + 2],
                                    name="masks")

        # Build the CNN (VGG-16 or VGG-19)
        if self.opt.start_from is not None:
            cnn_weight = None
        else:
            cnn_weight = self.opt.cnn_weight
        if self.opt.cnn_model == 'vgg16':
            self.cnn = vgg.Vgg16(cnn_weight)
        if self.opt.cnn_model == 'vgg19':
            self.cnn = vgg.Vgg19(cnn_weight)

        with tf.variable_scope("cnn"):
            self.cnn.build(self.images)
        self.fc7 = self.cnn.drop7
        self.cnn_training = self.cnn.training
        """
        # Old model loading
        with open(self.opt.cnn_model) as f:
            fileContent = f.read()
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(fileContent)
            tf.import_graph_def(graph_def, input_map={"images": self.images}, name='vgg16')
            self.vgg16 = tf.get_default_graph()

        self.fc7 = self.vgg16.get_tensor_by_name("vgg16/Relu_1:0")
        """

        # Variables in the language model
        with tf.variable_scope("rnnlm"):
            # Word Embedding table
            #with tf.device("/cpu:0"):
            self.Wemb = tf.Variable(tf.random_uniform(
                [self.vocab_size + 1, self.input_encoding_size], -0.1, 0.1),
                                    name='Wemb')

            # Output word projection
            self.embed_word_W = tf.Variable(tf.random_uniform(
                [self.rnn_size, self.vocab_size + 1], -0.1, 0.1),
                                            name='embed_word_W')
            self.embed_word_b = self.init_bias(self.vocab_size + 1,
                                               name='embed_word_b')

            # RNN cell
            if opt.rnn_type == 'rnn':
                self.cell_fn = cell_fn = tf.nn.rnn_cell.BasicRNNCell
            elif opt.rnn_type == 'gru':
                self.cell_fn = cell_fn = tf.nn.rnn_cell.GRUCell
            elif opt.rnn_type == 'lstm':
                self.cell_fn = cell_fn = tf.nn.rnn_cell.LSTMCell
            else:
                raise Exception("RNN type not supported: {}".format(
                    opt.rnn_type))

            self.keep_prob = tf.cond(
                self.training,
                lambda: tf.constant(1 - self.drop_prob_lm),
                lambda: tf.constant(1.0),
                name='keep_prob')

            self.basic_cell = cell = tf.nn.rnn_cell.DropoutWrapper(
                cell_fn(self.rnn_size, state_is_tuple=True), 1.0,
                self.keep_prob)

            # build a fresh wrapped cell per layer so the layers do not
            # share variables
            self.cell = tf.nn.rnn_cell.MultiRNNCell(
                [tf.nn.rnn_cell.DropoutWrapper(
                    cell_fn(self.rnn_size, state_is_tuple=True), 1.0,
                    self.keep_prob) for _ in range(opt.num_layers)],
                state_is_tuple=True)
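The excerpt ends before the cell stack is used; one common way such a stack is driven (a standalone sketch, every name and shape below assumed):

# Hypothetical sketch; not from the original class.
inputs = tf.placeholder(tf.float32, [None, 16, 512])  # [batch, time, dim]
cell = tf.nn.rnn_cell.MultiRNNCell(
    [tf.nn.rnn_cell.LSTMCell(512) for _ in range(2)], state_is_tuple=True)
initial_state = cell.zero_state(tf.shape(inputs)[0], tf.float32)
outputs, final_state = tf.nn.dynamic_rnn(cell, inputs,
                                         initial_state=initial_state)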
Example #7
    def __init__(self, opt):
        self.opt = opt
        self.vocab_size = opt.vocab_size
        self.input_encoding_size = opt.input_encoding_size
        self.drop_prob_lm = opt.drop_prob_lm
        self.seq_length = opt.seq_length
        self.rnn_size = opt.rnn_size
        self.seq_per_img = opt.seq_per_img
        self.batch_size = opt.batch_size
        self.att_size = opt.att_size
        self.num_boxes = opt.num_boxes

        # Variable indicating whether we are in training or evaluation mode
        self.training = tf.Variable(True, trainable=False, name="training")
        self.num_layers = 1
        self.cap_iter = opt.cap_iter

        self.b = tf.placeholder(tf.float32, [None, self.num_boxes, 1, 1])
        if self.opt.cnn_model == "frcnn":
            print("using frcnn feature")
            self.cnn_dim = 2048
            self.images = tf.placeholder(tf.float32,
                                         [None, self.num_boxes, self.cnn_dim],
                                         name="features")
            self.context = self.images
        elif self.opt.cnn_model == 'vgg16' or self.opt.cnn_model == 'vgg19':
            print("using cnn model")

            self.images = tf.placeholder(tf.float32, [None, 224, 224, 3],
                                         name="images")
            cnn_weight = vars(self.opt).get('cnn_weight', None)
            if self.opt.cnn_model == 'vgg16':
                import vgg
                self.cnn = vgg.Vgg16(cnn_weight)
                self.cnn_dim = 512
            elif self.opt.cnn_model == 'vgg19':
                import vgg
                self.cnn = vgg.Vgg19(cnn_weight)
                self.cnn_dim = 512
            with tf.variable_scope("cnn"):
                self.cnn.build(self.images)

            if self.opt.cnn_model == 'vgg16':
                self.context = self.cnn.conv5_3
            elif self.opt.cnn_model == 'vgg19':
                self.context = self.cnn.conv5_4
            self.context = tf.reshape(self.context,
                                      [-1, self.num_boxes, self.cnn_dim])
        elif self.opt.cnn_model == 'resnet':
            print("using resnet feature")
            self.cnn_dim = 2048
            self.images = tf.placeholder(tf.float32,
                                         [None, self.num_boxes, self.cnn_dim],
                                         name="features")
            self.context = self.images

        self.labels = tf.placeholder(tf.int32, [None, self.seq_length + 2],
                                     name="labels")

        self.masks = tf.placeholder(tf.float32, [None, self.seq_length + 2],
                                    name="masks")

        with tf.variable_scope("rnnlm"):
            # L2-normalize the region features
            self.features = tf.nn.l2_normalize(self.context, axis=-1)

            #self.att_feat = slim.fully_connected(self.features, self.att_size,activation_fn=None, scope='att_feature_proj')

            self.avgFeat = tf.reduce_mean(self.features,
                                          axis=1,
                                          keep_dims=False)
            # Word Embedding table
            self.Wemb = tf.Variable(tf.random_uniform(
                [self.vocab_size, self.input_encoding_size], -0.1, 0.1),
                                    name='Wemb')
            # RNN cell

            if opt.rnn_type == 'rnn':
                self.cell_fn = cell_fn = tf.contrib.rnn.BasicRNNCell
            elif opt.rnn_type == 'gru':
                self.cell_fn = cell_fn = tf.contrib.rnn.GRUCell
            elif opt.rnn_type == 'lstm':
                self.cell_fn = cell_fn = tf.contrib.rnn.LSTMCell
            else:
                raise Exception("RNN type not supported: {}".format(
                    opt.rnn_type))

            self.keep_prob = tf.cond(
                self.training,
                lambda: tf.constant(1 - self.drop_prob_lm),
                lambda: tf.constant(1.0),
                name='keep_prob')

            # basic cell has dropout wrapper
            self.basic_cell1 = cell1 = tf.contrib.rnn.DropoutWrapper(
                cell_fn(self.rnn_size), 1.0, self.keep_prob)
            self.basic_cell2 = cell2 = tf.contrib.rnn.DropoutWrapper(
                cell_fn(self.rnn_size), 1.0, self.keep_prob)
            # cell is the final cell of each timestep
            self.cell1 = tf.contrib.rnn.MultiRNNCell([cell1] * self.num_layers)
            self.cell2 = tf.contrib.rnn.MultiRNNCell([cell2] * self.num_layers)
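A hedged sketch of how two such cells are typically stepped in an attention captioner (standalone; all shapes and names are assumptions):

# Hypothetical decode step with two cells; not from the original class.
cell1 = tf.contrib.rnn.LSTMCell(512)
cell2 = tf.contrib.rnn.LSTMCell(512)
xt = tf.placeholder(tf.float32, [None, 512])    # embedded previous word
feat = tf.placeholder(tf.float32, [None, 512])  # mean-pooled image feature
state1 = cell1.zero_state(tf.shape(xt)[0], tf.float32)
state2 = cell2.zero_state(tf.shape(xt)[0], tf.float32)
out1, state1 = cell1(tf.concat([xt, feat], 1), state1)
# an attention read over the region features would be computed from out1 here
out2, state2 = cell2(out1, state2)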
Example #8
    default=0.5)
args = parser.parse_args()

# prepare input images
content_image = load_image(args.content_image_path,
                           scale=float(args.content_scale))
WIDTH, HEIGHT = content_image.shape[1], content_image.shape[0]
content_image = content_image.reshape((1, HEIGHT, WIDTH, 3))
style_image = load_image(args.style_image_path, (WIDTH, HEIGHT))
style_image = style_image.reshape((1, HEIGHT, WIDTH, 3))

# prepare networks
images = np.concatenate((content_image, style_image), 0).astype(np.float32)
constants = tf.constant(images)
with tf.name_scope("constant"):
    vgg_const = vgg.Vgg19()
    vgg_const.build(constants)

# use noise as an initial image
#input_image = tf.Variable(tf.truncated_normal([1, HEIGHT, WIDTH, 3], 0.5, 0.1))
# use content image as an initial image
input_image = tf.Variable(np.expand_dims(images[0, :, :, :], 0))
with tf.name_scope("variable"):
    vgg_var = vgg.Vgg19()
    vgg_var.build(input_image)

# which layers do we want to use?
style_layers_const = [
    vgg_const.conv1_1, vgg_const.conv2_1, vgg_const.conv3_1, vgg_const.conv4_1,
    vgg_const.conv5_1
]
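The excerpt ends at the layer list; in Gatys-style transfer these activations usually feed a Gram-matrix style loss. A sketch (the helper and the single-layer loss below are assumptions, not from this snippet):

# Hypothetical style-loss helper; not part of the original excerpt.
def gram_matrix(features):
    shape = tf.shape(features)
    flat = tf.reshape(features, [shape[0], shape[1] * shape[2], shape[3]])
    return tf.matmul(flat, flat, transpose_a=True)  # [batch, C, C]

g_const = gram_matrix(vgg_const.conv1_1[1:2])  # row 1 of the batch is the style image
g_var = gram_matrix(vgg_var.conv1_1)
style_loss_1 = tf.reduce_mean(tf.square(g_var - g_const))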
Example #9
# prepare input images
content_image = load_image(args.content_image_path,
                           scale=float(args.content_scale))
WIDTH, HEIGHT = content_image.shape[1], content_image.shape[0]
content_image = content_image.reshape((1, HEIGHT, WIDTH, 3))
style_image = load_image(args.style_image_path, (WIDTH, HEIGHT))
style_image = style_image.reshape((1, HEIGHT, WIDTH, 3))

print("step0")
# prepare networks
images = np.concatenate((content_image, style_image), 0).astype(np.float32)
constants = tf.constant(images)
print(constants)
with tf.name_scope("constant"):
    print("step0.1")
    vgg_const = vgg.Vgg19("./vgg19.npy")
    print("step0.2")
    vgg_const.build(constants)

print("step1")

# use noise as an initial image
#input_image = tf.Variable(tf.truncated_normal([1, HEIGHT, WIDTH, 3], 0.5, 0.1))
# use content image as an initial image
input_image = tf.Variable(np.expand_dims(images[0, :, :, :], 0))
with tf.name_scope("variable"):
    vgg_var = vgg.Vgg19("./vgg19.npy")
    vgg_var.build(input_image)

print("step2")
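This excerpt also stops after graph construction; a minimal continuation in the same debug-print style (session handling assumed):

# Hypothetical continuation; not part of the original excerpt.
sess = tf.Session()
sess.run(tf.global_variables_initializer())
print("step3: input image shape", sess.run(tf.shape(input_image)))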