Example #1
    def __getitem__(self, idx):
        """
        returns:
            dict (Tensors): contains 'images', 'given_segmentations', 'labels'
        """

        query = self._query_seqs[idx]

        seqname = query.split(' ', 3)[0]

        # The sequence must begin with a nonempty frame; all objects in that frame are tracked.
        # A starting frame is valid if it is followed by at least seqlen-1 frames with corresponding images.
        frame_ids = self.get_frame_ids(seqname)
        viable_starting_frame_ids = [
            idx for idx in self.get_nonempty_frame_ids(seqname)
            if idx <= frame_ids[-self._seqlen]
        ]

        frame_ids = self._select_frame_ids(frame_ids,
                                           viable_starting_frame_ids)

        images = torch.stack([
            self._image_read(self._full_image_path(seqname, idx))
            for idx in frame_ids
        ])
        segannos = torch.stack([
            self._anno_read(self._full_anno_path(seqname, idx))
            for idx in frame_ids
        ])

        if self._joint_transform is not None:
            images, segannos = self._joint_transform(images, segannos)

        # try:
        #     segannos = self._select_object_ids(segannos)
        # except:
        #     print(seqname)
        #     print("frame ids                ", self.get_frame_ids(seqname))
        #     print("frame ids post filtering ", frame_ids)
        #     print("viable starting frame ids", viable_starting_frame_ids)
        #     print("visible objects", self._visible_objects[seqname])
        #     raise

        object_name = int(query.split(' ', 3)[1])
        segannos_one = To_onehot(segannos, object_name)

        segannos_all = To_allhot(segannos)

        sentence = query.split(' ', 3)[3]
        txt = np.array(
            text_processing.preprocess_sentence(sentence, self.vocab_dict, 20))
        # txt = np.tile(txt, self._seqlen).reshape(self._seqlen, -1)

        return {
            'images': images,
            'segannos': segannos_one,
            'segannos_all': segannos_all,
            'sentence': txt
        }
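To_onehot and To_allhot are not defined in these snippets. The block below is a minimal sketch of what such helpers might look like, assuming segannos is an integer tensor of per-pixel object ids with 0 as background; the shapes and the max_objects default are assumptions, not the original implementation.

import torch


def To_onehot(segannos, object_id):
    # Binary mask for a single object: 1 where the annotation equals
    # object_id, 0 elsewhere (assumed semantics).
    return (segannos == object_id).long()


def To_allhot(segannos, max_objects=10):
    # One binary channel per object id 1..max_objects, stacked along a new
    # channel dimension (assumed semantics).
    masks = [(segannos == obj_id).long() for obj_id in range(1, max_objects + 1)]
    return torch.stack(masks, dim=1)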
Example #2
    def __getitem__(self, index):
        sent = self.qdb[index][0]
        sentence = text_p.preprocess_sentence(sent, self.vocab_file,
                                              self.word_count)
        target = self.qdb[index][1]

        sentence = np.array(sentence).astype(np.int64)
        target = np.array(target).astype(np.float32)

        return sentence, target
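preprocess_sentence (text_processing.preprocess_sentence above, text_p.preprocess_sentence here) is used throughout these examples but never shown. The sketch below illustrates the usual recipe under stated assumptions: tokenize, map tokens to vocabulary indices with an <unk> fallback, and pad or truncate to a fixed length. The <pad>/<unk> tokens and the tokenizer are assumptions; the real implementation may differ.

import re


def preprocess_sentence(sentence, vocab_dict, fixed_len):
    # Lowercase and split into word-like tokens (assumed tokenizer).
    words = re.findall(r"[\w']+", sentence.lower())
    # Map each word to its vocabulary index, falling back to an <unk> id.
    unk_id = vocab_dict.get('<unk>', 0)
    indices = [vocab_dict.get(w, unk_id) for w in words]
    # Truncate or right-pad to exactly fixed_len entries.
    indices = indices[:fixed_len]
    indices += [vocab_dict.get('<pad>', 0)] * (fixed_len - len(indices))
    return indices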
Example #3
    def __getitem__(self, idx):
        """
        returns:
            dict (Tensors): contains 'images', 'given_segmentations', 'labels'
        """

        query = self._query_seqs[idx]

        #        assert self._version == '2017', "Only the 2017 version is supported for training as of now"
        seqname = query.split(' ', 2)[0]

        # The sequence must begin with a nonempty frame; all objects in that frame are tracked.
        # A starting frame is valid if it is followed by at least seqlen-1 frames with corresponding images.
        frame_ids = self.get_frame_ids(seqname)
        viable_starting_frame_ids = [
            idx for idx in self.get_nonempty_frame_ids(seqname)
            if idx <= frame_ids[-self._seqlen]
        ]

        frame_ids = self._select_frame_ids(frame_ids,
                                           viable_starting_frame_ids)

        images = torch.stack([
            self._image_read(self._full_image_path(seqname, idx))
            for idx in frame_ids
        ])
        segannos = torch.stack([
            self._anno_read(self._full_anno_path(seqname, idx))
            for idx in frame_ids
        ])

        if self._joint_transform is not None:
            images, segannos = self._joint_transform(images, segannos)

        object_name = int(query.split(' ', 2)[1])
        segannos_one = To_onehot(segannos, object_name)

        segannos_all = To_allhot(segannos)

        sentence = query.split(' ', 2)[2].split('"')[1]
        txt = np.array(
            text_processing.preprocess_sentence(sentence, self.vocab_dict, 20))
        # txt = np.tile(txt, self._seqlen).reshape(self._seqlen, -1)

        # return {'images': images, 'provides_seganno': provides_seganno, 'given_seganno': given_seganno,
        #         'segannos': segannos, 'sentence': sentence}

        return {
            'images': images,
            'segannos': segannos_one,
            'segannos_all': segannos_all,
            'sentence': txt
        }
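_select_frame_ids, which implements the frame selection described in the comments above, is also not shown. Below is a minimal sketch under the assumption that it picks a random valid starting frame and then takes seqlen consecutive frames; the original is a method that reads self._seqlen, so seqlen is passed explicitly here.

import random


def select_frame_ids(frame_ids, viable_starting_frame_ids, seqlen):
    # Choose a random valid starting frame, then return seqlen consecutive
    # frame ids beginning at that frame (assumed behaviour).
    start = random.choice(viable_starting_frame_ids)
    start_pos = frame_ids.index(start)
    return frame_ids[start_pos:start_pos + seqlen]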
Example #4
    def get_video_generator(self):

        for query in self._query_seqs:
            seqname = query.split(' ', 2)[0]
            object_name = int(query.split(' ', 2)[1])

            sentence = query.split(' ', 2)[2].split('"')[1]
            txt = np.array(
                text_processing.preprocess_sentence(sentence, self.vocab_dict,
                                                    20))

            if seqname in self._all_seqs:
                yield (seqname, self._get_video(seqname, object_name, txt))
Example #5
    def get_video_generator(self, low=0, high=2**31):
        """Returns a video generator. The video generator is used to obtain parts of a sequence. Some assumptions are made, depending on whether the train or valid splits are used. For the train split, the first annotated frame is given. No other annotation is used. For the validation split, each annotation found is given.
        """
        for query in self._query_seqs:
            seqname = query.split(' ', 2)[0]
            object_name = int(query.split(' ', 2)[1])

            sentence = query.split(' ', 2)[2].split('"')[1]
            txt = np.array(
                text_processing.preprocess_sentence(sentence, self.vocab_dict,
                                                    20))

            if seqname in self._all_seqs:
                yield (seqname, self._get_video(seqname, object_name, txt))
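A short consumption sketch for the generator above, assuming each yielded video object is iterable over per-chunk dicts carrying 'images' and 'sentence'; since _get_video is not shown, that chunk structure is an assumption.

# dataset is an instance of the class that defines get_video_generator().
for seqname, video in dataset.get_video_generator():
    for chunk in video:
        images = chunk['images']      # assumed [T, 3, H, W] tensor
        sentence = chunk['sentence']  # assumed array of vocabulary indices
        # run the segmentation/tracking model on (images, sentence) here
        print(seqname, images.shape, sentence.shape)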
Example #6
    def __getitem__(self, index):
        sent = self.qdb[index]
        sentence = text_p.preprocess_sentence(sent, self.vocab_file,
                                              self.word_count)
        sentence = np.array(sentence).astype(np.int64)
        return sentence
Example #7
def data_preparation(**kwargs):
    dataset = kwargs['dataset']
    data_base_dir = kwargs['data_base_dir']
    text_len = kwargs['text_len']

    if dataset == 'both':
        dataset_types = ['train', 'val']
    else:
        dataset_types = [dataset]

    caption_data_base_dir = os.path.join(data_base_dir, 'captions')
    image_data_base_dir = os.path.join(data_base_dir, 'images')

    categories = os.listdir(caption_data_base_dir)
    categories.sort()

    vocab_file = os.path.join(data_base_dir, 'vocab.txt')
    vocab_dict = load_vocab_dict_from_file(vocab_file)

    for dataset_type in dataset_types:
        data_save_split_base = os.path.join(data_base_dir, 'tfrecord',
                                            dataset_type)
        os.makedirs(data_save_split_base, exist_ok=True)

        for category_id, category_name in enumerate(categories):
            record_filename = os.path.join(data_save_split_base,
                                           category_name + '.tfrecord')

            with tf.python_io.TFRecordWriter(
                    record_filename) as tfrecord_writer:
                json_file_path = os.path.join(caption_data_base_dir,
                                              category_name,
                                              dataset_type + '.json')
                with open(json_file_path, "r") as fp:
                    json_data = json.loads(fp.read())
                nImgs = len(json_data)
                print(dataset_type, category_name, nImgs)

                for j in range(nImgs):
                    image_name = json_data[j]['key']

                    cartoon_path = os.path.join(image_data_base_dir,
                                                category_name, 'cartoon',
                                                image_name)
                    sketch_path = os.path.join(image_data_base_dir,
                                               category_name, 'edgemap',
                                               image_name)

                    cartoon_image = Image.open(cartoon_path)
                    cartoon_image = cartoon_image.convert("RGB")
                    cartoon_image = np.array(
                        cartoon_image, dtype=np.uint8)  # shape = [H, W, 3]
                    cartoon_image_raw = cartoon_image.tobytes()

                    sketch_image = Image.open(sketch_path)
                    sketch_image = sketch_image.convert("RGB")
                    sketch_image = np.array(
                        sketch_image, dtype=np.uint8)  # shape = [H, W, 3]
                    sketch_image_raw = sketch_image.tobytes()

                    color_text = json_data[j]['color_text']
                    vocab_indices = preprocess_sentence(
                        color_text, vocab_dict, text_len)  # list of length text_len
                    vocab_indices_raw = np.array(
                        vocab_indices, dtype=np.uint8).tobytes()
                    # print(color_text)
                    # vocab_indices_display = [item + 1 for item in vocab_indices]
                    # print(vocab_indices_display)

                    example = _to_tfexample_raw(
                        image_name.encode(), cartoon_image_raw,
                        sketch_image_raw, category_name.encode(), category_id,
                        color_text.encode(), vocab_indices_raw)
                    tfrecord_writer.write(example.SerializeToString())
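_to_tfexample_raw is referenced above but not shown. The following is a minimal sketch of how such an example could be assembled with the TensorFlow 1.x tf.train.Example API; the feature keys are assumptions, not the original schema.

import tensorflow as tf


def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _int64_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


def _to_tfexample_raw(image_name, cartoon_image_raw, sketch_image_raw,
                      category_name, category_id, color_text, vocab_indices_raw):
    # Pack all fields written by data_preparation into one tf.train.Example.
    return tf.train.Example(features=tf.train.Features(feature={
        'image_name': _bytes_feature(image_name),
        'cartoon_image_raw': _bytes_feature(cartoon_image_raw),
        'sketch_image_raw': _bytes_feature(sketch_image_raw),
        'category_name': _bytes_feature(category_name),
        'category_id': _int64_feature(category_id),
        'color_text': _bytes_feature(color_text),
        'vocab_indices_raw': _bytes_feature(vocab_indices_raw),
    }))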
Example #8
def inference(img_name, instruction):
    wild_data_base_dir = 'examples'
    wild_text = instruction

    wild_cate = img_name[:img_name.find('.png')]

    SIZE = {True: (64, 64), False: (192, 192)}
    T = 15  # maximum text length (in tokens)
    vocab_file = 'data/vocab.txt'

    captions_base_dir = os.path.join('data', 'captions')
    categories = os.listdir(captions_base_dir)
    categories.sort()

    if wild_cate not in categories:
        wild_cate = categories[2]

    # Roll out the parameters
    batch_size = 1
    ckpt_dir = Config.ckpt_dir
    results_dir = Config.results_dir
    data_format = Config.data_format
    distance_map = Config.distance_map
    small_img = Config.small_img
    LSTM_hybrid = Config.LSTM_hybrid
    block_type = Config.block_type
    vocab_size = Config.vocab_size

    distance_map = distance_map != 0
    small = small_img != 0
    LSTM_hybrid = LSTM_hybrid != 0

    img_dim = SIZE[small]

    output_folder = results_dir
    print('output_folder:', output_folder)
    os.makedirs(output_folder, exist_ok=True)

    vocab_dict = load_vocab_dict_from_file(vocab_file)

    input_images = tf.placeholder(tf.float32,
                                  shape=[1, 3, img_dim[0],
                                         img_dim[1]])  # [1, 3, H, W]
    class_ids = tf.placeholder(tf.int32, shape=(1, ))  # (1, )
    text_vocab_indiceses = tf.placeholder(tf.int32, shape=[1, 15])  # [1, 15]

    ret_list = build_single_graph(
        input_images,
        input_images,
        None,
        class_ids,
        None,
        text_vocab_indiceses,
        batch_size=batch_size,
        training=False,
        LSTM_hybrid=LSTM_hybrid,
        vocab_size=vocab_size,
        data_format=data_format,
        distance_map=distance_map,
        block_type=block_type)  # [image_gens, images, sketches]

    snapshot_loader = tf.train.Saver()

    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        print('Restore trained model:', tf.train.latest_checkpoint(ckpt_dir))
        snapshot_loader.restore(sess, tf.train.latest_checkpoint(ckpt_dir))

        input_name = img_name
        input_category = wild_cate  # e.g. 'bus'
        input_text = wild_text  # e.g. 'A yellow bus with blue window'

        sketch_path = os.path.join(wild_data_base_dir, input_name)
        sketch_image = Image.open(sketch_path)
        sketch_image = sketch_image.convert("RGB")
        # Resize
        if (sketch_image.width, sketch_image.height) != (img_dim[0], img_dim[1]):
            margin_size = 0 if input_category in ['road'] else 10
            sketch_image = resize_and_padding_mask_image(
                sketch_image, img_dim[0],
                margin_size=margin_size).astype(np.float32)
        else:
            sketch_image = np.array(sketch_image,
                                    dtype=np.float32)  # shape = [H, W, 3]

        # Normalization
        sketch_image = sketch_image / 255.
        sketch_image = sketch_image * 2. - 1

        sketch_image = np.expand_dims(sketch_image,
                                      axis=0)  # shape = [1, H, W, 3]
        sketch_image = np.transpose(sketch_image,
                                    [0, 3, 1, 2])  # shape = [1, 3, H, W]

        class_id = categories.index(input_category)
        class_id = np.array([class_id])

        vocab_indices = preprocess_sentence(input_text, vocab_dict, T)  # list
        vocab_indices = np.array(vocab_indices, dtype=np.int32)
        vocab_indices = np.expand_dims(vocab_indices,
                                       axis=0)  # shape = [1, 15]

        try:
            # print('class_id', class_id)
            # print('vocab_indices', vocab_indices)
            generated_img, _, input_sketch = sess.run(
                [ret_list[0], ret_list[1], ret_list[2]],
                feed_dict={
                    input_images: sketch_image,
                    class_ids: class_id,
                    text_vocab_indiceses: vocab_indices
                })
        except Exception as e:
            # Re-raise: generated_img and input_sketch would be undefined below.
            print(e.args)
            raise

        if data_format == 'NCHW':
            generated_img = np.transpose(generated_img, (0, 2, 3, 1))
            input_sketch = np.transpose(input_sketch, (0, 2, 3, 1))

        # log('before, generated_img', generated_img)
        # log('before, input_sketch', input_sketch)
        # Map outputs from [-1, 1] back to [0, 255].
        generated_img = ((generated_img + 1) / 2.) * 255
        input_sketch = ((input_sketch + 1) / 2.) * 255
        # Reverse the channel order (RGB -> BGR) for cv2.imwrite.
        generated_img = generated_img[:, :, :, ::-1].astype(np.uint8)
        input_sketch = input_sketch.astype(np.uint8)
        # log('after, generated_img', generated_img)
        # log('after, input_sketch', input_sketch)

        img_out_filename = input_name[:-4] + '_output.png'
        sketch_in_filename = input_name[:-4] + '_input.png'

        # Save file
        cv2.imwrite(os.path.join(output_folder, img_out_filename),
                    generated_img[0])
        cv2.imwrite(os.path.join(output_folder, sketch_in_filename),
                    input_sketch[0])

        print('Saved file %s' % img_out_filename)
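A hedged usage example, assuming an edge map named bus.png exists under examples/ and Config points at a trained checkpoint; the category ('bus') is taken from the file name.

# Generate a colorized image from the example edge map and a text instruction.
inference('bus.png', 'A yellow bus with blue window')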