Example #1
    def __init__(self, version: str='2.0', num_parallel_reads: Optional[int]=None, force_rebuild=False, nohashcheck=False) -> None:

        self.num_parallel_reads = num_parallel_reads
        self.num_val_examples = None
        self.num_train_examples = None

        if version == '2.0':
            self.training_data_json_key = maybe_download_and_store_single_file(
                url='https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json', key='squad/train_json')
            self.dev_data_json_key = maybe_download_and_store_single_file(
                url='https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json', key='squad/dev_json')

            # Load the JSON from the files
            with open(DATA_STORE[self.training_data_json_key], 'r') as train_json:
                self.train_json = json.loads(train_json.read())
            with open(DATA_STORE[self.dev_data_json_key], 'r') as dev_json:
                self.dev_json = json.loads(dev_json.read())

            # Setup some baked constants in the dataset
            self.mwl = 766
            self.mcl = 37
            self.mql = 30

            # Parse the JSON
            if not force_rebuild and DATA_STORE.is_valid('squad/dictionary', nohashcheck=nohashcheck):
                with open(DATA_STORE['squad/dictionary'], 'rb') as pkl_file:
                    self.dictionary = pickle.load(pkl_file)
            else:
                self.dictionary = NLPDictionary()

            # Build the training set if necessary
            self.num_train_examples = self.build_dataset(train=True, force_rebuild=force_rebuild, nohashcheck=nohashcheck)
            if self.num_train_examples is None or self.num_train_examples == 0:
                self.num_train_examples = sum(1 for _ in tf.python_io.tf_record_iterator(DATA_STORE['squad/tfrecord/train']))

            # Build the validation set if necessary
            self.num_val_examples = self.build_dataset(train=False, force_rebuild=force_rebuild, nohashcheck=nohashcheck)
            if self.num_val_examples is None or self.num_val_examples == 0:
                self.num_val_examples = sum(1 for _ in tf.python_io.tf_record_iterator(DATA_STORE['squad/tfrecord/dev']))

            self.train_fpath = DATA_STORE['squad/tfrecord/train']
            self.dev_fpath = DATA_STORE['squad/tfrecord/dev']

            # Save the dictionary
            with open(DATA_STORE.create_key('squad/dictionary', 'dict.pkl', force=True), 'wb') as pkl_file:
                pickle.dump(self.dictionary, pkl_file)
                DATA_STORE.update_hash('squad/dictionary')

            self.word_vocab_size = len(self.dictionary.word_dictionary)
            self.char_vocab_size = len(self.dictionary.char_dictionary)

            self._val_db = None
            self._train_db = None
        else:
            raise NotImplementedError(
                "Only version 2.0 is currently supported")
Example #2
    def __init__(self,
                 num_parallel_reads=1,
                 sample_only=True,
                 force_rebuild=True,
                 nohashcheck=True,
                 subset="en",
                 wml=8,
                 cml=10):
        self.task_root = "tasks_1-20_v1-2"
        self.subset = subset
        url = "http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz"
        self.keys = maybe_download_and_store_tar(url=url,
                                                 root_key=self.task_root)
        self.nlp_dict = NLPDictionary()
        self.wml = wml
        self.cml = cml
        self.num_parallel_reads = num_parallel_reads

        dict_name = self.task_root + "/dictionary"
        self.train_record_root = 'tasks_1-20_v1-2/tfrecord/train'
        self.val_record_root = 'tasks_1-20_v1-2/tfrecord/dev'

        if not force_rebuild and DATA_STORE.is_valid(dict_name,
                                                     nohashcheck=nohashcheck):
            with open(DATA_STORE[dict_name], 'rb') as pkl_file:
                self.dictionary = pickle.load(pkl_file)
        else:
            self.dictionary = NLPDictionary()

        # Build the training set if necessary
        self.sample_num_train_examples = self.build_dataset(
            train=True,
            sample=True,
            force_rebuild=force_rebuild,
            nohashcheck=nohashcheck)
        # if self.num_train_examples is None:
        #     self.num_train_examples = sum(1 for _ in tf.python_io.tf_record_iterator(DATA_STORE[self.train_record_root]))

        # Build the validation set if necessary
        self.sample_num_val_examples = self.build_dataset(
            train=False,
            sample=True,
            force_rebuild=force_rebuild,
            nohashcheck=nohashcheck)
        # if self.num_val_examples is None:
        #     self.num_val_examples = sum(1 for _ in tf.python_io.tf_record_iterator(DATA_STORE[self.val_record_root]))
        # Build the training set if necessary
        self.num_train_examples = self.build_dataset(
            train=True,
            sample=False,
            force_rebuild=force_rebuild,
            nohashcheck=nohashcheck)
        # if self.num_train_examples is None:
        #     self.num_train_examples = sum(1 for _ in tf.python_io.tf_record_iterator(DATA_STORE[self.train_record_root]))

        # Build the validation set if necessary
        self.num_val_examples = self.build_dataset(train=False,
                                                   sample=False,
                                                   force_rebuild=force_rebuild,
                                                   nohashcheck=nohashcheck)
        # if self.num_val_examples is None:
        #     self.num_val_examples = sum(1 for _ in tf.python_io.tf_record_iterator(DATA_STORE[self.val_record_root]))
        #  = DATA_STORE[self.train_record_root]
        # self.dev_fpath = DATA_STORE[self.val_record_root]

        # Save the dictionary
        with open(DATA_STORE.create_key(dict_name, 'dict.pkl', force=True),
                  'wb') as pkl_file:
            pickle.dump(self.dictionary, pkl_file)
            DATA_STORE.update_hash(dict_name)

        self.word_vocab_size = len(self.dictionary.word_dictionary)
        self.char_vocab_size = len(self.dictionary.char_dictionary)

        self._sample_val_db = None
        self._sample_train_db = None
        self._train_db = None
        self._val_db = None

        #TODO: Add Shuffle Dataset if necessary.

        print("Build Complete.")
Example #3
class bAbI_20:
    task_list = [
        'qa1_single-supporting-fact',
        'qa2_two-supporting-facts',
        'qa3_three-supporting-facts',
        'qa4_two-arg-relations',
        'qa5_three-arg-relations',
        'qa6_yes-no-questions',
        'qa7_counting',
        'qa8_lists-sets',
        'qa9_simple-negation',
        'qa10_indefinite-knowledge',
        'qa11_basic-coreference',
        'qa12_conjunction',
        'qa13_compound-coreference',
        'qa14_time-reasoning',
        'qa15_basic-deduction',
        'qa16_basic-induction',
        'qa17_positional-reasoning',
        'qa18_size-reasoning',
        'qa19_path-finding',
        'qa20_agents-motivations',
    ]

    def __init__(self,
                 num_parallel_reads=1,
                 sample_only=True,
                 force_rebuild=True,
                 nohashcheck=True,
                 subset="en",
                 wml=8,
                 cml=10):
        self.task_root = "tasks_1-20_v1-2"
        self.subset = subset
        url = "http://www.thespermwhale.com/jaseweston/babi/tasks_1-20_v1-2.tar.gz"
        self.keys = maybe_download_and_store_tar(url=url,
                                                 root_key=self.task_root)
        self.nlp_dict = NLPDictionary()
        self.wml = wml
        self.cml = cml
        self.num_parallel_reads = num_parallel_reads

        dict_name = self.task_root + "/dictionary"
        self.train_record_root = 'tasks_1-20_v1-2/tfrecord/train'
        self.val_record_root = 'tasks_1-20_v1-2/tfrecord/dev'

        if not force_rebuild and DATA_STORE.is_valid(dict_name,
                                                     nohashcheck=nohashcheck):
            with open(DATA_STORE[dict_name], 'rb') as pkl_file:
                self.dictionary = pickle.load(pkl_file)
        else:
            self.dictionary = NLPDictionary()

        # Build the training set if necessary
        self.sample_num_train_examples = self.build_dataset(
            train=True,
            sample=True,
            force_rebuild=force_rebuild,
            nohashcheck=nohashcheck)
        # if self.num_train_examples is None:
        #     self.num_train_examples = sum(1 for _ in tf.python_io.tf_record_iterator(DATA_STORE[self.train_record_root]))

        # Build the validation set if necessary
        self.sample_num_val_examples = self.build_dataset(
            train=False,
            sample=True,
            force_rebuild=force_rebuild,
            nohashcheck=nohashcheck)
        # if self.num_val_examples is None:
        #     self.num_val_examples = sum(1 for _ in tf.python_io.tf_record_iterator(DATA_STORE[self.val_record_root]))
        # Build the training set if necessary
        self.num_train_examples = self.build_dataset(
            train=True,
            sample=False,
            force_rebuild=force_rebuild,
            nohashcheck=nohashcheck)
        # if self.num_train_examples is None:
        #     self.num_train_examples = sum(1 for _ in tf.python_io.tf_record_iterator(DATA_STORE[self.train_record_root]))

        # Build the validation set if necessary
        self.num_val_examples = self.build_dataset(train=False,
                                                   sample=False,
                                                   force_rebuild=force_rebuild,
                                                   nohashcheck=nohashcheck)
        # if self.num_val_examples is None:
        #     self.num_val_examples = sum(1 for _ in tf.python_io.tf_record_iterator(DATA_STORE[self.val_record_root]))
        #  = DATA_STORE[self.train_record_root]
        # self.dev_fpath = DATA_STORE[self.val_record_root]

        # Save the dictionary
        with open(DATA_STORE.create_key(dict_name, 'dict.pkl', force=True),
                  'wb') as pkl_file:
            pickle.dump(self.dictionary, pkl_file)
            DATA_STORE.update_hash(dict_name)

        self.word_vocab_size = len(self.dictionary.word_dictionary)
        self.char_vocab_size = len(self.dictionary.char_dictionary)

        self._sample_val_db = None
        self._sample_train_db = None
        self._train_db = None
        self._val_db = None

        #TODO: Add Shuffle Dataset if necessary.

        print("Build Complete.")

    def read_txt(self, filename):
        with open(filename, "r") as r:
            return r.read()

    def read_file_from_db(self, is_train, task_key, sample=True):
        if sample and not is_train:
            task_path = task_key + "_test"
        else:
            task_path = task_key + "_train" if is_train else task_key

        if not DATA_STORE.is_valid(task_path):
            raise NameError("{0} does not exist.".format(task_path))
        return DATA_STORE.get_file(task_path)["fpath"]

    def build_dataset(self,
                      train,
                      sample=True,
                      force_rebuild=False,
                      nohashcheck=False):
        num_tasks = 0
        record_root = self.train_record_root if train else self.val_record_root
        record_name = "sample.tfrecords" if sample else "data.tfrecords"
        subset = self.subset

        if not train:
            subset = subset + "-valid"

        if not sample:
            subset = subset + "-10k"

        if force_rebuild:
            log_message('Building dataset ({})...'.format(
                'Train' if train else 'Valid'))

            task_path = "{0}/{1}/{2}/{3}"

            for task in tqdm.tqdm(bAbI_20.task_list):
                if not train:
                    task_id = task.split("_")[0]
                else:
                    task_id = task
                task_tf_root = os.path.join(record_root, subset, task_id)

                tf_record_writer = tf.python_io.TFRecordWriter(
                    DATA_STORE.create_key(task_tf_root,
                                          record_name,
                                          force=force_rebuild))

                # Format from the template on every iteration; reusing the
                # already formatted string would repeat the first task's path.
                task_path = task_path_template.format(self.task_root,
                                                      self.task_root, subset,
                                                      task_id)
                data_path = self.read_file_from_db(train, task_path)

                txt = self.read_txt(data_path)
                features = self.parse_context_question(txt)

                for feature_dict in features:
                    example = tf.train.Example(\
                                    features=tf.train.Features(feature=feature_dict))
                    tf_record_writer.write(example.SerializeToString())
                tf_record_writer.close()
                DATA_STORE.update_hash(task_tf_root)
                num_tasks += 1
        return num_tasks

    def parse_context_question(self, text: str) -> List:
        context = ([], [], [])
        question = ([], [], [])
        answer = ([], [], [])
        source = []
        features = []

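        # Assumed layout of the raw bAbI text files: each story spans
        # UNIT_SIZE numbered lines, every line whose index is a multiple of QA
        # is a question line ("question<TAB>answer<TAB>supporting fact ids"),
        # and all other lines are plain context sentences.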
        for i, eachline in enumerate(text.split("\n")):
            # use enumerate to double-check that there are no special lines
            j = (i % UNIT_SIZE) + 1
            parsed_line = eachline.split(" ")
            if len(parsed_line) < 2:
                continue
            index = int(parsed_line[0])
            assert index == j
            text = " ".join(parsed_line[1:])
            if index % QA == 0:
                q, a, src = text.split("\t")
                q_dense, q_len = self.nlp_dict.dense_parse(
                    q, word_padding=self.wml, char_padding=self.cml)
                a_dense, a_len = self.nlp_dict.dense_parse(
                    a, word_padding=self.wml, char_padding=self.cml)
                c_dense, c_len = self.nlp_dict.dense_parse(
                    "", word_padding=self.wml, char_padding=self.cml)

                question[0].append(q_dense[0])
                question[1].append(q_dense[1])
                question[2].append(q_len)

                answer[0].append(a_dense[0])
                answer[1].append(a_dense[1])
                answer[2].append(a_len)

                source.append(src)
            else:
                c_dense, c_len = self.nlp_dict.dense_parse(
                    text, word_padding=self.wml, char_padding=self.cml)

            context[0].append(c_dense[0])
            context[1].append(c_dense[1])
            context[2].append(c_len)

            if index == UNIT_SIZE:
                context = self._to_np_array(context)
                question = self._to_np_array(question)
                answer = self._to_np_array(answer)

                source = np.array(source)

                feature_dict = self.build_feature_dict(context, question,
                                                       answer, source)

                context = ([], [], [])
                question = ([], [], [])
                answer = ([], [], [])
                source = []

                features.append(feature_dict)
        return features

    def build_feature_dict(self, context, question, answer, source):
        context_dense, context_char, c_len = context
        question_dense, question_char, q_len = question
        answer_dense, answer_char, a_len = answer
        feature_dict = {}
        feature_dict['context_word_embedding'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=context_dense.flatten()))
        feature_dict['context_char_embedding'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=context_char.flatten()))
        feature_dict['question_word_embedding'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=question_dense.flatten()))
        feature_dict['question_char_embedding'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=question_char.flatten()))
        feature_dict['answer_word_embedding'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=answer_dense.flatten()))
        feature_dict['answer_char_embedding'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=answer_char.flatten()))

        feature_dict['word_maxlen'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=[self.wml]))
        feature_dict['char_maxlen'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=[self.cml]))

        feature_dict['context_word_len'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=c_len))
        feature_dict['question_word_len'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=q_len))
        feature_dict['answer_word_len'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=a_len))
        return feature_dict

    def _to_np_array(self, t: Tuple):
        return tuple([np.array(i) for i in t])

    def build_db(self, sample, is_train, db, subset) -> Dict:
        record_root = self.train_record_root if is_train else self.val_record_root
        if db is not None:
            return db
        task_dict = {}
        record_name = "sample.tfrecords" if sample else "data.tfrecords"
        for task in bAbI_20.task_list:
            if not is_train:
                task = task.split("_")[0]
            task_tf_root = os.path.join(record_root, subset, task)

            if not DATA_STORE.is_valid(task_tf_root):
                raise NotImplementedError("{} not built".format(task_tf_root))

            task_dict[task] = tf.data.TFRecordDataset(
                DATA_STORE[task_tf_root],
                num_parallel_reads=self.num_parallel_reads).map(self._map_fn)

        return task_dict

    @property
    def sample_train_db(self, ) -> Dict:
        self._sample_train_db = self.build_db(True, True,
                                              self._sample_train_db,
                                              self.subset)
        return self._sample_train_db

    @property
    def sample_val_db(self, ) -> Dict:
        self._sample_val_db = self.build_db(True, False, self._sample_val_db,
                                            self.subset + "-valid")
        return self._sample_val_db

    @property
    def train_db(self, ) -> Dict:
        self._train_db = self.build_db(False, True, self._train_db,
                                       self.subset + "-10k")
        return self._train_db

    @property
    def val_db(self, ) -> Dict:
        self._val_db = self.build_db(False, False, self._val_db,
                                     self.subset + "-valid-10k")
        return self._val_db

    def _map_fn(self, serialized_example):
        feature_dict = {}
        feature_dict['context_word_embedding'] = tf.FixedLenFeature(
            [UNIT_SIZE, self.wml], tf.int64)
        #     feature_dict['context_char_embedding'] = tf.FixedLenFeature([15, mwl, mcl], tf.int64)
        feature_dict['question_word_embedding'] = tf.FixedLenFeature(
            [QA, self.wml], tf.int64)
        #     feature_dict['question_char_embedding'] = tf.FixedLenFeature([5, mwl, mcl], tf.int64)
        feature_dict['answer_word_embedding'] = tf.FixedLenFeature(
            [QA, self.wml], tf.int64)
        #     feature_dict['answer_char_embedding'] = tf.FixedLenFeature([5, mwl, mcl], tf.int64)

        feature_dict['word_maxlen'] = tf.FixedLenFeature([], tf.int64)
        #     feature_dict['char_maxlen'] = tf.FixedLenFeature([], tf.int64)
        feature_dict['context_word_len'] = tf.FixedLenFeature([UNIT_SIZE],
                                                              tf.int64)
        feature_dict['question_word_len'] = tf.FixedLenFeature([QA], tf.int64)
        feature_dict['answer_word_len'] = tf.FixedLenFeature([QA], tf.int64)


        features = tf.parse_single_example(\
                                            serialized_example,\
                                        features=feature_dict)

        cwe = features["context_word_embedding"]
        qwe = features["question_word_embedding"]
        awe = features["answer_word_embedding"]

        wml = features["word_maxlen"]
        cwl = features["context_word_len"]
        qwl = features["question_word_len"]
        awl = features["answer_word_len"]
        return cwe, qwe, awe, wml, cwl, qwl, awl
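
A minimal usage sketch for the bAbI_20 class above, assuming TensorFlow 1.x and that the project helpers it relies on (DATA_STORE, NLPDictionary, maybe_download_and_store_tar, UNIT_SIZE, QA) are importable. The module path and constructor arguments below are illustrative guesses, and the TFRecords are assumed to have been built already (pass force_rebuild=True on a first run):

import tensorflow as tf

from babi_dataset import bAbI_20  # hypothetical module path; adjust to the real one

babi = bAbI_20(num_parallel_reads=2, force_rebuild=False)

# train_db maps each bAbI task name to a parsed tf.data.Dataset.
task_ds = babi.train_db['qa1_single-supporting-fact']
batch = task_ds.shuffle(1000).batch(32).make_one_shot_iterator().get_next()

with tf.Session() as sess:
    cwe, qwe, awe, wml, cwl, qwl, awl = sess.run(batch)
    print(cwe.shape, qwe.shape, awe.shape)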
Example #4
class Squad():

    def __init__(self, version: str='2.0', num_parallel_reads: Optional[int]=None, force_rebuild=False, nohashcheck=False) -> None:

        self.num_parallel_reads = num_parallel_reads
        self.num_val_examples = None
        self.num_train_examples = None

        if version == '2.0':
            self.training_data_json_key = maybe_download_and_store_single_file(
                url='https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json', key='squad/train_json')
            self.dev_data_json_key = maybe_download_and_store_single_file(
                url='https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json', key='squad/dev_json')

            # Load the JSON from the files
            with open(DATA_STORE[self.training_data_json_key], 'r') as train_json:
                self.train_json = json.loads(train_json.read())
            with open(DATA_STORE[self.dev_data_json_key], 'r') as dev_json:
                self.dev_json = json.loads(dev_json.read())

            # Setup some baked constants in the dataset
            self.mwl = 766
            self.mcl = 37
            self.mql = 30

            # Parse the JSON
            if not force_rebuild and DATA_STORE.is_valid('squad/dictionary', nohashcheck=nohashcheck):
                with open(DATA_STORE['squad/dictionary'], 'rb') as pkl_file:
                    self.dictionary = pickle.load(pkl_file)
            else:
                self.dictionary = NLPDictionary()

            # Build the training set if necessary
            self.num_train_examples = self.build_dataset(train=True, force_rebuild=force_rebuild, nohashcheck=nohashcheck)
            if self.num_train_examples is None or self.num_train_examples == 0:
                self.num_train_examples = sum(1 for _ in tf.python_io.tf_record_iterator(DATA_STORE['squad/tfrecord/train']))

            # Build the validation set if necessary
            self.num_val_examples = self.build_dataset(train=False, force_rebuild=force_rebuild, nohashcheck=nohashcheck)
            if self.num_val_examples is None or self.num_val_examples == 0:
                self.num_val_examples = sum(1 for _ in tf.python_io.tf_record_iterator(DATA_STORE['squad/tfrecord/dev']))

            self.train_fpath = DATA_STORE['squad/tfrecord/train']
            self.dev_fpath = DATA_STORE['squad/tfrecord/dev']

            # Save the dictionary
            with open(DATA_STORE.create_key('squad/dictionary', 'dict.pkl', force=True), 'wb') as pkl_file:
                pickle.dump(self.dictionary, pkl_file)
                DATA_STORE.update_hash('squad/dictionary')

            self.word_vocab_size = len(self.dictionary.word_dictionary)
            self.char_vocab_size = len(self.dictionary.char_dictionary)

            self._val_db = None
            self._train_db = None
        else:
            raise NotImplementedError(
                "Only version 2.0 is currently supported")

    def build_dataset(self, train, force_rebuild=False, nohashcheck=False):
        record_root = 'squad/tfrecord/train' if train else 'squad/tfrecord/dev'
        json_data = self.train_json['data'] if train else self.dev_json['data']
        num_errors = 0
        num_documents = 0

        if force_rebuild or not DATA_STORE.is_valid(record_root, nohashcheck=nohashcheck):
            log_message('Building dataset ({})...'.format('Train' if train else 'Valid'))
            tf_record_writer = tf.python_io.TFRecordWriter(
                DATA_STORE.create_key(record_root, 'data.tfrecords',force=force_rebuild))
            for article in tqdm.tqdm(json_data):
                for paragraph_json in article['paragraphs']:

                    # Compute the context embedding
                    context_tokens = self.dictionary.tokenizer.parse(paragraph_json['context'].strip().replace('\n', ''))
                    context_dense, context_len = self.dictionary.dense_parse_tokens(context_tokens, word_padding=self.mwl, char_padding=self.mcl)

                    # Compute the QA embeddings
                    for question_answer in paragraph_json['qas']:
                        question_dense, question_len = self.dictionary.dense_parse(
                            question_answer['question'].strip().replace('\n', ''), word_padding=self.mql, char_padding=self.mcl)

                        # For each answer
                        for answer in question_answer['answers']:
                            answer_dense, answer_len = self.dictionary.dense_parse(
                                answer['text'], word_padding=self.mql, char_padding=self.mcl)

                            # Character span start/end
                            span_start = answer['answer_start']
                            span_end = span_start + len(answer['text'])

                            # Get the token span from the char span
                            token_span_start, token_span_end = get_token_span_from_char_span(
                                paragraph_json['context'].strip().replace('\n', ''), context_tokens, span_start, span_end)

                            if token_span_start < 0 or token_span_end < 0:
                                num_errors += 1
                                break

                            # Now that we've got the contents, let's make a TF-Record
                            # We're going to handle the tf-record writing here for now
                            # TODO: Move the tf-record writing to its own file
                            feature_dict = self.build_feature_dict(context_dense, question_dense, answer_dense, span_start, span_end, token_span_start, token_span_end, context_len, question_len, answer_len)

                            example = tf.train.Example(
                                features=tf.train.Features(feature=feature_dict))
                            tf_record_writer.write(
                                example.SerializeToString())
                            num_documents += 1
            tf_record_writer.close()
            DATA_STORE.update_hash(record_root)
        return num_documents

    def build_feature_dict(self, context_dense, question_dense, answer_dense, span_start, span_end, token_span_start, token_span_end, context_len, question_len, answer_len):
        # Create the feature dictionary
        feature_dict = {}
        feature_dict['context_word_embedding'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=context_dense[0].flatten()))
        feature_dict['context_char_embedding'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=context_dense[1].flatten()))
        feature_dict['question_word_embedding'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=question_dense[0].flatten()))
        feature_dict['question_char_embedding'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=question_dense[1].flatten()))
        feature_dict['answer_word_embedding'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=answer_dense[0].flatten()))
        feature_dict['answer_char_embedding'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=answer_dense[1].flatten()))
        feature_dict['word_maxlen'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=[self.mwl]))
        feature_dict['char_maxlen'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=[self.mcl]))
        feature_dict['span_start'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=[span_start]))
        feature_dict['span_end'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=[span_end]))
        feature_dict['token_span_start'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=[token_span_start]))
        feature_dict['token_span_end'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=[token_span_end]))
        feature_dict['context_word_len'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=[context_len]))
        feature_dict['question_word_len'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=[question_len]))
        feature_dict['answer_word_len'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=[answer_len]))

        return feature_dict

    @property
    def train_db(self,):
        if self._train_db is None:
            self._train_db = tf.data.TFRecordDataset(
                DATA_STORE['squad/tfrecord/train'], num_parallel_reads=self.num_parallel_reads).map(self._map_fn)
        return self._train_db

    @property
    def val_db(self,):
        if self._val_db is None:
            self._val_db = tf.data.TFRecordDataset(
                DATA_STORE['squad/tfrecord/dev'], num_parallel_reads=self.num_parallel_reads).map(self._map_fn)
        return self._val_db

    def _map_fn(self, serialized_example):
        # Parse the DB out from the tf_record file
        features = tf.parse_single_example(
            serialized_example,
            features={'context_word_embedding': tf.FixedLenFeature([self.mwl], tf.int64),
                      'context_char_embedding': tf.FixedLenFeature([self.mwl, self.mcl], tf.int64),
                      'question_word_embedding': tf.FixedLenFeature([self.mql], tf.int64),
                      'question_char_embedding': tf.FixedLenFeature([self.mql, self.mcl], tf.int64),
                      'answer_word_embedding': tf.FixedLenFeature([self.mql], tf.int64),
                      'answer_char_embedding': tf.FixedLenFeature([self.mql, self.mcl], tf.int64),
                      'word_maxlen': tf.FixedLenFeature([], tf.int64),
                      'char_maxlen': tf.FixedLenFeature([], tf.int64),
                      'span_start': tf.FixedLenFeature([], tf.int64),
                      'span_end': tf.FixedLenFeature([], tf.int64),
                      'token_span_start': tf.FixedLenFeature([], tf.int64),
                      'token_span_end': tf.FixedLenFeature([], tf.int64),
                      'context_word_len': tf.FixedLenFeature([], tf.int64),
                      'question_word_len': tf.FixedLenFeature([], tf.int64),
                      'answer_word_len': tf.FixedLenFeature([], tf.int64)
                      })

        cwe = features['context_word_embedding']
        cce = features['context_char_embedding']
        qwe = features['question_word_embedding']
        qce = features['question_char_embedding']
        tss = tf.cast(features['token_span_start'], tf.int64)
        tse = tf.cast(features['token_span_end'], tf.int64)

        # Other stuff that isn't used yet
        awe = features['answer_word_embedding']
        ace = features['answer_char_embedding']
        wml = tf.cast(features['word_maxlen'], tf.int64)
        cml = tf.cast(features['char_maxlen'], tf.int64)
        sps = tf.cast(features['span_start'], tf.int64)
        spe = tf.cast(features['span_end'], tf.int64)
        cwl = tf.cast(features['context_word_len'], tf.int64)
        qwl = tf.cast(features['question_word_len'], tf.int64)
        awl = tf.cast(features['answer_word_len'], tf.int64)

        # This tuple is the longest, most terrible thing ever
        return (cwe, qwe, cce, qce, tss, tse, awe, ace, wml, cml, sps, spe, cwl, qwl, awl)

    def info(self, ) -> str:
        return(tabulate([['Num Train Examples', self.num_train_examples],
                        ['Num Val Examples', self.num_val_examples],
                        ['Word Vocab Size', self.word_vocab_size],
                        ['Char Vocab Size', self.char_vocab_size]]))
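
A minimal usage sketch for the Squad class above, again assuming TensorFlow 1.x and the same project-level helpers (DATA_STORE, NLPDictionary, the download utilities). The module path is a hypothetical placeholder:

import tensorflow as tf

from squad_dataset import Squad  # hypothetical module path; adjust to the real one

squad = Squad(version='2.0', num_parallel_reads=4)
print(squad.info())

# Shuffle and batch the parsed training records; _map_fn returns a 15-tuple,
# of which the first six are the fields actually consumed downstream.
batch = (squad.train_db
         .shuffle(buffer_size=1000)
         .batch(16)
         .make_one_shot_iterator()
         .get_next())

with tf.Session() as sess:
    cwe, qwe, cce, qce, tss, tse = sess.run(batch[:6])
    print(cwe.shape, cce.shape, tss.shape)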
Example #5
class NMT(Dataset):
    REQ_SIZE = 32509533

    available_dataset = {"English2Vietnamese": "en-vi", \
                         "English2German": "en-de"}

    def __init__(self,
                 version: str = None,
                 num_parallel_reads: Optional[int] = None,
                 force_rebuild=False,
                 nohashcheck=False) -> None:
        log_message("Building NMT...")
        if not Dataset.has_space(NMT.REQ_SIZE):
            return
        if version is None:
            log_message(
                "Please Select From following translation: en-vi, en-de")
            return
        self.num_parallel_reads = num_parallel_reads
        self.num_val_examples = None
        self.num_train_examples = None
        self.num_test_examples = None
        self.mwl = 40
        self.qwl = 40

        site_prefix = "https://nlp.stanford.edu/projects/nmt/data/"
        root_key = "nmt"

        if version == 'en-vi':
            self.root_key = os.path.join(root_key, "en-vi")
            train_eng_file = os.path.join(site_prefix,
                                          "iwslt15.en-vi/train.en")
            train_for_file = os.path.join(site_prefix,
                                          "iwslt15.en-vi/train.vi")
            val_eng_file = os.path.join(site_prefix,
                                        "iwslt15.en-vi/tst2012.en")
            val_for_file = os.path.join(site_prefix,
                                        "iwslt15.en-vi/tst2012.vi")
            test_eng_file = os.path.join(site_prefix,
                                         "iwslt15.en-vi/tst2013.en")
            test_for_file = os.path.join(site_prefix,
                                         "iwslt15.en-vi/tst2013.vi")
            vocab_eng_file = os.path.join(site_prefix,
                                          "iwslt15.en-vi/vocab.en")
            vocab_for_file = os.path.join(site_prefix,
                                          "iwslt15.en-vi/vocab.vi")
            # size = {"train_eng_file": 13603614,
            #         "train_for_file": 18074646,
            #         "val_eng_file": 140250,
            #         "val_for_file": 188396,
            #         "test_eng_file": 132264,
            #         "test_for_file": 183855,
            #         "vocab_eng_file": 139741,
            #         "vocab_for_file": 46767}

        elif version == "en-de":
            self.root_key = os.path.join(root_key, "en-de")
            train_eng_file = os.path.join(site_prefix, "wmt14.en-de/train.en")
            train_for_file = os.path.join(site_prefix, "wmt14.en-de/train.de")
            val_eng_file = os.path.join(site_prefix,
                                        "wmt14.en-de/newstest2012.en")
            val_for_file = os.path.join(site_prefix,
                                        "wmt14.en-de/newstest2012.de")
            test_eng_file = os.path.join(site_prefix,
                                         "wmt14.en-de/newstest2013.en")
            test_for_file = os.path.join(site_prefix,
                                         "wmt14.en-de/newstest2013.de")
            vocab_eng_file = os.path.join(site_prefix,
                                          "wmt14.en-de/vocab.50K.en")
            vocab_for_file = os.path.join(site_prefix,
                                          "wmt14.en-de/vocab.50K.de")
            # size = {"train_eng_file": 644874240,
            #         "train_for_file": 717225984,
            #         "val_eng_file": 406528,
            #         "val_for_file": 470016,
            #         "test_eng_file": 355328,
            #         "test_for_file": 405504,
            #         "vocab_eng_file": 404480,
            #         "vocab_for_file": 504832}
        # Download Files
        self.train_eng = maybe_download_and_store_single_file(
            train_eng_file, os.path.join(self.root_key, "train-en"))
        self.train_for = maybe_download_and_store_single_file(
            train_for_file, os.path.join(self.root_key, "train-for"))
        self.val_eng = maybe_download_and_store_single_file(
            val_eng_file, os.path.join(self.root_key, "val-en"))
        self.val_for = maybe_download_and_store_single_file(
            val_for_file, os.path.join(self.root_key, "val-for"))
        self.test_eng = maybe_download_and_store_single_file(
            test_eng_file, os.path.join(self.root_key, "test-en"))
        self.test_for = maybe_download_and_store_single_file(
            test_for_file, os.path.join(self.root_key, "test-for"))
        self.vocab_eng = maybe_download_and_store_single_file(
            vocab_eng_file, os.path.join(self.root_key, "vocab-en"))
        self.vocab_for = maybe_download_and_store_single_file(
            vocab_for_file, os.path.join(self.root_key, "vocab-for"))

        # Load the vocab files
        src_dictionary_key = os.path.join(self.root_key, "dictionary", "en")
        for_dictionary_key = os.path.join(self.root_key, "dictionary", "for")

        if not DATA_STORE.is_valid(
                src_dictionary_key) or not DATA_STORE.is_valid(
                    for_dictionary_key) or force_rebuild:
            self.src_dictionary = NLPDictionary()
            self.dst_dictionary = NLPDictionary()
        else:
            self.src_dictionary = NLPDictionary()
            self.dst_dictionary = NLPDictionary()
            self.src_dictionary.load(DATA_STORE[src_dictionary_key])
            self.dst_dictionary.load(DATA_STORE[for_dictionary_key])

        self.num_train_examples = self._build_dataset(
            "train", force_rebuild=force_rebuild)
        self.num_val_examples = self._build_dataset(
            "val", force_rebuild=force_rebuild)
        self.num_test_examples = self._build_dataset(
            "test", force_rebuild=force_rebuild)

        with open(
                DATA_STORE.create_key(src_dictionary_key,
                                      'dict.pkl',
                                      force=True), 'wb') as pkl_file:
            pickle.dump(self.src_dictionary, pkl_file)
            DATA_STORE.update_hash(src_dictionary_key)

        with open(
                DATA_STORE.create_key(for_dictionary_key,
                                      'dict.pkl',
                                      force=True), 'wb') as pkl_file:
            pickle.dump(self.dst_dictionary, pkl_file)
            DATA_STORE.update_hash(for_dictionary_key)

        self.word_vocab_size = len(self.src_dictionary.word_dictionary)

        # TODO: Add current vocab size from vocab file

        self._train_db = None
        self._val_db = None

    def build_db(self, mode="train") -> Dict:
        record_root = os.path.join(self.root_key, "tfrecord", mode)
        db = tf.data.TFRecordDataset(
            DATA_STORE[record_root],
            num_parallel_reads=self.num_parallel_reads).map(self._map_fn)
        return db

    @property
    def train_db(self, ):
        if self._train_db is None:
            self._train_db = self.build_db(mode="train")
        return self._train_db

    @property
    def val_db(self, ):
        if self._val_db is None:
            self._val_db = self.build_db(mode="val")
        return self._val_db

    def _build_dataset(self,
                       mode="train",
                       force_rebuild=False,
                       nohashcheck=False):
        # For now, we will not use the provided vocab
        record_root = os.path.join(self.root_key, "tfrecord", mode)
        if force_rebuild or not DATA_STORE.is_valid(record_root,
                                                    nohashcheck=nohashcheck):
            log_message('Building dataset ({})...'.format(mode))
            tf_record_writer = tf.python_io.TFRecordWriter(\
                DATA_STORE.create_key(record_root, 'data.tfrecords',force=force_rebuild))

            if mode == "train":
                eng_file = self.train_eng
                for_file = self.train_for
            if mode == "test":
                eng_file = self.test_eng
                for_file = self.test_for
            else:
                eng_file = self.val_eng
                for_file = self.val_for

            with codecs.getreader("utf-8")(tf.gfile.GFile(DATA_STORE[eng_file],
                                                          mode="rb")) as f:
                eng_data = f.read().splitlines()
            with codecs.getreader("utf-8")(tf.gfile.GFile(DATA_STORE[for_file],
                                                          mode="rb")) as f:
                for_data = f.read().splitlines()

            for i, line in tqdm.tqdm(enumerate(eng_data)):
                src_dense, src_len = self.src_dictionary.dense_parse(line, \
                                                                        word_padding=self.mwl, \
                                                                        char_padding=0)
                for_line = for_data[i]
                for_dense, for_len = self.dst_dictionary.dense_parse(for_line, \
                                                                    word_padding=self.mwl, \
                                                                    char_padding=0)
                feature_dict = self.build_feature_dict(src_dense[0],
                                                       for_dense[0], src_len,
                                                       for_len)

                example = tf.train.Example(features=tf.train.Features(
                    feature=feature_dict))
                tf_record_writer.write(example.SerializeToString())
            tf_record_writer.close()
            DATA_STORE.update_hash(record_root)
            return len(eng_data)
        else:
            return sum(1 for _ in tf.python_io.tf_record_iterator(
                DATA_STORE[record_root]))

    def _map_fn(self, serialized_example):
        # Parse the DB out from the tf_record file
        features = tf.parse_single_example(serialized_example,
                                           features={
                                               'eng_word_embedding':
                                               tf.FixedLenFeature([self.mwl],
                                                                  tf.int64),
                                               'foreign_word_embedding':
                                               tf.FixedLenFeature([self.mwl],
                                                                  tf.int64),
                                               'eng_word_len':
                                               tf.FixedLenFeature([],
                                                                  tf.int64),
                                               'foreign_word_len':
                                               tf.FixedLenFeature([],
                                                                  tf.int64),
                                           })

        src = features['eng_word_embedding']
        dst = features['foreign_word_embedding']
        src_len = tf.cast(features['eng_word_len'], tf.int64)
        dst_len = tf.cast(features['foreign_word_len'], tf.int64)

        return (
            src,
            dst,
            src_len,
            dst_len,
        )
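
A minimal usage sketch for the NMT class above, under the same assumptions (TensorFlow 1.x, the project's DATA_STORE and NLPDictionary helpers available); the module path is a hypothetical placeholder:

import tensorflow as tf

from nmt_dataset import NMT  # hypothetical module path; adjust to the real one

nmt = NMT(version='en-vi', num_parallel_reads=2)

# Each element is (src_ids, dst_ids, src_len, dst_len); the id vectors are
# padded to mwl (40) word positions per sentence.
batch = nmt.train_db.shuffle(10000).batch(64).make_one_shot_iterator().get_next()

with tf.Session() as sess:
    src, dst, src_len, dst_len = sess.run(batch)
    print(src.shape, dst.shape)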
Example #6
    def __init__(self,
                 version: str = None,
                 num_parallel_reads: Optional[int] = None,
                 force_rebuild=False,
                 nohashcheck=False) -> None:
        log_message("Building NMT...")
        if not Dataset.has_space(NMT.REQ_SIZE):
            return
        if version is None:
            log_message(
                "Please Select From following translation: en-vi, en-de")
            return
        self.num_parallel_reads = num_parallel_reads
        self.num_val_examples = None
        self.num_train_examples = None
        self.num_test_examples = None
        self.mwl = 40
        self.qwl = 40

        site_prefix = "https://nlp.stanford.edu/projects/nmt/data/"
        root_key = "nmt"

        if version == 'en-vi':
            self.root_key = os.path.join(root_key, "en-vi")
            train_eng_file = os.path.join(site_prefix,
                                          "iwslt15.en-vi/train.en")
            train_for_file = os.path.join(site_prefix,
                                          "iwslt15.en-vi/train.vi")
            val_eng_file = os.path.join(site_prefix,
                                        "iwslt15.en-vi/tst2012.en")
            val_for_file = os.path.join(site_prefix,
                                        "iwslt15.en-vi/tst2012.vi")
            test_eng_file = os.path.join(site_prefix,
                                         "iwslt15.en-vi/tst2013.en")
            test_for_file = os.path.join(site_prefix,
                                         "iwslt15.en-vi/tst2013.vi")
            vocab_eng_file = os.path.join(site_prefix,
                                          "iwslt15.en-vi/vocab.en")
            vocab_for_file = os.path.join(site_prefix,
                                          "iwslt15.en-vi/vocab.vi")
            # size = {"train_eng_file": 13603614,
            #         "train_for_file": 18074646,
            #         "val_eng_file": 140250,
            #         "val_for_file": 188396,
            #         "test_eng_file": 132264,
            #         "test_for_file": 183855,
            #         "vocab_eng_file": 139741,
            #         "vocab_for_file": 46767}

        elif version == "en-de":
            self.root_key = os.path.join(root_key, "en-de")
            train_eng_file = os.path.join(site_prefix, "wmt14.en-de/train.en")
            train_for_file = os.path.join(site_prefix, "wmt14.en-de/train.de")
            val_eng_file = os.path.join(site_prefix,
                                        "wmt14.en-de/newstest2012.en")
            val_for_file = os.path.join(site_prefix,
                                        "wmt14.en-de/newstest2012.de")
            test_eng_file = os.path.join(site_prefix,
                                         "wmt14.en-de/newstest2013.en")
            test_for_file = os.path.join(site_prefix,
                                         "wmt14.en-de/newstest2013.de")
            vocab_eng_file = os.path.join(site_prefix,
                                          "wmt14.en-de/vocab.50K.en")
            vocab_for_file = os.path.join(site_prefix,
                                          "wmt14.en-de/vocab.50K.de")
            # size = {"train_eng_file": 644874240,
            #         "train_for_file": 717225984,
            #         "val_eng_file": 406528,
            #         "val_for_file": 470016,
            #         "test_eng_file": 355328,
            #         "test_for_file": 405504,
            #         "vocab_eng_file": 404480,
            #         "vocab_for_file": 504832}
        # Download Files
        self.train_eng = maybe_download_and_store_single_file(
            train_eng_file, os.path.join(self.root_key, "train-en"))
        self.train_for = maybe_download_and_store_single_file(
            train_for_file, os.path.join(self.root_key, "train-for"))
        self.val_eng = maybe_download_and_store_single_file(
            val_eng_file, os.path.join(self.root_key, "val-en"))
        self.val_for = maybe_download_and_store_single_file(
            val_for_file, os.path.join(self.root_key, "val-for"))
        self.test_eng = maybe_download_and_store_single_file(
            test_eng_file, os.path.join(self.root_key, "test-en"))
        self.test_for = maybe_download_and_store_single_file(
            test_for_file, os.path.join(self.root_key, "test-for"))
        self.vocab_eng = maybe_download_and_store_single_file(
            vocab_eng_file, os.path.join(self.root_key, "vocab-en"))
        self.vocab_for = maybe_download_and_store_single_file(
            vocab_for_file, os.path.join(self.root_key, "vocab-for"))

        # Load the vocab files
        src_dictionary_key = os.path.join(self.root_key, "dictionary", "en")
        for_dictionary_key = os.path.join(self.root_key, "dictionary", "for")

        if not DATA_STORE.is_valid(
                src_dictionary_key) or not DATA_STORE.is_valid(
                    for_dictionary_key) or force_rebuild:
            self.src_dictionary = NLPDictionary()
            self.dst_dictionary = NLPDictionary()
        else:
            self.src_dictionary = NLPDictionary()
            self.dst_dictionary = NLPDictionary()
            self.src_dictionary.load(DATA_STORE[src_dictionary_key])
            self.dst_dictionary.load(DATA_STORE[for_dictionary_key])

        self.num_train_examples = self._build_dataset(
            "train", force_rebuild=force_rebuild)
        self.num_val_examples = self._build_dataset(
            "val", force_rebuild=force_rebuild)
        self.num_test_examples = self._build_dataset(
            "test", force_rebuild=force_rebuild)

        with open(
                DATA_STORE.create_key(src_dictionary_key,
                                      'dict.pkl',
                                      force=True), 'wb') as pkl_file:
            pickle.dump(self.src_dictionary, pkl_file)
            DATA_STORE.update_hash(src_dictionary_key)

        with open(
                DATA_STORE.create_key(for_dictionary_key,
                                      'dict.pkl',
                                      force=True), 'wb') as pkl_file:
            pickle.dump(self.dst_dictionary, pkl_file)
            DATA_STORE.update_hash(for_dictionary_key)

        self.word_vocab_size = len(self.src_dictionary.word_dictionary)

        # TODO: Add current vocab size from vocab file

        self._train_db = None
        self._val_db = None
Example #7
class COCOCaptions(Dataset):
    """MS COCO Caption Dataset."""

    AMT_REQUIRED=10

    def __init__(self, num_parallel_reads: int=1, force_rebuild: bool=False, nohashcheck=False) -> None:
        # Amount of Space Check.
        if not Dataset.has_space(COCOCaptions.AMT_REQUIRED):
            return
        # Download the annotation and image archives if they are missing or a rebuild was requested
        if not DATA_STORE.is_valid('coco2014/data/annotations', nohashcheck=nohashcheck) or force_rebuild:
            maybe_download_and_store_zip('http://images.cocodataset.org/annotations/annotations_trainval2014.zip', 'coco2014/data/annotations', use_subkeys=False)
        if not DATA_STORE.is_valid('coco2014/data/train/images', nohashcheck=nohashcheck) or force_rebuild:
            maybe_download_and_store_zip('http://images.cocodataset.org/zips/train2014.zip', 'coco2014/data/train/images', use_subkeys=False)
        if not DATA_STORE.is_valid('coco2014/data/val/images', nohashcheck=nohashcheck) or force_rebuild:
            maybe_download_and_store_zip('http://images.cocodataset.org/zips/val2014.zip', 'coco2014/data/val/images', use_subkeys=False)

        # TODO ([email protected]) Need to make sure that this works - there could be download issues, but it's hard to say
        self.train_json_key = 'coco2014/data/annotations'
        self.val_json_key = 'coco2014/data/annotations'

        log_message("Finished Downloading")

        # Now that we have the data, load and parse the JSON files
        need_rebuild_train = force_rebuild
        if not DATA_STORE.is_valid('coco2014/tfrecord/train', nohashcheck=nohashcheck) or need_rebuild_train:
            need_rebuild_train = True
            with open(os.path.join(DATA_STORE[self.train_json_key], 'annotations/captions_train2014.json'), 'r') as annotation_file:
                self.train_json = json.loads(annotation_file.read())
        
        need_rebuild_val = force_rebuild
        if not DATA_STORE.is_valid('coco2014/tfrecord/val', nohashcheck=nohashcheck) or need_rebuild_val:
            need_rebuild_val = True
            with open(os.path.join(DATA_STORE[self.val_json_key], 'annotations/captions_val2014.json'), 'r') as annotation_file:
                self.val_json = json.loads(annotation_file.read())

        # Load the vocab files
        if not DATA_STORE.is_valid('coco2014/captions/dictionary') or force_rebuild:
            self.dictionary = NLPDictionary()
            need_rebuild_train = True
            need_rebuild_val = True
        else:
            self.dictionary = NLPDictionary()
            self.dictionary.load(DATA_STORE['coco2014/captions/dictionary'])

        # Setup some default options for the dataset
        self.max_word_length = 50
        self.max_char_length = 16
        self._val_db = None
        self._train_db = None
        self.num_parallel_reads = num_parallel_reads
        
        # Build the tfrecord dataset from the JSON
        if need_rebuild_train:
            self._build_dataset('train')
        if need_rebuild_val:
            self._build_dataset('val')

        self.train_fpath = DATA_STORE['coco2014/tfrecord/train']
        self.val_fpath = DATA_STORE['coco2014/tfrecord/val']
        log_message("Finished building tfrecords.")
        # Compute the size of the datasets
        self.num_train_examples = sum(1 for _ in tf.python_io.tf_record_iterator(DATA_STORE['coco2014/tfrecord/train']))
        self.num_val_examples = sum(1 for _ in tf.python_io.tf_record_iterator(DATA_STORE['coco2014/tfrecord/val']))

        # Save the vocab
        dict_path = DATA_STORE.create_key('coco2014/captions/dictionary', 'dict.pkl', force=True)
        self.dictionary.save(dict_path)
        DATA_STORE.update_hash('coco2014/captions/dictionary')

        self.word_vocab_size = len(self.dictionary.word_dictionary)
        self.char_vocab_size = len(self.dictionary.char_dictionary)

    def _build_dataset(self, dataset: str) -> None:

        if dataset not in ['train', 'val']:
            raise ValueError("Must be building either training or validation dataset")

        # Open the TFRecordWriter
        if dataset == 'train':
            record_root = 'coco2014/tfrecord/train'
            json = self.train_json
            root_fpath = DATA_STORE['coco2014/data/train/images']
        else:
            record_root = 'coco2014/tfrecord/val'
            json = self.val_json
            root_fpath = DATA_STORE['coco2014/data/val/images']

        # Construct the record reader
        tf_record_writer = tf.python_io.TFRecordWriter(DATA_STORE.create_key(record_root, 'data.tfrecords', force=True))

        # Loop over the data and parse
        errors = 0
        log_message('Building {} dataset...'.format(dataset))
        for entry in tqdm.tqdm(json['annotations']):
            # Load the image
            image = load_image(build_fpath_from_image_id(root_fpath, entry['image_id'], dataset))
            if image is None:
                errors += 1
                log_warning('Error loading image: {}. {} Errors so far.'.format(build_fpath_from_image_id(root_fpath, entry['image_id'], dataset), errors))
                continue

            # Parse the caption
            caption_raw = entry['caption']
            caption_dense, caption_len = self.dictionary.dense_parse(caption_raw, word_padding=self.max_word_length, char_padding=self.max_char_length)

            # Add the image data 
            feature = {
                'caption_word_embedding': _int64_feature(np.ravel(caption_dense[0]).astype(np.int64)),
                'caption_char_embedding': _int64_feature(np.ravel(caption_dense[1]).astype(np.int64)),
                'caption_length': _int64_feature([caption_len]),
                'image_shape': _int64_feature(image.shape),
                'image': _bytes_feature(tf.compat.as_bytes(image.tostring())),
            }

            # Write the TF-Record
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            tf_record_writer.write(example.SerializeToString())
        
        tf_record_writer.close()
        DATA_STORE.update_hash(record_root)

    def _map_fn(self, serialized_example):

        # Parse the DB out from the tf_record file
        features = tf.parse_single_example(
            serialized_example,
            features={'caption_word_embedding': tf.FixedLenFeature([self.max_word_length], tf.int64),
                      'caption_char_embedding': tf.FixedLenFeature([self.max_word_length, self.max_char_length], tf.int64),
                      'caption_length': tf.FixedLenFeature([1], tf.int64),
                      'image_shape': tf.FixedLenFeature([3], tf.int64),
                      'image': tf.FixedLenFeature([], tf.string),
                      })

        image_shape = features['image_shape']
        image = tf.reshape(tf.decode_raw(features['image'], tf.uint8), image_shape)

        # Return the parsed caption embeddings, caption length, and decoded image as a tuple
        return (features['caption_word_embedding'], features['caption_char_embedding'], features['caption_length'], image)

    @property
    def train_db(self):
        if self._train_db is None:
            self._train_db = tf.data.TFRecordDataset(
                self.train_fpath, num_parallel_reads=self.num_parallel_reads).map(self._map_fn)
        return self._train_db

    @property
    def val_db(self):
        if self._val_db is None:
            self._val_db = tf.data.TFRecordDataset(
                self.val_fpath, num_parallel_reads=self.num_parallel_reads).map(self._map_fn)
        return self._val_db

    def info(self) -> str:
        return(tabulate([['Num Train Examples', self.num_train_examples],
                        ['Num Val Examples', self.num_val_examples],
                        ['Word Vocab Size', self.word_vocab_size],
                        ['Char Vocab Size', self.char_vocab_size]]))
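The `_int64_feature` and `_bytes_feature` helpers used throughout this listing are not shown. A minimal sketch of how they are typically defined on top of the same TF 1.x protobuf API, together with a tiny write/read round trip (the file path and toy image are illustrative, not part of the original code):

import numpy as np
import tensorflow as tf  # TF 1.x API, as used throughout this listing


def _int64_feature(values):
    # Wrap a flat iterable of integers as an Int64List feature.
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))


def _bytes_feature(value):
    # Wrap a single bytes object as a BytesList feature.
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


# Write one toy record with the same pattern as _build_dataset above.
image = np.zeros((4, 4, 3), dtype=np.uint8)
writer = tf.python_io.TFRecordWriter('/tmp/demo.tfrecords')
feature = {
    'image_shape': _int64_feature(image.shape),
    'image': _bytes_feature(tf.compat.as_bytes(image.tostring())),
}
writer.write(tf.train.Example(features=tf.train.Features(feature=feature)).SerializeToString())
writer.close()

# Read it back, the same way the example counts are computed above.
for record in tf.python_io.tf_record_iterator('/tmp/demo.tfrecords'):
    parsed = tf.train.Example.FromString(record)
    print(list(parsed.features.feature['image_shape'].int64_list.value))  # [4, 4, 3]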
Beispiel #8
0
    def __init__(self, num_parallel_reads: int=1, force_rebuild: bool=False, nohashcheck=False) -> None:
        # Check that enough disk space is available.
        if not Dataset.has_space(COCOCaptions.AMT_REQUIRED):
            return
        # Download the annotation and image archives if they are missing or a rebuild is forced
        if not DATA_STORE.is_valid('coco2014/data/annotations', nohashcheck=nohashcheck) or force_rebuild:
            maybe_download_and_store_zip('http://images.cocodataset.org/annotations/annotations_trainval2014.zip', 'coco2014/data/annotations', use_subkeys=False)
        if not DATA_STORE.is_valid('coco2014/data/train/images', nohashcheck=nohashcheck) or force_rebuild:
            maybe_download_and_store_zip('http://images.cocodataset.org/zips/train2014.zip', 'coco2014/data/train/images', use_subkeys=False)
        if not DATA_STORE.is_valid('coco2014/data/val/images', nohashcheck=nohashcheck) or force_rebuild:
            maybe_download_and_store_zip('http://images.cocodataset.org/zips/val2014.zip', 'coco2014/data/val/images', use_subkeys=False)

        # TODO ([email protected]) Need to make sure that this works - there could be download issues, but it's hard to say
        self.train_json_key = 'coco2014/data/annotations'
        self.val_json_key = 'coco2014/data/annotations'

        log_message("Finished Downloading")

        # Now that we have the data, load and parse the JSON files
        need_rebuild_train = force_rebuild
        if not DATA_STORE.is_valid('coco2014/tfrecord/train', nohashcheck=nohashcheck) or need_rebuild_train:
            need_rebuild_train = True
            with open(os.path.join(DATA_STORE[self.train_json_key], 'annotations/captions_train2014.json'), 'r') as annotation_file:
                self.train_json = json.loads(annotation_file.read())
        
        need_rebuild_val = force_rebuild
        if not DATA_STORE.is_valid('coco2014/tfrecord/val', nohashcheck=nohashcheck) or need_rebuild_val:
            need_rebuild_val = True
            with open(os.path.join(DATA_STORE[self.val_json_key], 'annotations/captions_val2014.json'), 'r') as annotation_file:
                self.val_json = json.loads(annotation_file.read())

        # Load the vocab files
        if not DATA_STORE.is_valid('coco2014/captions/dictionary') or force_rebuild:
            self.dictionary = NLPDictionary()
            need_rebuild_train = True
            need_rebuild_val = True
        else:
            self.dictionary = NLPDictionary()
            self.dictionary.load(DATA_STORE['coco2014/captions/dictionary'])

        # Setup some default options for the dataset
        self.max_word_length = 50
        self.max_char_length = 16
        self._val_db = None
        self._train_db = None
        self.num_parallel_reads = num_parallel_reads
        
        # Build the tfrecord dataset from the JSON
        if need_rebuild_train:
            self._build_dataset('train')
        if need_rebuild_val:
            self._build_dataset('val')

        self.train_fpath = DATA_STORE['coco2014/tfrecord/train']
        self.val_fpath = DATA_STORE['coco2014/tfrecord/val']
        log_message("Finished building tfrecords.")
        # Compute the size of the datasets
        self.num_train_examples = sum(1 for _ in tf.python_io.tf_record_iterator(DATA_STORE['coco2014/tfrecord/train']))
        self.num_val_examples = sum(1 for _ in tf.python_io.tf_record_iterator(DATA_STORE['coco2014/tfrecord/val']))

        # Save the vocab
        dict_path = DATA_STORE.create_key('coco2014/captions/dictionary', 'dict.pkl', force=True)
        self.dictionary.save(dict_path)
        DATA_STORE.update_hash('coco2014/captions/dictionary')

        self.word_vocab_size = len(self.dictionary.word_dictionary)
        self.char_vocab_size = len(self.dictionary.char_dictionary)
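A hypothetical usage sketch for the loader above (TF 1.x graph mode): the class name `COCOCaptions` is taken from the space check in the constructor, and the output structure follows `_map_fn` earlier in this listing; the batch size and session handling are illustrative only.

import tensorflow as tf

# Assumes the data has already been downloaded and the tfrecords built.
dataset = COCOCaptions(num_parallel_reads=2)
iterator = dataset.train_db.batch(1).make_one_shot_iterator()
word_ids, char_ids, caption_len, image = iterator.get_next()

with tf.Session() as sess:
    words, chars, length, img = sess.run([word_ids, char_ids, caption_len, image])
    print('caption word ids:', words.shape, 'image:', img.shape)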
Beispiel #9
0
    def __init__(self,
                 data_type="pointing",
                 num_parallel_reads: int = 1,
                 force_rebuild: bool = False,
                 ignore_hashes=False,
                 image_shape: Sequence[int] = [448, 448],
                 read_codes=False,
                 code_shape: Sequence[int] = [7, 7, 2048],
                 merge_qa=False) -> None:

        log_message("Building Dataset " + data_type)

        self.image_resize_shape = image_shape
        self.read_codes = read_codes
        self.code_shape = code_shape
        self.merge_qa = merge_qa
        self.image_root_path = DATA_STORE["visual7w/data/images"]
        # Get all of the necessary data
        self.images_key = maybe_download_and_store_zip(
            'http://vision.stanford.edu/yukezhu/visual7w_images.zip',
            'visual7w/data/images',
            use_subkeys=False)
        # Get the question/answer JSON data
        self.dataset_key = maybe_download_and_store_zip(
            "http://web.stanford.edu/~yukez/papers/resources/dataset_v7w_{0}.zip"
            .format(data_type),
            'visual7w/{0}/data/json'.format(data_type),
            use_subkeys=True)
        # Get the grounding data
        self.grounding_key = maybe_download_and_store_zip(
            "http://web.stanford.edu/~yukez/papers/resources/dataset_v7w_grounding_annotations.zip",
            "visual/data/grounding",
            use_subkeys=True)

        # Compute the size of the datasets
        self.num_train_examples = 0
        self.num_val_examples = 0
        self.num_test_examples = 0

        self.max_word_length = 44
        self.max_char_length = 26

        self.data_type = data_type

        root_key = "visual7w/{0}".format(data_type)
        dict_key = os.path.join(root_key, "dictionary")
        # Load the vocab files
        if not ignore_hashes and (force_rebuild
                                  or not DATA_STORE.is_valid(dict_key)):
            self.dictionary = NLPDictionary()
            need_rebuild_train = True
            need_rebuild_val = True
        else:
            self.dictionary = NLPDictionary().load(DATA_STORE[dict_key])

        self.train_fpath = os.path.join(root_key, 'tfrecord/train')
        self.val_fpath = os.path.join(root_key, 'tfrecord/val')
        self.test_fpath = os.path.join(root_key, 'tfrecord/test')

        if force_rebuild:
            # Now that we have the data, load and parse the JSON file
            file_ = DATA_STORE[self.dataset_key[0]]
            with open(file_, 'r') as ptr:
                self._json = json.load(ptr)
            self._build_images()
            self._build_dataset()
        else:
            # Compute the size of the datasets
            self.num_train_examples = sum(
                1 for _ in tf.python_io.tf_record_iterator(DATA_STORE[
                    os.path.join(self.train_fpath, "images")]))
            self.num_val_examples = sum(
                1 for _ in tf.python_io.tf_record_iterator(DATA_STORE[
                    os.path.join(self.val_fpath, "images")]))

        # Setup some default options for the dataset
        self._val_db = None
        self._train_db = None
        self._test_db = None
        self.num_parallel_reads = num_parallel_reads

        # Save the vocab
        if force_rebuild:
            self.dictionary.save(
                DATA_STORE.create_key(dict_key, 'dict.pkl', force=True))
            DATA_STORE.update_hash(dict_key)

        self.word_vocab_size = len(self.dictionary.word_dictionary)
        self.char_vocab_size = len(self.dictionary.char_dictionary)
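The constructor above either creates a fresh NLPDictionary (and forces the tfrecords to be rebuilt) or loads the previously saved one, a pattern every loader in this listing repeats. A generic standard-library sketch of that load-or-rebuild branch; the function name, cache path, and factory argument are illustrative, not part of the original code:

import os
import pickle


def load_or_create(cache_path, factory, force_rebuild=False):
    # Create a fresh object (signalling that dependent artifacts must be
    # rebuilt) when a rebuild is forced or no cached copy exists; otherwise
    # load the pickled copy that was saved on a previous run.
    if force_rebuild or not os.path.exists(cache_path):
        return factory(), True
    with open(cache_path, 'rb') as fp:
        return pickle.load(fp), False


vocab, needs_rebuild = load_or_create('/tmp/dict.pkl', dict)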
Beispiel #10
0
class Visual7W(object):
    """ One type of VQA Dataset
    http://web.stanford.edu/%7Eyukez/visual7w/
    """
    def __init__(self,
                 data_type="pointing",
                 num_parallel_reads: int = 1,
                 force_rebuild: bool = False,
                 ignore_hashes=False,
                 image_shape: Sequence[int] = [448, 448],
                 read_codes=False,
                 code_shape: Sequence[int] = [7, 7, 2048],
                 merge_qa=False) -> None:

        log_message("Building Dataset " + data_type)

        self.image_resize_shape = image_shape
        self.read_codes = read_codes
        self.code_shape = code_shape
        self.merge_qa = merge_qa
        self.image_root_path = DATA_STORE["visual7w/data/images"]
        # Get all of the necessary data
        self.images_key = maybe_download_and_store_zip(
            'http://vision.stanford.edu/yukezhu/visual7w_images.zip',
            'visual7w/data/images',
            use_subkeys=False)
        # Get the question/answer JSON data
        self.dataset_key = maybe_download_and_store_zip(
            "http://web.stanford.edu/~yukez/papers/resources/dataset_v7w_{0}.zip"
            .format(data_type),
            'visual7w/{0}/data/json'.format(data_type),
            use_subkeys=True)
        # Get the grounding data
        self.grounding_key = maybe_download_and_store_zip(
            "http://web.stanford.edu/~yukez/papers/resources/dataset_v7w_grounding_annotations.zip",
            "visual/data/grounding",
            use_subkeys=True)

        # Compute the size of the datasets
        self.num_train_examples = 0
        self.num_val_examples = 0
        self.num_test_examples = 0

        self.max_word_length = 44
        self.max_char_length = 26

        self.data_type = data_type

        root_key = "visual7w/{0}".format(data_type)
        dict_key = os.path.join(root_key, "dictionary")
        # Load the vocab files
        if not ignore_hashes and (force_rebuild
                                  or not DATA_STORE.is_valid(dict_key)):
            self.dictionary = NLPDictionary()
            need_rebuild_train = True
            need_rebuild_val = True
        else:
            self.dictionary = NLPDictionary().load(DATA_STORE[dict_key])

        self.train_fpath = os.path.join(root_key, 'tfrecord/train')
        self.val_fpath = os.path.join(root_key, 'tfrecord/val')
        self.test_fpath = os.path.join(root_key, 'tfrecord/test')

        if force_rebuild:
            # Now that we have the data, load and parse the JSON file
            file_ = DATA_STORE[self.dataset_key[0]]
            with open(file_, 'r') as ptr:
                self._json = json.load(ptr)
            self._build_images()
            self._build_dataset()
        else:
            # Compute the size of the datasets
            self.num_train_examples = sum(
                1 for _ in tf.python_io.tf_record_iterator(DATA_STORE[
                    os.path.join(self.train_fpath, "images")]))
            self.num_val_examples = sum(
                1 for _ in tf.python_io.tf_record_iterator(DATA_STORE[
                    os.path.join(self.val_fpath, "images")]))

        # Setup some default options for the dataset
        self._val_db = None
        self._train_db = None
        self._test_db = None
        self.num_parallel_reads = num_parallel_reads

        # Save the vocab
        if force_rebuild:
            self.dictionary.save(
                DATA_STORE.create_key(dict_key, 'dict.pkl', force=True))
            DATA_STORE.update_hash(dict_key)

        self.word_vocab_size = len(self.dictionary.word_dictionary)
        self.char_vocab_size = len(self.dictionary.char_dictionary)

    def get_boxes(self, box_id, boxes_dict):
        assert isinstance(box_id, int)
        curr_box = boxes_dict[box_id]
        assert box_id == curr_box['box_id']
        answer_name = curr_box['name']
        answer_loc = [
            curr_box['x'], curr_box['y'], curr_box['width'], curr_box['height']
        ]
        answer_dense, answer_len = self.dictionary.dense_parse(
            answer_name,
            word_padding=self.max_word_length,
            char_padding=self.max_char_length)
        return (answer_loc, answer_dense, answer_len)

    def _build_images(self) -> None:
        # Define the Record Root

        # Open the TFRecordWriter
        train_record_root = os.path.join(self.train_fpath, "images")
        val_record_root = os.path.join(self.val_fpath, "images")
        test_record_root = os.path.join(self.test_fpath, "images")

        # Construct the record writers
        train_record_writer = tf.python_io.TFRecordWriter(
            DATA_STORE.create_key(train_record_root,
                                  'data.tfrecords',
                                  force=True))
        val_record_writer = tf.python_io.TFRecordWriter(
            DATA_STORE.create_key(val_record_root,
                                  'data.tfrecords',
                                  force=True))
        test_record_writer = tf.python_io.TFRecordWriter(
            DATA_STORE.create_key(test_record_root,
                                  'data.tfrecords',
                                  force=True))

        # Loop over the data and parse
        errors = 0
        log_message('Building the image records...')

        images = self._json['images']

        total_num_examples = len(images)
        for idx, entry in tqdm.tqdm(enumerate(images),
                                    total=total_num_examples):
            # Load the image
            filename = entry['filename']
            image_path = os.path.join(self.image_root_path, "images", filename)
            assert os.path.exists(image_path)
            image = load_image(image_path)
            if image is None:
                errors += 1
                log_warning(
                    'Error loading image: {}. {} Errors so far.'.format(
                        image_path, errors))
                continue
            image_shape = list(image.shape)
            image = encode_jpeg(image)

            # Split the dataset
            split = entry["split"]
            if split == "val":
                tf_record_writer = val_record_writer
            elif split == "test":
                tf_record_writer = test_record_writer
            else:
                tf_record_writer = train_record_writer

            image_id = entry['image_id']

            feature = {
                'image_size': _int64_feature(image_shape),
                'image_id': _int64_feature([image_id]),
                'image': _bytes_feature(tf.compat.as_bytes(image)),
            }
            # Write the TF-Record
            example = tf.train.Example(features=tf.train.Features(
                feature=feature))

            tf_record_writer.write(example.SerializeToString())

        val_record_writer.close()
        train_record_writer.close()
        test_record_writer.close()
        DATA_STORE.update_hash(test_record_root)
        DATA_STORE.update_hash(train_record_root)
        DATA_STORE.update_hash(val_record_root)

    def _build_dataset(self) -> None:
        # Define the Record Root

        # Open the TFRecordWriter
        train_record_root = os.path.join(self.train_fpath, "data")
        val_record_root = os.path.join(self.val_fpath, "data")
        test_record_root = os.path.join(self.test_fpath, "data")

        # Construct the record writers
        train_record_writer = tf.python_io.TFRecordWriter(
            DATA_STORE.create_key(train_record_root,
                                  'data.tfrecords',
                                  force=True))
        val_record_writer = tf.python_io.TFRecordWriter(
            DATA_STORE.create_key(val_record_root,
                                  'data.tfrecords',
                                  force=True))
        test_record_writer = tf.python_io.TFRecordWriter(
            DATA_STORE.create_key(test_record_root,
                                  'data.tfrecords',
                                  force=True))

        # Loop over the data and parse
        errors = 0
        log_message('Building the dataset...')

        images = self._json['images']
        if self.data_type == "pointing":
            boxes = self._json['boxes']
            boxes_dict = {d["box_id"]: d for d in boxes}

        total_num_examples = len(images)
        for idx, entry in tqdm.tqdm(enumerate(images),
                                    total=total_num_examples):
            # Load the image

            # Split the dataset
            split = entry["split"]
            if split == "val":
                tf_record_writer = val_record_writer
                self.num_val_examples += 1
            elif split == "test":
                tf_record_writer = test_record_writer
                self.num_test_examples += 1
            else:
                tf_record_writer = train_record_writer
                self.num_train_examples += 1

            image_id = entry['image_id']
            qa_pairs = entry['qa_pairs']

            for qa in qa_pairs:
                question_raw = qa['question']
                question_type = qa['type']
                qa_id = qa['qa_id']
                mlt_choice = qa["multiple_choices"]
                answer = qa['answer']

                assert len(mlt_choice) == 3
                question_dense, question_len = self.dictionary.dense_parse(
                    question_raw,
                    word_padding=self.max_word_length,
                    char_padding=self.max_char_length)

                if self.data_type == "telling":
                    answer_dense, answer_len = self.dictionary.dense_parse(
                        answer,
                        word_padding=self.max_word_length,
                        char_padding=self.max_char_length)
                    m1_dense, m1_len = self.dictionary.dense_parse(
                        mlt_choice[0],
                        word_padding=self.max_word_length,
                        char_padding=self.max_char_length)
                    m2_dense, m2_len = self.dictionary.dense_parse(
                        mlt_choice[1],
                        word_padding=self.max_word_length,
                        char_padding=self.max_char_length)
                    m3_dense, m3_len = self.dictionary.dense_parse(
                        mlt_choice[2],
                        word_padding=self.max_word_length,
                        char_padding=self.max_char_length)

                    # Assemble the question, answer, and multiple-choice features
                    feature = {
                        'question_word_embedding':
                        _int64_feature(
                            np.ravel(question_dense[0]).astype(np.int64)),
                        'question_char_embedding':
                        _int64_feature(
                            np.ravel(question_dense[1]).astype(np.int64)),
                        'question_length':
                        _int64_feature([question_len]),
                        'ans_word_embedding':
                        _int64_feature(
                            np.ravel(answer_dense[0]).astype(np.int64)),
                        'ans_char_embedding':
                        _int64_feature(
                            np.ravel(answer_dense[1]).astype(np.int64)),
                        'ans_length':
                        _int64_feature([answer_len]),
                        'm1_embedding':
                        _int64_feature(np.ravel(m1_dense[0]).astype(np.int64)),
                        'm1_char_embedding':
                        _int64_feature(np.ravel(m1_dense[1]).astype(np.int64)),
                        'm2_embedding':
                        _int64_feature(np.ravel(m2_dense[0]).astype(np.int64)),
                        'm2_char_embedding':
                        _int64_feature(np.ravel(m2_dense[1]).astype(np.int64)),
                        'm3_embedding':
                        _int64_feature(np.ravel(m3_dense[0]).astype(np.int64)),
                        'm3_char_embedding':
                        _int64_feature(np.ravel(m3_dense[1]).astype(np.int64)),
                        'mc_len':
                        _int64_feature([m1_len, m2_len, m3_len]),
                        "q_type":
                        _bytes_feature(tf.compat.as_bytes(question_type)),
                        'qa_id':
                        _int64_feature([qa_id]),
                        'image_id':
                        _int64_feature([image_id]),
                    }

                else:
                    answer_loc, answer_dense, answer_len = self.get_boxes(
                        answer, boxes_dict)
                    m1_loc, m1_dense, m1_len = self.get_boxes(
                        mlt_choice[0], boxes_dict)
                    m2_loc, m2_dense, m2_len = self.get_boxes(
                        mlt_choice[1], boxes_dict)
                    m3_loc, m3_dense, m3_len = self.get_boxes(
                        mlt_choice[2], boxes_dict)
                    coord = answer_loc + m1_loc + m2_loc + m3_loc
                    # Assemble the question, answer, multiple-choice, and box coordinate features
                    feature = {
                        'question_word_embedding':
                        _int64_feature(
                            np.ravel(question_dense[0]).astype(np.int64)),
                        'question_char_embedding':
                        _int64_feature(
                            np.ravel(question_dense[1]).astype(np.int64)),
                        'question_length':
                        _int64_feature([question_len]),
                        'ans_word_embedding':
                        _int64_feature(
                            np.ravel(answer_dense[0]).astype(np.int64)),
                        'ans_char_embedding':
                        _int64_feature(
                            np.ravel(answer_dense[1]).astype(np.int64)),
                        'ans_length':
                        _int64_feature([answer_len]),
                        "coordinate":
                        _int64_feature(coord),
                        'm1_embedding':
                        _int64_feature(np.ravel(m1_dense[0]).astype(np.int64)),
                        'm1_char_embedding':
                        _int64_feature(np.ravel(m1_dense[1]).astype(np.int64)),
                        'm2_embedding':
                        _int64_feature(np.ravel(m2_dense[0]).astype(np.int64)),
                        'm2_char_embedding':
                        _int64_feature(np.ravel(m2_dense[1]).astype(np.int64)),
                        'm3_embedding':
                        _int64_feature(np.ravel(m3_dense[0]).astype(np.int64)),
                        'm3_char_embedding':
                        _int64_feature(np.ravel(m3_dense[1]).astype(np.int64)),
                        'mc_len':
                        _int64_feature([m1_len, m2_len, m3_len]),
                        'qa_id':
                        _int64_feature([qa_id]),
                        "q_type":
                        _bytes_feature(tf.compat.as_bytes(question_type)),
                        'image_id':
                        _int64_feature([image_id]),
                    }

                example = tf.train.Example(features=tf.train.Features(
                    feature=feature))

                tf_record_writer.write(example.SerializeToString())

        val_record_writer.close()
        train_record_writer.close()
        test_record_writer.close()
        DATA_STORE.update_hash(test_record_root)
        DATA_STORE.update_hash(train_record_root)
        DATA_STORE.update_hash(val_record_root)

    def _map_image_fn(self, serialized_example):

        feature_dict = {
            'image_size': tf.FixedLenFeature([3], tf.int64),
            'image_id': tf.FixedLenFeature([1], tf.int64),
            'image': tf.FixedLenFeature([], tf.string),
        }

        # if self.read_codes:
        #     feature_dict['image_code'] = tf.FixedLenFeature([self.code_shape[0] * self.code_shape[1] * self.code_shape[2]], tf.float32)

        features = tf.parse_single_example(serialized_example,
                                           features=feature_dict)

        image = tf.image.decode_jpeg(features['image'], channels=3)
        image = tf.cast(image, tf.float32) / 255.0

        if self.data_type == "telling":
            image = tf.image.resize_images(image, self.image_resize_shape)
            image.set_shape(
                (self.image_resize_shape[0], self.image_resize_shape[1], 3))

        return (image, features['image_size'], features['image_id'])

    def _map_dataset_fn(self, serialized_example):
        if self.data_type == "pointing":
            feature_dict = {
                'question_word_embedding':
                tf.FixedLenFeature([self.max_word_length], tf.int64),
                'question_char_embedding':
                tf.FixedLenFeature(
                    [self.max_word_length, self.max_char_length], tf.int64),
                "coordinate":
                tf.FixedLenFeature([16], tf.int64),
                'question_length':
                tf.FixedLenFeature([1], tf.int64),
                'ans_word_embedding':
                tf.FixedLenFeature([self.max_word_length], tf.int64),
                'ans_char_embedding':
                tf.FixedLenFeature(
                    [self.max_word_length, self.max_char_length], tf.int64),
                'ans_length':
                tf.FixedLenFeature([1], tf.int64),
                'm1_embedding':
                tf.FixedLenFeature([self.max_word_length], tf.int64),
                'm1_char_embedding':
                tf.FixedLenFeature(
                    [self.max_word_length, self.max_char_length], tf.int64),
                'm2_embedding':
                tf.FixedLenFeature([self.max_word_length], tf.int64),
                'm2_char_embedding':
                tf.FixedLenFeature(
                    [self.max_word_length, self.max_char_length], tf.int64),
                'm3_embedding':
                tf.FixedLenFeature([self.max_word_length], tf.int64),
                'm3_char_embedding':
                tf.FixedLenFeature(
                    [self.max_word_length, self.max_char_length], tf.int64),
                'mc_len':
                tf.FixedLenFeature([3], tf.int64),
                'qa_id':
                tf.FixedLenFeature([1], tf.int64),
                'image_id':
                tf.FixedLenFeature([1], tf.int64),
                "q_type":
                tf.FixedLenFeature([], tf.string)
            }
        else:
            feature_dict = {
                'question_word_embedding':
                tf.FixedLenFeature([self.max_word_length], tf.int64),
                'question_char_embedding':
                tf.FixedLenFeature(
                    [self.max_word_length, self.max_char_length], tf.int64),
                'question_length':
                tf.FixedLenFeature([1], tf.int64),
                'ans_word_embedding':
                tf.FixedLenFeature([self.max_word_length], tf.int64),
                'ans_char_embedding':
                tf.FixedLenFeature(
                    [self.max_word_length, self.max_char_length], tf.int64),
                'ans_length':
                tf.FixedLenFeature([1], tf.int64),
                'm1_embedding':
                tf.FixedLenFeature([self.max_word_length], tf.int64),
                'm1_char_embedding':
                tf.FixedLenFeature(
                    [self.max_word_length, self.max_char_length], tf.int64),
                'm2_embedding':
                tf.FixedLenFeature([self.max_word_length], tf.int64),
                'm2_char_embedding':
                tf.FixedLenFeature(
                    [self.max_word_length, self.max_char_length], tf.int64),
                'm3_embedding':
                tf.FixedLenFeature([self.max_word_length], tf.int64),
                'm3_char_embedding':
                tf.FixedLenFeature(
                    [self.max_word_length, self.max_char_length], tf.int64),
                'mc_len':
                tf.FixedLenFeature([3], tf.int64),
                'qa_id':
                tf.FixedLenFeature([1], tf.int64),
                'image_id':
                tf.FixedLenFeature([1], tf.int64),
                "q_type":
                tf.FixedLenFeature([], tf.string)
            }

        features = tf.parse_single_example(serialized_example,
                                           features=feature_dict)

        if self.data_type == "pointing":
            return (features['question_word_embedding'],
                    features['question_char_embedding'],
                    features['question_length'],
                    features['ans_word_embedding'],
                    features['ans_char_embedding'], features['ans_length'],
                    features['m1_embedding'], features['m1_char_embedding'],
                    features['m2_embedding'], features['m2_char_embedding'],
                    features['m3_embedding'], features['m3_char_embedding'],
                    features['mc_len'], features['coordinate'],
                    features['qa_id'], features['q_type'],
                    features['image_id'])
        else:
            return (features['question_word_embedding'],
                    features['question_char_embedding'],
                    features['question_length'],
                    features['ans_word_embedding'],
                    features['ans_char_embedding'], features['ans_length'],
                    features['m1_embedding'], features['m1_char_embedding'],
                    features['m2_embedding'], features['m2_char_embedding'],
                    features['m3_embedding'], features['m3_char_embedding'],
                    features['mc_len'], features['qa_id'], features['q_type'],
                    features['image_id'])

    @property
    def train_db(self):
        data_path = DATA_STORE[os.path.join(self.train_fpath, "data")]
        image_path = DATA_STORE[os.path.join(self.train_fpath, "images")]
        if self._train_db is None:
            self._train_db = (tf.data.TFRecordDataset(image_path).map(
                self._map_image_fn,
                num_parallel_calls=self.num_parallel_reads),
                              tf.data.TFRecordDataset(data_path).map(
                                  self._map_dataset_fn,
                                  num_parallel_calls=self.num_parallel_reads))
        return self._train_db

    @property
    def val_db(self):
        data_path = DATA_STORE[os.path.join(self.val_fpath, "data")]
        image_path = DATA_STORE[os.path.join(self.val_fpath, "images")]
        if self._val_db is None:
            self._val_db = (tf.data.TFRecordDataset(image_path).map(
                self._map_image_fn,
                num_parallel_calls=self.num_parallel_reads),
                            tf.data.TFRecordDataset(data_path).map(
                                self._map_dataset_fn,
                                num_parallel_calls=self.num_parallel_reads))
        return self._val_db

    @property
    def test_db(self):
        data_path = DATA_STORE[os.path.join(self.test_fpath, "data")]
        image_path = DATA_STORE[os.path.join(self.test_fpath, "images")]
        if self._test_db is None:
            self._test_db = (tf.data.TFRecordDataset(image_path).map(
                self._map_image_fn,
                num_parallel_calls=self.num_parallel_reads),
                             tf.data.TFRecordDataset(data_path).map(
                                 self._map_dataset_fn,
                                 num_parallel_calls=self.num_parallel_reads))
        return self._test_db

    def info(self) -> str:
        return (tabulate([['Num Train Examples', self.num_train_examples],
                          ['Num Val Examples', self.num_val_examples],
                          ['Word Vocab Size', self.word_vocab_size],
                          ['Char Vocab Size', self.char_vocab_size]]))
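The `encode_jpeg` helper called in `_build_images` (and again in the VQA loader below) is not defined in this listing; its usage implies it converts an HxWx3 uint8 array into JPEG bytes that `_map_image_fn` later decodes with `tf.image.decode_jpeg`. A minimal stand-in, assuming Pillow as the encoder (the original may use a different one):

import io

import numpy as np
from PIL import Image  # assumption: Pillow; the original helper is not shown


def encode_jpeg(image):
    # Turn an HxWx3 uint8 array into JPEG-encoded bytes, or pass None through
    # so the caller's error check still works.
    if image is None:
        return None
    buffer = io.BytesIO()
    Image.fromarray(np.asarray(image, dtype=np.uint8)).save(buffer, format='JPEG')
    return buffer.getvalue()

Note also that `train_db`, `val_db`, and `test_db` each return a pair of datasets (decoded images and question/answer records); both record types carry an `image_id` feature, so joining them on that field is left to the consumer.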
Beispiel #11
0
    def __init__(self, version='0.3', num_parallel_reads: Optional[int]=None,
                 force_rebuild: bool=False, mask: bool=False,
                 add_start_tokens: bool=False, add_stop_tokens: bool=False,
                 use_qam: bool=False) -> None:

        self.version = version
        self.num_parallel_reads = num_parallel_reads
        self.mask = mask
        self.add_start_tokens = add_start_tokens
        self.add_stop_tokens = add_stop_tokens
        self.use_qam = use_qam

        # We keep one copy of masked data, and one copy of unmasked data
        if self.mask:
            self.stem = 'newslens/masked/'
        else:
            self.stem = 'newslens/'

        # We don't use the stem here, because the json files are the same
        if self.version == '0.1':
            # Download the training data
            self.json_key = maybe_download_and_store_single_file(
                url='https://newslens.berkeley.edu/QA_dataset0.1.json', key='newslens/json_0.1')

            self.mwl = 766
            self.mcl = 37
            self.mql = 766

        elif self.version == '0.2':
            # Download the training data
            self.json_key = maybe_download_and_store_single_file(
                url='https://newslens.berkeley.edu/QA_dataset0.2.json', key='newslens/json_0.2')
            self.mwl = 595
            self.mcl = 16
            self.mql = 766

        elif self.version == '0.3':
            # Download the training data
            self.json_key = maybe_download_and_store_single_file(
                url='https://newslens.berkeley.edu/QA_dataset0.3.json', key='newslens/json_0.3')
            self.mwl = 600
            self.mcl = 16
            self.mql = 20
        else:
            raise ValueError("Invalid version for NLQA dataset")

        # Read the JSON
        with open(DATA_STORE[self.json_key], 'r') as json_file:
            self.json = json.loads(json_file.read())

        # Parse the JSON
        if not force_rebuild and DATA_STORE.is_valid(self.stem + 'dictionary_{}'.format(self.version)):
            with open(DATA_STORE[self.stem + 'dictionary_{}'.format(self.version)], 'rb') as pkl_file:
                self.dictionary = pickle.load(pkl_file)
        else:
            self.dictionary = NLPDictionary(tokenizer='space', dtype=np.int32)

        # If the tf-records don't exist, build them
        if force_rebuild or not DATA_STORE.is_valid(self.stem + 'tfrecord/train/data_{}'.format(self.version)) or not DATA_STORE.is_valid(self.stem + 'tfrecord/val/data_{}'.format(self.version)):
            log_message('Building dataset...')

            # Create the tf-record writer
            train_record_writer = tf.python_io.TFRecordWriter(
                DATA_STORE.create_key(self.stem + 'tfrecord/train/data_{}'.format(self.version), 'data.tfrecords', force=force_rebuild))
            val_record_writer = tf.python_io.TFRecordWriter(
                DATA_STORE.create_key(self.stem + 'tfrecord/val/data_{}'.format(self.version), 'data.tfrecords', force=force_rebuild))

            # Parse the data into tf-records
            for record in tqdm.tqdm(self.json):
        
                # Handle start and stop tokens on the answer
                if self.add_stop_tokens:
                    if self.mask:
                        answer_text = record['masked_answer'].strip() + ' <STOP>'
                    else:
                        answer_text = record['real_answer'].strip() + ' <STOP>'
                else:
                    if self.mask:
                        answer_text = record['masked_answer']
                    else:
                        answer_text = record['real_answer']

                if self.add_start_tokens:
                    answer_text = '<START> ' + answer_text
                if not self.add_stop_tokens:
                    # The answer has no stop token yet, so append one (space-separated) to the joint question/answer string
                    question_answer_dense, qa_len = self.dictionary.dense_parse(record['question'].strip() + ' ' + answer_text.strip() + ' <STOP>', word_padding=self.mwl, char_padding=self.mcl)
                else:
                    question_answer_dense, qa_len = self.dictionary.dense_parse(record['question'].strip() + ' ' + answer_text.strip(), word_padding=self.mwl, char_padding=self.mcl)

                if self.mask:
                    tokens = record['masked_document'].split(' ')
                    context_dense, context_len = self.dictionary.dense_parse(record['masked_document'], word_padding=self.mwl, char_padding=self.mcl)
                    label = record['masked_answer'].split(' ')
                else:
                    tokens = record['unmasked_document'].split(' ')
                    context_dense, context_len = self.dictionary.dense_parse(record['unmasked_document'], word_padding=self.mwl, char_padding=self.mcl)
                    label = record['real_answer'].split(' ')

                answer_dense, answer_len = self.dictionary.dense_parse(answer_text, word_padding=self.mql, char_padding=self.mcl)

                question_dense, question_len = self.dictionary.dense_parse(record['question'], word_padding=self.mql, char_padding=self.mcl)

                # Find candidate (start, end) token indices of the answer span within the document
                potential_starts = [x for x in range(len(tokens)) if tokens[x] == label[0]]
                label_index_start: List[int] = []
                label_index_end: List[int] = []
                for i in potential_starts:
                    idx = [x for x in range(
                        i, len(tokens)) if tokens[x] == label[-1]]
                    if len(idx) > 0:
                        label_index_start.append(i)
                        label_index_end.append(idx[0])
                label_indices = zip(label_index_start, label_index_end)

                # Hold out roughly 5% of the records for validation
                val = np.random.random() >= 0.95

                for l_ind in label_indices:

                    # Built the dataset/tf-records
                    feature_dict = {}
                    feature_dict['context_word_embedding'] = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=np.ravel(context_dense[0])))
                    feature_dict['context_char_embedding'] = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=np.ravel(context_dense[1])))
                    feature_dict['question_word_embedding'] = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=np.ravel(question_dense[0])))
                    feature_dict['question_char_embedding'] = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=np.ravel(question_dense[1])))
                    feature_dict['answer_word_embedding'] = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=np.ravel(answer_dense[0])))
                    feature_dict['question_answer_word_embedding'] = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=np.ravel(question_answer_dense[0])))
                    feature_dict['word_maxlen'] = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[self.mwl]))
                    feature_dict['char_maxlen'] = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[self.mcl]))
                    feature_dict['token_label_start'] = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[l_ind[0]]))
                    feature_dict['token_label_end'] = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[l_ind[1]]))
                    feature_dict['context_word_len'] = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[context_len]))
                    feature_dict['question_word_len'] = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[question_len]))
                    feature_dict['question_answer_word_len'] = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[qa_len]))
                    feature_dict['answer_word_len'] = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[answer_len]))

                    example = tf.train.Example(
                        features=tf.train.Features(feature=feature_dict))

                    if val:
                        val_record_writer.write(
                            example.SerializeToString())
                    else:
                        train_record_writer.write(
                            example.SerializeToString())

            train_record_writer.close()
            val_record_writer.close()
            DATA_STORE.update_hash(
                self.stem + 'tfrecord/train/data_{}'.format(self.version))
            DATA_STORE.update_hash(
                self.stem + 'tfrecord/val/data_{}'.format(self.version))

        # Save the dictionary
        with open(DATA_STORE.create_key(self.stem + 'dictionary_{}'.format(self.version), 'dict.pkl', force=True), 'wb') as pkl_file:
            pickle.dump(self.dictionary, pkl_file)
            DATA_STORE.update_hash(
                self.stem + 'dictionary_{}'.format(self.version))

        # Compute the number of examples in the train and validation tfrecords
        self.num_val_examples = sum(
            1 for _ in tf.python_io.tf_record_iterator(DATA_STORE[self.stem + 'tfrecord/val/data_{}'.format(self.version)]))
        self.num_train_examples = sum(
            1 for _ in tf.python_io.tf_record_iterator(DATA_STORE[self.stem + 'tfrecord/train/data_{}'.format(self.version)]))

        self.word_vocab_size = len(self.dictionary.word_dictionary)
        self.char_vocab_size = len(self.dictionary.char_dictionary)

        self._dev_db = None
        self._train_db = None
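The span search inside the constructor above (candidate starts are token positions matching the first answer token; each is paired with the first later position matching the last answer token) is easier to see in isolation. A self-contained sketch of the same logic; the function name and sample sentence are illustrative:

def find_label_spans(tokens, label):
    # Candidate starts: positions whose token equals the first label token.
    # For each, the end is the first position at or after it whose token
    # equals the last label token.
    starts, ends = [], []
    for i in [x for x in range(len(tokens)) if tokens[x] == label[0]]:
        later = [x for x in range(i, len(tokens)) if tokens[x] == label[-1]]
        if later:
            starts.append(i)
            ends.append(later[0])
    return list(zip(starts, ends))


print(find_label_spans('the cat sat on the mat'.split(), 'the mat'.split()))
# [(0, 5), (4, 5)]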
Beispiel #12
0
    def __init__(self,
                 num_parallel_reads: int = 1,
                 force_rebuild: bool = False,
                 ignore_hashes=False,
                 image_shape: Sequence[int] = [224, 224],
                 read_codes=False,
                 code_shape: Sequence[int] = [7, 7, 2048],
                 merge_qa=False) -> None:

        self.image_resize_shape = image_shape
        self.read_codes = read_codes
        self.code_shape = code_shape
        self.merge_qa = merge_qa

        # Get all of the necessary data
        self.train_a_json_key = maybe_download_and_store_zip(
            'http://visualqa.org/data/mscoco/vqa/v2_Annotations_Train_mscoco.zip',
            'coco2014/data/train/annotations')[0]
        self.val_a_json_key = maybe_download_and_store_zip(
            'http://visualqa.org/data/mscoco/vqa/v2_Annotations_Val_mscoco.zip',
            'coco2014/data/val/annotations')[0]
        self.train_q_json_key = maybe_download_and_store_zip(
            'http://visualqa.org/data/mscoco/vqa/v2_Questions_Train_mscoco.zip',
            'coco2014/data/train/questions')[0]
        self.val_q_json_key = maybe_download_and_store_zip(
            'http://visualqa.org/data/mscoco/vqa/v2_Questions_Val_mscoco.zip',
            'coco2014/data/val/questions')[0]
        maybe_download_and_store_zip(
            'http://images.cocodataset.org/zips/train2014.zip',
            'coco2014/data/train/images',
            use_subkeys=False)
        maybe_download_and_store_zip(
            'http://images.cocodataset.org/zips/val2014.zip',
            'coco2014/data/val/images',
            use_subkeys=False)

        # Known sizes of the datasets (hard-coded)
        self.num_train_examples = 443757
        self.num_val_examples = 214654
        self.num_classes = 29332

        # Now that we have the data, load and parse the JSON files
        need_rebuild_train = force_rebuild
        if not ignore_hashes and (
                need_rebuild_train
                or not DATA_STORE.is_valid('vqa/tfrecord/train')):
            log_message(
                'Need to rebuild training data. Loading JSON annotations.')
            need_rebuild_train = True
            with open(DATA_STORE[self.train_a_json_key],
                      'r') as annotation_file:
                self.train_a_json = json.loads(annotation_file.read())
            with open(DATA_STORE[self.train_q_json_key],
                      'r') as annotation_file:
                self.train_q_json = json.loads(annotation_file.read())

        need_rebuild_val = force_rebuild
        if not ignore_hashes and (need_rebuild_val or
                                  not DATA_STORE.is_valid('vqa/tfrecord/val')):
            log_message(
                'Need to rebuild validation data. Loading JSON annotations.')
            need_rebuild_val = True
            with open(DATA_STORE[self.val_a_json_key], 'r') as annotation_file:
                self.val_a_json = json.loads(annotation_file.read())
            with open(DATA_STORE[self.val_q_json_key], 'r') as annotation_file:
                self.val_q_json = json.loads(annotation_file.read())

        # Load the vocab files
        if not ignore_hashes and (force_rebuild or
                                  not DATA_STORE.is_valid('vqa/dictionary')):
            self.dictionary = NLPDictionary()
            need_rebuild_train = True
            need_rebuild_val = True
        else:
            with open(DATA_STORE['vqa/dictionary'], 'rb') as dict_file:
                self.dictionary = pickle.load(dict_file)

        if not ignore_hashes and (force_rebuild
                                  or not DATA_STORE.is_valid('vqa/class_map')):
            self.class_map: Dict[str, int] = {}
            need_rebuild_train = True
            need_rebuild_val = True
        else:
            with open(DATA_STORE['vqa/class_map'], 'rb') as class_map_file:
                self.class_map = pickle.load(class_map_file)

        # Setup some default options for the dataset
        self.max_word_length = 50
        self.max_char_length = 16
        self._val_db = None
        self._train_db = None
        self.num_parallel_reads = num_parallel_reads
        # Build the tfrecord dataset from the JSON
        if need_rebuild_train:
            self._build_dataset('train')
        if need_rebuild_val:
            self._build_dataset('val')

        self.train_fpath = DATA_STORE['vqa/tfrecord/train']
        self.val_fpath = DATA_STORE['vqa/tfrecord/val']

        # Save the vocab
        with open(
                DATA_STORE.create_key('vqa/dictionary', 'dict.pkl',
                                      force=True), 'wb') as pkl_file:
            pickle.dump(self.dictionary, pkl_file)
            DATA_STORE.update_hash('vqa/dictionary')
        with open(
                DATA_STORE.create_key('vqa/class_map',
                                      'class_map.pkl',
                                      force=True), 'wb') as pkl_file:
            pickle.dump(self.class_map, pkl_file)
            DATA_STORE.update_hash('vqa/class_map')

        self.word_vocab_size = len(self.dictionary.word_dictionary)
        self.char_vocab_size = len(self.dictionary.char_dictionary)
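The `class_map` loaded or created above is filled in lazily while the tfrecords are written (see `_build_dataset` below): each distinct `multiple_choice_answer` string gets the next integer id the first time it appears. A small self-contained sketch of that mapping; the function name and sample answers are illustrative:

def answer_to_class(answer, class_map):
    # Assign the next free integer id on first sight, then reuse it.
    if answer not in class_map:
        class_map[answer] = len(class_map)
    return class_map[answer]


class_map = {}
for ans in ['yes', 'no', 'yes', '2', 'blue']:
    print(ans, answer_to_class(ans, class_map))
# yes 0, no 1, yes 0, 2 2, blue 3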
Beispiel #13
0
class VQA(Dataset):
    """ VQA Caption Dataset Downloader
    """
    def __init__(self,
                 num_parallel_reads: int = 1,
                 force_rebuild: bool = False,
                 ignore_hashes=False,
                 image_shape: Sequence[int] = [224, 224],
                 read_codes=False,
                 code_shape: Sequence[int] = [7, 7, 2048],
                 merge_qa=False) -> None:

        self.image_resize_shape = image_shape
        self.read_codes = read_codes
        self.code_shape = code_shape
        self.merge_qa = merge_qa

        # Get all of the necessary data
        self.train_a_json_key = maybe_download_and_store_zip(
            'http://visualqa.org/data/mscoco/vqa/v2_Annotations_Train_mscoco.zip',
            'coco2014/data/train/annotations')[0]
        self.val_a_json_key = maybe_download_and_store_zip(
            'http://visualqa.org/data/mscoco/vqa/v2_Annotations_Val_mscoco.zip',
            'coco2014/data/val/annotations')[0]
        self.train_q_json_key = maybe_download_and_store_zip(
            'http://visualqa.org/data/mscoco/vqa/v2_Questions_Train_mscoco.zip',
            'coco2014/data/train/questions')[0]
        self.val_q_json_key = maybe_download_and_store_zip(
            'http://visualqa.org/data/mscoco/vqa/v2_Questions_Val_mscoco.zip',
            'coco2014/data/val/questions')[0]
        maybe_download_and_store_zip(
            'http://images.cocodataset.org/zips/train2014.zip',
            'coco2014/data/train/images',
            use_subkeys=False)
        maybe_download_and_store_zip(
            'http://images.cocodataset.org/zips/val2014.zip',
            'coco2014/data/val/images',
            use_subkeys=False)

        # Known sizes of the datasets (hard-coded)
        self.num_train_examples = 443757
        self.num_val_examples = 214654
        self.num_classes = 29332

        # Now that we have the data, load and parse the JSON files
        need_rebuild_train = force_rebuild
        if not ignore_hashes and (
                need_rebuild_train
                or not DATA_STORE.is_valid('vqa/tfrecord/train')):
            log_message(
                'Need to rebuild training data. Loading JSON annotations.')
            need_rebuild_train = True
            with open(DATA_STORE[self.train_a_json_key],
                      'r') as annotation_file:
                self.train_a_json = json.loads(annotation_file.read())
            with open(DATA_STORE[self.train_q_json_key],
                      'r') as annotation_file:
                self.train_q_json = json.loads(annotation_file.read())

        need_rebuild_val = force_rebuild
        if not ignore_hashes and (need_rebuild_val or
                                  not DATA_STORE.is_valid('vqa/tfrecord/val')):
            log_message(
                'Need to rebuild validation data. Loading JSON annotations.')
            need_rebuild_val = True
            with open(DATA_STORE[self.val_a_json_key], 'r') as annotation_file:
                self.val_a_json = json.loads(annotation_file.read())
            with open(DATA_STORE[self.val_q_json_key], 'r') as annotation_file:
                self.val_q_json = json.loads(annotation_file.read())

        # Load the vocab files
        if not ignore_hashes and (force_rebuild or
                                  not DATA_STORE.is_valid('vqa/dictionary')):
            self.dictionary = NLPDictionary()
            need_rebuild_train = True
            need_rebuild_val = True
        else:
            with open(DATA_STORE['vqa/dictionary'], 'rb') as dict_file:
                self.dictionary = pickle.load(dict_file)

        if not ignore_hashes and (force_rebuild
                                  or not DATA_STORE.is_valid('vqa/class_map')):
            self.class_map: Dict[str, int] = {}
            need_rebuild_train = True
            need_rebuild_val = True
        else:
            with open(DATA_STORE['vqa/class_map'], 'rb') as class_map_file:
                self.class_map = pickle.load(class_map_file)

        # Setup some default options for the dataset
        self.max_word_length = 50
        self.max_char_length = 16
        self._val_db = None
        self._train_db = None
        self.num_parallel_reads = num_parallel_reads
        # Build the tfrecord dataset from the JSON
        if need_rebuild_train:
            self._build_dataset('train')
        if need_rebuild_val:
            self._build_dataset('val')

        self.train_fpath = DATA_STORE['vqa/tfrecord/train']
        self.val_fpath = DATA_STORE['vqa/tfrecord/val']

        # Save the vocab
        with open(
                DATA_STORE.create_key('vqa/dictionary', 'dict.pkl',
                                      force=True), 'wb') as pkl_file:
            pickle.dump(self.dictionary, pkl_file)
            DATA_STORE.update_hash('vqa/dictionary')
        with open(
                DATA_STORE.create_key('vqa/class_map',
                                      'class_map.pkl',
                                      force=True), 'wb') as pkl_file:
            pickle.dump(self.class_map, pkl_file)
            DATA_STORE.update_hash('vqa/class_map')

        self.word_vocab_size = len(self.dictionary.word_dictionary)
        self.char_vocab_size = len(self.dictionary.char_dictionary)

    def _build_dataset(self, dataset: str) -> None:

        # Open the TFRecordWriter
        if dataset == 'train':
            record_root = 'vqa/tfrecord/train'
            json_a = self.train_a_json
            json_q = self.train_q_json
            root_fpath = DATA_STORE['coco2014/data/train/images']
            example_numbers = self.num_train_examples
        else:
            record_root = 'vqa/tfrecord/val'
            json_a = self.val_a_json
            json_q = self.val_q_json
            root_fpath = DATA_STORE['coco2014/data/val/images']
            example_numbers = self.num_val_examples

        # Construct the record writer
        tf_record_writer = tf.python_io.TFRecordWriter(
            DATA_STORE.create_key(record_root, 'data.tfrecords', force=True))

        # Loop over the data and parse
        errors = 0
        log_message('Building {} dataset...'.format(dataset))
        for idx, entry in tqdm.tqdm(enumerate(json_q['questions']),
                                    total=example_numbers):
            # Load the image
            image = load_image(
                build_fpath_from_image_id(root_fpath, entry['image_id'],
                                          dataset))
            if image is None:
                errors += 1
                log_warning(
                    'Error loading image: {}. {} Errors so far.'.format(
                        build_fpath_from_image_id(root_fpath,
                                                  entry['image_id'], dataset),
                        errors))
                continue
            image = encode_jpeg(image)

            # Parse the caption
            assert entry['question_id'] == json_a['annotations'][idx][
                'question_id']
            question_raw = entry['question']
            question_dense, question_len = self.dictionary.dense_parse(
                question_raw,
                word_padding=self.max_word_length,
                char_padding=self.max_char_length)
            answer_raw = json_a['annotations'][idx]['multiple_choice_answer']
            answer_dense, answer_len = self.dictionary.dense_parse(
                answer_raw,
                word_padding=self.max_word_length,
                char_padding=self.max_char_length)

            # Add the class mapping
            if answer_raw not in self.class_map:
                self.class_map[answer_raw] = len(self.class_map)
            answer_class = self.class_map[answer_raw]

            # Add the question, answer, and image data
            feature = {
                'question_word_embedding':
                _int64_feature(np.ravel(question_dense[0]).astype(np.int64)),
                'question_char_embedding':
                _int64_feature(np.ravel(question_dense[1]).astype(np.int64)),
                'question_length':
                _int64_feature([question_len]),
                'answer_word_embedding':
                _int64_feature(np.ravel(answer_dense[0]).astype(np.int64)),
                'answer_char_embedding':
                _int64_feature(np.ravel(answer_dense[1]).astype(np.int64)),
                'answer_length':
                _int64_feature([answer_len]),
                'answer_class':
                _int64_feature([answer_class]),
                'image':
                _bytes_feature(tf.compat.as_bytes(image)),
            }

            # Write the TF-Record
            example = tf.train.Example(features=tf.train.Features(
                feature=feature))
            tf_record_writer.write(example.SerializeToString())
        tf_record_writer.close()
        DATA_STORE.update_hash(record_root)
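
    # The _int64_feature and _bytes_feature calls above rely on module-level
    # helpers defined elsewhere in this file. As a hedged, illustrative sketch
    # (not the original definitions), they presumably wrap raw values in
    # tf.train.Feature protos much like the static methods below.
    @staticmethod
    def _int64_feature_sketch(values):
        # Wrap an iterable of ints (e.g. a flattened numpy int64 array) in a
        # tf.train.Feature holding an Int64List.
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

    @staticmethod
    def _bytes_feature_sketch(value):
        # Wrap a single bytes object (e.g. an encoded JPEG) in a
        # tf.train.Feature holding a BytesList.
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))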

    def _map_fn(self, serialized_example):

        # Feature spec for parsing a single serialized example from the TF-Record file
        feature_dict = {
            'question_word_embedding':
            tf.FixedLenFeature([self.max_word_length], tf.int64),
            'question_char_embedding':
            tf.FixedLenFeature([self.max_word_length, self.max_char_length],
                               tf.int64),
            'question_length':
            tf.FixedLenFeature([1], tf.int64),
            'answer_word_embedding':
            tf.FixedLenFeature([self.max_word_length], tf.int64),
            'answer_char_embedding':
            tf.FixedLenFeature([self.max_word_length, self.max_char_length],
                               tf.int64),
            'answer_length':
            tf.FixedLenFeature([1], tf.int64),
            'answer_class':
            tf.FixedLenFeature([1], tf.int64),
            'image':
            tf.FixedLenFeature([], tf.string),
        }
        if self.read_codes:
            feature_dict['image_code'] = tf.FixedLenFeature(
                [self.code_shape[0] * self.code_shape[1] * self.code_shape[2]],
                tf.float32)

        features = tf.parse_single_example(serialized_example,
                                           features=feature_dict)

        image = tf.image.decode_jpeg(features['image'], channels=3)
        image = tf.cast(image, tf.float32) / 255.0
        image = tf.image.resize_images(image, self.image_resize_shape)
        image.set_shape(
            (self.image_resize_shape[0], self.image_resize_shape[1], 3))

        if self.merge_qa:
            sliced_answer = tf.slice(
                features['answer_word_embedding'], [0],
                tf.cast(features['answer_length'], tf.int32))
            sliced_question = tf.slice(
                features['question_word_embedding'], [0],
                tf.cast(features['question_length'], tf.int32))
            merged_qa = tf.concat([sliced_question, sliced_answer], axis=0)

            if self.read_codes:
                image_codes = tf.reshape(features['image_code'],
                                         self.code_shape)
                return (merged_qa, features['question_length'],
                        features['answer_length'], image, image_codes)
            return (merged_qa, features['question_length'],
                    features['answer_length'], image)

        if self.read_codes:
            image_codes = features['image_code']
            image_codes = tf.reshape(image_codes, self.code_shape)

            # Return every parsed feature, including the reshaped image codes
            return (features['question_word_embedding'],
                    features['question_char_embedding'],
                    features['question_length'],
                    features['answer_word_embedding'],
                    features['answer_char_embedding'],
                    features['answer_length'], image, features['answer_class'],
                    image_codes)
        else:
            # Return every parsed feature without image codes
            return (features['question_word_embedding'],
                    features['question_char_embedding'],
                    features['question_length'],
                    features['answer_word_embedding'],
                    features['answer_char_embedding'],
                    features['answer_length'], image, features['answer_class'])
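
    def _padded_merged_batch_sketch(self, batch_size=16):
        # Hedged sketch, not part of the original class: when merge_qa is
        # enabled, the concatenated question+answer sequence returned by
        # _map_fn has a dynamic length, so plain batch() fails. Assuming
        # read_codes is False, padded_batch pads the merged sequence to the
        # longest element in each batch while the other tensors keep their
        # static shapes.
        return self.train_db.padded_batch(
            batch_size,
            padded_shapes=([None], [1], [1],
                           [self.image_resize_shape[0],
                            self.image_resize_shape[1], 3]))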

    @property
    def train_db(self):
        if self._train_db is None:
            self._train_db = tf.data.TFRecordDataset(self.train_fpath).map(
                self._map_fn, num_parallel_calls=self.num_parallel_reads)
        return self._train_db

    @property
    def val_db(self):
        if self._val_db is None:
            self._val_db = tf.data.TFRecordDataset(self.val_fpath).map(
                self._map_fn, num_parallel_calls=self.num_parallel_reads)
        return self._val_db
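
    def _batched_train_db_sketch(self, batch_size=32, shuffle_buffer=1024):
        # Hedged convenience sketch, not part of the original API: shuffle,
        # batch, and prefetch the training split. Assumes merge_qa is False so
        # every tensor returned by _map_fn has a static shape and plain
        # batch() is sufficient.
        return (self.train_db
                .shuffle(shuffle_buffer)
                .batch(batch_size)
                .prefetch(1))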

    @property
    def data_format(self):
        if self.sample_triple is not None:
            return self.sample_triple
        else:
            raise NotImplementedError(
                "Sample doesn't exist. The dataset may not have been built.")

    def info(self) -> str:
        return tabulate([['Num Train Examples', self.num_train_examples],
                         ['Num Val Examples', self.num_val_examples],
                         ['Word Vocab Size', self.word_vocab_size],
                         ['Char Vocab Size', self.char_vocab_size]])
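
    def _peek_val_example_sketch(self):
        # Hedged usage sketch, not part of the original API: pull one parsed
        # validation example through a TF 1.x one-shot iterator to inspect the
        # tuple structure produced by _map_fn.
        iterator = self.val_db.make_one_shot_iterator()
        next_element = iterator.get_next()
        with tf.Session() as sess:
            return sess.run(next_element)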