def untar(path: str) -> str:
    """Extract a ``tar.gz`` or ``tar`` archive into a folder beside it.

    Arguments:
        path {str} -- The archive to extract.

    Returns:
        str -- The directory the archive contents were extracted into.

    Raises:
        ValueError -- If ``path`` ends with neither ``tar.gz`` nor ``tar``.
    """
    if path.endswith("tar.gz"):
        mode = "r"  # tarfile's default: transparent compression detection
    elif path.endswith("tar"):
        mode = "r:"  # plain, uncompressed tar only
    else:
        raise ValueError('Not a .tar.gz or .tar file: {}'.format(path))

    log_message('Decompressing: {}'.format(path))
    parent, _, archive_name = path.rpartition('/')
    # NOTE(review): seven characters (the length of ".tar.gz") are stripped for
    # plain ".tar" archives as well, so "x-2012.tar" extracts into "x-2".
    # This is kept deliberately: the Pascal VOC loader in this file hard-codes
    # the resulting folder name ("VOCtrainval_11-May-2"). Change both together.
    output_fpath = os.path.join(parent, archive_name[:-7])
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(path, mode) as tar:
        tar.extractall(path=output_fpath)
    return output_fpath
def mv_r(src: str, dst: str, overwrite: bool = False) -> None:
    """Recursively copy the directory tree ``src`` to ``dst``.

    Despite the name, the source tree is copied (``shutil.copytree``) and is
    not removed afterwards.

    Arguments:
        src {str} -- Directory to copy from.
        dst {str} -- Directory to copy to.
        overwrite {bool} -- Remove an existing ``dst`` before copying.
    """
    if overwrite:
        if os.path.exists(dst):
            shutil.rmtree(dst)

    try:
        shutil.copytree(src, dst)
    except OSError as copy_error:
        # Best effort: a failed copy is logged rather than raised.
        log_message(str(copy_error))
    return
def __init__(self, one_hot: bool = False, force_rebuild: bool = False, nohashcheck: bool = True) -> None:
    """Fetch Fashion-MNIST and load it, decoding the raw files only if needed.

    The four archives are downloaded through
    ``maybe_download_and_store_single_file`` and their headers are checked.
    Decoded arrays are cached as a pickle under ``fashion_mnist/pickle``; the
    cache is reused unless it is invalid or ``force_rebuild`` is set.

    Arguments:
        one_hot {bool} -- Forwarded to ``build_dataset`` when decoding.
        force_rebuild {bool} -- Decode again even if the cache is valid.
        nohashcheck {bool} -- Accepted but not read by this method.
    """
    # NOTE(review): the pickle key does not depend on ``one_hot``, so a cache
    # written with one setting is reused for the other -- confirm intended.

    # Download the raw archives (train images/labels, then test images/labels)
    self.train_images_key = maybe_download_and_store_single_file(
        url='http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz',
        key='fashion_mnist/train_images')
    self.train_labels_key = maybe_download_and_store_single_file(
        url='http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz',
        key='fashion_mnist/train_labels')
    self.test_images_key = maybe_download_and_store_single_file(
        url='http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz',
        key='fashion_mnist/test_images')
    self.test_labels_key = maybe_download_and_store_single_file(
        url='http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz',
        key='fashion_mnist/test_labels')

    # Validate the headers of every downloaded file
    check_image_file_header(DATA_STORE[self.train_images_key])
    check_labels_file_header(DATA_STORE[self.train_labels_key])
    check_image_file_header(DATA_STORE[self.test_images_key])
    check_labels_file_header(DATA_STORE[self.test_labels_key])

    cache_is_valid = DATA_STORE.is_valid('fashion_mnist/pickle')
    if cache_is_valid and not force_rebuild:
        # Fast path: reuse the arrays decoded by an earlier run
        with open(DATA_STORE['fashion_mnist/pickle'], 'rb') as cache_file:
            cached = pickle.load(cache_file)
        self.train_images = cached['train_im']
        self.test_images = cached['test_im']
        self.train_labels = cached['train_lb']
        self.test_labels = cached['test_lb']
        return

    # Slow path: decode the raw files and refresh the cache
    log_message('Extracting Training Images...')
    self.train_images, self.train_labels = build_dataset(
        self.train_images_key, self.train_labels_key, one_hot)
    log_message('Extracting Test Images...')
    self.test_images, self.test_labels = build_dataset(
        self.test_images_key, self.test_labels_key, one_hot)

    to_cache = {
        'train_im': self.train_images,
        'train_lb': self.train_labels,
        'test_im': self.test_images,
        'test_lb': self.test_labels,
    }
    cache_path = DATA_STORE.create_key('fashion_mnist/pickle', 'mnist.pkl', force=True)
    with open(cache_path, 'wb') as cache_file:
        pickle.dump(to_cache, cache_file)
    DATA_STORE.update_hash('fashion_mnist/pickle')
def __init__(self, *args, **kwargs):
    """Set up the datastore test case.

    ``self.passed`` starts as ``False`` and is set to ``True`` when
    ``internet_on()`` reports no connection. A ``DataStore`` rooted at ``TMP``
    is always created.
    """
    super(DatastoreTestCases, self).__init__(*args, **kwargs)
    self.passed = False
    online = internet_on()
    if not online:
        log_message("No internet. All Download Test Ignored.")
        self.passed = True
    self.datastore = DataStore(
        root_filepath=TMP,
        config_file=TMP_CONFIG,
        testing=True,
    )
def __init__(self, *args, **kwargs):
    """Set up the download test case.

    ``self.passed`` starts as ``False`` and is set to ``True`` when
    ``internet_on()`` reports no connection. The sample URL, the datastore
    working directory and the local sample file path are stored on the
    instance.
    """
    super(DownloadTestCases, self).__init__(*args, **kwargs)
    self.passed = False
    online = internet_on()
    if not online:
        log_message("No internet. All Download Test Ignored.")
        self.passed = True

    # Fixtures used by the individual tests
    self.sample_download_location = "https://www.w3.org/TR/PNG/iso_8859-1.txt"
    self.working_directory = DATA_STORE.working_directory
    self.sample_txt = os.path.join(TEST_ROOT, "test_data", "sample.txt")
def GenerateMatrix(self, dictionary: Dict[str, int]) -> np.ndarray:
    """Build the embedding matrix for ``dictionary``.

    A zero matrix with one row per dictionary entry and ``self.dimension``
    columns is stored on ``self.embedding_matrix``. The row at each entry's
    value is then filled with ``self._get_vec`` of its key.

    Arguments:
        dictionary {Dict[str, int]} -- Maps each key to its row index.

    Returns:
        np.ndarray -- The populated ``self.embedding_matrix``.
    """
    log_message('Generating Embedding Matrix...')
    row_count = len(dictionary)
    self.embedding_matrix = np.zeros(shape=(row_count, self.dimension))
    for token, row in dictionary.items():
        self.embedding_matrix[row] = self._get_vec(token)
    return self.embedding_matrix
def build_dataset(self, train, force_rebuild=False, nohashcheck=False):
    """Write the SQuAD train or dev split to a TFRecord file.

    The records are (re)built only when ``force_rebuild`` is set or the
    datastore reports the record key as invalid. One record is written per
    answer whose character span maps onto a token span.

    Arguments:
        train -- Truthy to build from ``self.train_json``, falsy for
            ``self.dev_json``.
        force_rebuild -- Rebuild even if the stored records are valid.
        nohashcheck -- Forwarded to ``DATA_STORE.is_valid``.

    Returns:
        The number of records written by this call.
    """
    # NOTE(review): block nesting was reconstructed from a whitespace-mangled
    # source. As laid out here the function returns 0 when the existing
    # records are reused -- confirm against the original file.
    record_root = 'squad/tfrecord/train' if train else 'squad/tfrecord/dev'
    json_data = self.train_json['data'] if train else self.dev_json['data']
    # num_errors is counted below but never returned or logged.
    num_errors = 0
    num_documents = 0
    if force_rebuild or not DATA_STORE.is_valid(record_root, nohashcheck=nohashcheck):
        log_message('Building dataset ({})...'.format('Train' if train else 'Valid'))
        tf_record_writer = tf.python_io.TFRecordWriter(
            DATA_STORE.create_key(record_root, 'data.tfrecords',force=force_rebuild))
        for article in tqdm.tqdm(json_data):
            for paragraph_json in article['paragraphs']:
                # Compute the context embedding
                context_tokens = self.dictionary.tokenizer.parse(paragraph_json['context'].strip().replace('\n', ''))
                context_dense, context_len = self.dictionary.dense_parse_tokens(context_tokens, word_padding=self.mwl, char_padding=self.mcl)
                # Compute the QA embeddings
                for question_answer in paragraph_json['qas']:
                    question_dense, question_len = self.dictionary.dense_parse(
                        question_answer['question'].strip().replace('\n', ''), word_padding=self.mql, char_padding=self.mcl)
                    # For each answer
                    for answer in question_answer['answers']:
                        answer_dense, answer_len = self.dictionary.dense_parse(
                            answer['text'], word_padding=self.mql, char_padding=self.mcl)
                        # Character span start/end
                        span_start = answer['answer_start']
                        span_end = span_start + len(answer['text'])
                        # Get the token span from the char span
                        token_span_start, token_span_end = get_token_span_from_char_span(
                            paragraph_json['context'].strip().replace('\n', ''), context_tokens, span_start, span_end)
                        # A negative index means the span could not be mapped.
                        # The break also skips the remaining answers of this
                        # question, not just the failing one.
                        if token_span_start < 0 or token_span_end < 0:
                            num_errors += 1
                            break
                        # Now that we've got the contents, let's make a TF-Record
                        # We're going to handle the tf-record writing here for now
                        # TODO: Move the tf-record writing to its own file
                        feature_dict = self.build_feature_dict(context_dense, question_dense, answer_dense, span_start, span_end, token_span_start, token_span_end, context_len, question_len, answer_len)
                        example = tf.train.Example(
                            features=tf.train.Features(feature=feature_dict))
                        tf_record_writer.write(
                            example.SerializeToString())
                        num_documents += 1
        tf_record_writer.close()
        # Refresh the stored hash for the rebuilt records.
        DATA_STORE.update_hash(record_root)
    return num_documents
def save(self, fpath) -> None:
    """Pickle the dictionary state to ``fpath``.

    The payload holds the word and character dictionaries (forward and
    reverse), the dtype and the tokenizer string under the short keys
    ``wd``, ``wdr``, ``cd``, ``cdr``, ``dtype`` and ``tkn``.

    Arguments:
        fpath -- Destination file path; opened in binary write mode.
    """
    log_message('Saving dictionary to: {}'.format(fpath))
    with open(fpath, 'wb') as handle:
        # Only the elements needed to restore the dictionary are stored
        payload = dict(
            wd=self.word_dictionary,
            wdr=self.word_dictionary_rev,
            cd=self.char_dictionary,
            cdr=self.char_dictionary_rev,
            dtype=self.dtype,
            tkn=self.tokenizer_string,
        )
        pickle.dump(payload, handle)
def _build_dataset(self, mode="train", force_rebuild=False, nohashcheck=False):
    """Write the parallel corpus for ``mode`` to TFRecords, or count existing ones.

    Arguments:
        mode -- ``"train"`` or ``"test"``; any other value selects the
            validation files.
        force_rebuild -- Rebuild even if the stored records are valid.
        nohashcheck -- Forwarded to ``DATA_STORE.is_valid``.

    Returns:
        The number of source lines written when the records are built,
        otherwise the number of records already stored.
    """
    # For now, we will not use the provided vocab
    record_root = os.path.join(self.root_key, "tfrecord", mode)

    if not force_rebuild and DATA_STORE.is_valid(record_root, nohashcheck=nohashcheck):
        # Nothing to build: report how many records are already stored.
        return sum(1 for _ in tf.python_io.tf_record_iterator(
            DATA_STORE[record_root]))

    log_message('Building dataset ({})...'.format(mode))
    tf_record_writer = tf.python_io.TFRecordWriter(
        DATA_STORE.create_key(record_root, 'data.tfrecords', force=force_rebuild))

    # This must be a single if/elif/else chain. With two independent ``if``
    # statements the trailing ``else`` also ran for "train", so the training
    # records were built from the validation files.
    if mode == "train":
        eng_file = self.train_eng
        for_file = self.train_for
    elif mode == "test":
        eng_file = self.test_eng
        for_file = self.test_for
    else:
        eng_file = self.val_eng
        for_file = self.val_for

    with codecs.getreader("utf-8")(tf.gfile.GFile(DATA_STORE[eng_file], mode="rb")) as f:
        eng_data = f.read().splitlines()
    with codecs.getreader("utf-8")(tf.gfile.GFile(DATA_STORE[for_file], mode="rb")) as f:
        for_data = f.read().splitlines()

    for i, line in tqdm.tqdm(enumerate(eng_data)):
        src_dense, src_len = self.src_dictionary.dense_parse(
            line, word_padding=self.mwl, char_padding=0)
        # Lines are paired by index; a shorter foreign file raises IndexError.
        for_line = for_data[i]
        for_dense, for_len = self.dst_dictionary.dense_parse(
            for_line, word_padding=self.mwl, char_padding=0)
        # Only element 0 of each dense parse is stored (char padding is 0).
        feature_dict = self.build_feature_dict(src_dense[0], for_dense[0], src_len, for_len)
        example = tf.train.Example(features=tf.train.Features(
            feature=feature_dict))
        tf_record_writer.write(example.SerializeToString())

    tf_record_writer.close()
    DATA_STORE.update_hash(record_root)
    return len(eng_data)
def build_dataset(self, train, sample=True, force_rebuild=False, nohashcheck=False):
    """Write one TFRecord file per bAbI task when ``force_rebuild`` is set.

    Arguments:
        train -- Truthy for the training split; falsy appends ``-valid`` to
            the subset name and uses the validation record root.
        sample -- When falsy, ``-10k`` is appended to the subset name and the
            record file is named ``data.tfrecords`` instead of
            ``sample.tfrecords``.
        force_rebuild -- Records are only built when this is truthy.
        nohashcheck -- Accepted but not read by this method.

    Returns:
        The number of tasks written by this call (0 when nothing is built).
    """
    num_tasks = 0
    record_root = self.train_record_root if train else self.val_record_root
    record_name = "sample.tfrecords" if sample else "data.tfrecords"

    subset = self.subset
    if not train:
        subset = subset + "-valid"
    if not sample:
        subset = subset + "-10k"

    if force_rebuild:
        log_message('Building dataset ({})...'.format(
            'Train' if train else 'Valid'))
        # The template stays untouched across iterations. Previously the
        # formatted result was assigned back to the template variable, so
        # every task after the first reused the first task's path.
        task_path_template = "{0}/{1}/{2}/{3}"
        for task in tqdm.tqdm(bAbI_20.task_list):
            # The validation split uses only the part before the first "_".
            task_id = task if train else task.split("_")[0]
            task_tf_root = os.path.join(record_root, subset, task_id)
            tf_record_writer = tf.python_io.TFRecordWriter(
                DATA_STORE.create_key(task_tf_root, record_name, force=force_rebuild))

            task_path = task_path_template.format(
                self.task_root, self.task_root, subset, task_id)
            data_path = self.read_file_from_db(train, task_path)
            txt = self.read_txt(data_path)
            features = self.parse_context_question(txt)
            for feature_dict in features:
                example = tf.train.Example(
                    features=tf.train.Features(feature=feature_dict))
                tf_record_writer.write(example.SerializeToString())

            tf_record_writer.close()
            DATA_STORE.update_hash(task_tf_root)
            num_tasks += 1
    return num_tasks
def _build_dataset(self, dataset: str) -> None: if dataset not in ['train', 'val']: raise ValueError("Must be building either training or validation dataset") # Open the TFRecordWriter if dataset == 'train': record_root = 'coco2014/tfrecord/train' json = self.train_json root_fpath = DATA_STORE['coco2014/data/train/images'] else: record_root = 'coco2014/tfrecord/val' json = self.val_json root_fpath = DATA_STORE['coco2014/data/val/images'] # Construct the record reader tf_record_writer = tf.python_io.TFRecordWriter(DATA_STORE.create_key(record_root, 'data.tfrecords', force=True)) # Loop over the data and parse errors = 0 log_message('Building {} dataset...'.format(dataset)) for entry in tqdm.tqdm(json['annotations']): # Load the image image = load_image(build_fpath_from_image_id(root_fpath, entry['image_id'], dataset)) if image is None: errors += 1 log_warning('Error loading image: {}. {} Errors so far.'.format(build_fpath_from_image_id(root_fpath, entry['image_id'], dataset), errors)) continue # Parse the caption caption_raw = entry['caption'] caption_dense, caption_len = self.dictionary.dense_parse(caption_raw, word_padding=self.max_word_length, char_padding=self.max_char_length) # Add the image data feature = { 'caption_word_embedding': _int64_feature(np.ravel(caption_dense[0]).astype(np.int64)), 'caption_char_embedding': _int64_feature(np.ravel(caption_dense[1]).astype(np.int64)), 'caption_length': _int64_feature([caption_len]), 'image_shape': _int64_feature(image.shape), 'image': _bytes_feature(tf.compat.as_bytes(image.tostring())), } # Write the TF-Record example = tf.train.Example(features=tf.train.Features(feature=feature)) tf_record_writer.write(example.SerializeToString()) tf_record_writer.close() DATA_STORE.update_hash(record_root)
def _build_dataset(self, dataset: str) -> None: if dataset not in ['train', 'val']: raise ValueError("Must be building either training or validation dataset") # Open the TFRecordWriter if dataset == 'train': record_root = 'coco2017/detection/tfrecord/train' json = self.train_json root_fpath = DATA_STORE['coco2017/data/train/images'] else: record_root = 'coco2017/detection/tfrecord/val' json = self.val_json root_fpath = DATA_STORE['coco2017/data/val/images'] # Construct the record reader tf_record_writer = tf.python_io.TFRecordWriter(DATA_STORE.create_key(record_root, 'data.tfrecords', force=True)) # Loop over the data and parse errors = 0 log_message('Building {} dataset...'.format(dataset)) for entry in tqdm.tqdm(json['annotations']): # Load the image image = load_image(build_fpath_from_image_id(root_fpath, entry['image_id'], dataset)) if image is None: errors += 1 log_warning('Error loading image: {}. {} Errors so far.'.format(build_fpath_from_image_id(root_fpath, entry['image_id'], dataset), errors)) continue # Add the image data # TODO: Add the segmentation (decode using the RLE for COCO) feature = { 'area': _float_feature(entry['area']), 'iscrowd': _int64_feature(entry['iscrowd']), 'bbox': _float_feature(np.ravel(np.array(entry['bbox'], dtype=np.float32))), 'category_id': _int64_feature(entry['category_id']), 'image_shape': _int64_feature(image.shape), 'image': _bytes_feature(tf.compat.as_bytes(image.tostring())), } # Write the TF-Record example = tf.train.Example(features=tf.train.Features(feature=feature)) tf_record_writer.write(example.SerializeToString()) tf_record_writer.close() DATA_STORE.update_hash(record_root)
def retrack_config(reset=False):
    """Point the global datastore at an earlier config file, or reset it.

    With ``reset`` the current config name is re-read through ``get_var``.
    Otherwise the config files in ``CONFIG_FOLDER`` are ordered newest first
    by the ``YYYY-MM-DD`` prefix of their names, and the entry after the
    current one is selected. In both cases the module-level ``DATA_STORE`` is
    rebuilt from the resulting ``CONFIG_FILE``.

    Arguments:
        reset -- Truthy to return to the default config instead of stepping
            back.

    Raises:
        ValueError -- If a file name in ``CONFIG_FOLDER`` does not start with
            a ``YYYY-MM-DD`` date, or if ``CURR_CONFIG`` is not in the folder
            when stepping back.
    """
    global CURR_CONFIG, DATA_STORE, CONFIG_FILE

    all_configs = os.listdir(CONFIG_FOLDER)
    # list.sort() takes ``reverse``. The previous ``reversed=True`` keyword
    # raised TypeError on every call.
    all_configs.sort(
        key=lambda x: datetime.strptime(x.split(".")[0], '%Y-%m-%d'),
        reverse=True)

    if reset:
        CURR_CONFIG = get_var('FLUX_CONFIG', 'flux_config.json')
        # NOTE(review): block nesting was reconstructed from a
        # whitespace-mangled source. As laid out here the reset path keeps the
        # bare config name and only the step-back path joins it with
        # CONFIG_FOLDER -- confirm against the original file.
        CONFIG_FILE = CURR_CONFIG
    else:
        index = all_configs.index(CURR_CONFIG) + 1
        if index < len(all_configs):
            CURR_CONFIG = all_configs[index]
        else:
            # Already at the oldest entry: keep the current config.
            log_message("Reached the earliest config file.")
        CONFIG_FILE = os.path.join(CONFIG_FOLDER, CURR_CONFIG)

    DATA_STORE = DataStore(root_filepath=ROOT_FPATH, config_file=CONFIG_FILE)
def unzip(path: str, opt_dir: str = None) -> str:
    """Extract a ``.zip`` archive and return the extraction directory.

    Arguments:
        path {str} -- The path to the archive to extract.
        opt_dir {str} -- Optional target directory. When omitted, the archive
            is extracted into a folder beside it named after the archive
            without its ``.zip`` suffix.

    Returns:
        str -- The directory the archive was extracted into.

    Raises:
        ValueError -- If ``path`` does not end with ``.zip``.
    """
    if not path.endswith('.zip'):
        raise ValueError('Not a .zip file: {}'.format(path))

    log_message('Decompressing: {}'.format(path))
    zip_ref = zipfile.ZipFile(path, 'r')

    if opt_dir is not None:
        output_fpath = opt_dir
    else:
        # Split on the last "/" and drop the 4-character ".zip" suffix
        parent, _, archive_name = path.rpartition('/')
        output_fpath = os.path.join(parent, archive_name[:-4])

    zip_ref.extractall(path=output_fpath)
    zip_ref.close()
    return output_fpath
def __init__(self, num_parallel_reads: int = 1, force_build=False, force_download=False, shuffle=True):
    """Download CelebA, optionally build its TFRecords, and count the examples.

    Arguments:
        num_parallel_reads {int} -- Stored on the instance for later reads.
        force_build -- When truthy, the train and val TFRecords are rebuilt.
        force_download -- Forwarded to the Google Drive download helper.
        shuffle -- When building, shuffle ``self._img_meta`` first; also
            forwarded to ``_build_dataset``.
    """
    file_pair = {
        "img_align_celeba.zip": "0B7EVK8r0v71pZjFTYXZWM3FlRnM",
        "list_attr_celeba.txt": "0B7EVK8r0v71pblRyaVFSWGxPY0U"
    }
    self.root_key = "celebA"
    self.num_attr = 40

    log_message("Retrieving CelebA data")
    self.keys = maybe_download_and_store_google_drive(
        file_pair,
        root_key=self.root_key,
        force_download=force_download,
        use_subkeys=False)
    if not self.keys:
        # NOTE(review): returning here leaves the instance without the
        # attributes set below (train_fpath, num_train_examples, ...).
        log_warning("Download Failed, change force_download=True")
        return

    self.selected_attrs = None

    # Extract each batch
    log_message('Extracting CelebA data...')
    self._train_db = None
    self._val_db = None
    self.num_parallel_reads = num_parallel_reads

    # Extract labels
    self.attr2idx: Dict = {}
    self.idx2attr: Dict = {}
    log_message("Extracting CelebA labels first")
    info_files = DATA_STORE[self.keys[1]]
    self._process_attr(info_files)

    if force_build:
        if shuffle:
            random.shuffle(self._img_meta)
        # Build Dataset
        self._build_dataset("train", shuffle)
        self._build_dataset("val", shuffle)

    record_root = os.path.join(self.root_key, "tfrecord")
    train_root = os.path.join(record_root, "train")
    val_root = os.path.join(record_root, "val")
    self.train_fpath = DATA_STORE[train_root]
    self.val_fpath = DATA_STORE[val_root]

    self.num_train_examples = sum(
        1 for _ in tf.python_io.tf_record_iterator(self.train_fpath))
    # Count the validation records from the validation file. This previously
    # iterated the training file again, so the val count equalled the train
    # count.
    self.num_val_examples = sum(
        1 for _ in tf.python_io.tf_record_iterator(self.val_fpath))
    log_message("Built Complete")
def maybe_download(file_name: str, source_url: str, work_directory: str, postprocess=None, username: str=None, password: str=None): """Download a file from source-url to the work directory as file_name if the file does not already exist Arguments: file_name {str} -- The name of the file to save the url download as source_url {str} -- The URL to download from work_directory {str} -- The directory to download to """ # Create the work directory if it doesn't already exist if not os.path.exists(work_directory): mkdir_p(work_directory) # Check if the file-exists, if not, retrieve it filepath = os.path.join(work_directory, file_name) if not os.path.exists(filepath): log_message('Downloading {} from {}, please wait...'.format( file_name, source_url)) with TqdmUpTo(unit='B', unit_scale=True, miniters=1) as t: # Create a mock browser if username is not None: if password is None: raise ValueError('If using authentication, provide both a username and password.') manager = urllib.request.HTTPPasswordMgrWithDefaultRealm() manager.add_password(None, source_url, username, password) auth = urllib.request.HTTPBasicAuthHandler(manager) opener = urllib.request.build_opener(auth) else: opener = urllib.request.build_opener() opener.addheaders = MOCK_BROWSER_HEADER urllib.request.install_opener(opener) filepath, _ = urllib.request.urlretrieve(source_url, filepath, t.update_to) stat_info = os.stat(filepath) log_message('Successfully downloaded {} ({} bytes).'.format( file_name, stat_info.st_size)) if postprocess is not None: filepath = postprocess(filepath) # If the file exists, then we should return folder rather than *.zip return filepath
def maybe_download_and_store_google_drive(file_pair: Dict[str, str], root_key: str, description: str = None, force_download: bool = False, use_subkeys=True, **kwargs) -> List[str]:
    """Download Google Drive files and register them in the datastore.

    If ``force_download`` is false, the root key is valid and its subkeys
    validate, the previously stored keys are returned without downloading.

    Arguments:
        file_pair {Dict[str, str]} -- Maps each file name to its Drive file id.
        root_key {str} -- Datastore key under which everything is stored.
        description {str} -- Passed on when registering files/folders.
        force_download {bool} -- Skip the cache check and force the download.
        use_subkeys -- For extracted folders: register their contents as
            subkeys (truthy) or add the folder under a single key (falsy).
        **kwargs -- Accepted but not read by this function.

    Returns:
        List[str] -- The registered keys followed by ``root_key``, or the
        previously stored keys when the cache is reused.
    """
    old_keys: List[str] = []
    cache_usable = (
        not force_download
        and DATA_STORE.is_valid(root_key)
        and validate_subkeys(root_key, old_keys)
    )
    if cache_usable:
        return old_keys

    keys = []
    DATA_STORE.create_key(root_key, 'root.key', force=True)

    for file_name, file_id in file_pair.items():
        log_message("Downloading " + file_name)
        file_dest = os.path.join(DATA_STORE.working_directory, file_name)
        data_path = maybe_download_google_drive(
            file_id, file_dest, force_download=force_download)
        data_path = post_process(data_path)
        log_message("Decompressed " + file_name + "to " + data_path)

        if not os.path.isdir(data_path):
            # A plain file: keyed by the name before its first "."
            _key = os.path.join(root_key, file_name.split(".")[0])
            DATA_STORE.add_file(_key, data_path, description, force=True)
            keys.append(_key)
        elif use_subkeys:
            # A folder whose contents are registered individually
            keys.extend(register_to_datastore(data_path, root_key, description))
        else:
            # A folder stored under one key
            data_key = os.path.join(root_key, file_name.split(".zip")[0])
            DATA_STORE.add_folder(data_key, data_path, force=True)
            keys.append(data_key)

        log_message("Completed " + file_name)

    DATA_STORE.create_key(root_key, 'root.key', force=True)
    return list(keys) + [root_key]
def __init__(self, force_rebuild: bool = False, nohashcheck: bool = True) -> None:
    """Set up the Pascal VOC 2012 keys and paths, optionally re-importing the data.

    Arguments:
        force_rebuild {bool} -- Download/extract the archive and register the
            image, segmentation and annotation folders in the datastore.
        nohashcheck {bool} -- Accepted but not read by this method.
    """
    archive_url = "http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar"
    self._classes = (
        '__background__',  # always index 0
        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
        'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike',
        'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor')
    self.num_classes = 21
    self._class_to_ind = dict(zip(self._classes, range(self.num_classes)))
    self.max_num_obj = 50
    self.voc_root_key = "pascal/voc/2012"

    # NOTE(review): "VOCtrainval_11-May-2" is the archive name with its last
    # seven characters removed, which matches what ``untar`` in this file
    # produces for a ".tar" archive. Keep the two in sync.
    self.file_structure = os.path.join("VOCtrainval_11-May-2", "VOCdevkit", "VOC2012")
    work_file_path = os.path.join(DATA_STORE.working_directory, self.file_structure)
    _annotation_path = os.path.join(work_file_path, "Annotations")
    _problems = os.path.join(work_file_path, "ImageSets")
    _images = os.path.join(work_file_path, "JPEGImages")
    _segmentation_class = os.path.join(work_file_path, "SegmentationClass")
    _segmentation_object = os.path.join(work_file_path, "SegmentationObject")

    self.annotation_key = os.path.join(self.voc_root_key, "annotations")
    self.images_key = os.path.join(self.voc_root_key, "images")
    self.segmentation_key = os.path.join(self.voc_root_key, "segmentation", "class")
    self.segmentation_obj_key = os.path.join(self.voc_root_key, "segmentation", "obj")

    if force_rebuild:
        log_message("Copying data to destination folder in flux")
        maybe_download_and_store_tar(url=archive_url, root_key='pascal/voc/2012', use_subkeys=False)
        DATA_STORE.add_folder(self.images_key, _images)
        DATA_STORE.add_folder(self.segmentation_key, _segmentation_class)
        # Use the key defined above. This previously read the undefined
        # attribute ``self.segmentation_obj`` and raised AttributeError.
        DATA_STORE.add_folder(self.segmentation_obj_key, _segmentation_object)
        DATA_STORE.add_folder(self.annotation_key, _annotation_path)

    self.problems_key = retrieve_subkeys(self.voc_root_key)
    if len(self.problems_key) < 1:
        log_message("Building Problem Keys")
        self.problems_key = register_to_datastore(_problems, self.voc_root_key, "")
    # NOTE(review): block nesting was reconstructed from a whitespace-mangled
    # source. The prefixing below is applied to both the retrieved and the
    # freshly registered keys -- confirm against the original file.
    self.problems_key = [
        os.path.join(self.voc_root_key, key) for key in self.problems_key
    ]

    self.image_path = DATA_STORE[self.images_key]
    self.annotation_path = DATA_STORE[self.annotation_key]
    self.seg_class_path = DATA_STORE[self.segmentation_key]
    self.seg_obj_path = DATA_STORE[self.segmentation_obj_key]
def _build_dataset(self, dataset):
    """Write the Pascal VOC records for one problem split to a TFRecord file.

    The image index is read from the first entry of ``self.problems`` that
    ends with ``dataset``. One record is written per index entry whose image
    and both segmentation maps are available.

    Arguments:
        dataset -- Suffix that selects the problem key (the split name).

    Returns:
        The number of records written.

    Raises:
        EnvironmentError -- If no problem key ends with ``dataset``.
    """
    matching_keys = [key for key in self.problems if key.endswith(dataset)]
    if len(matching_keys) < 1:
        log_warning("Problem key doesn't exist for {}. ".format(dataset) +
                    str(matching_keys))
        raise EnvironmentError()
    problem_key = matching_keys[0]

    tf_record_key = os.path.join(
        self.voc_root_key, self.problem_name.lower(), "tfrecord", dataset)

    log_message("Retrieving the index from " + problem_key)
    assert (os.path.exists(DATA_STORE[problem_key]))
    with open(DATA_STORE[problem_key], 'r') as index_file:
        images_index = [line.strip() for line in index_file.readlines()]

    record_path = DATA_STORE.create_key(tf_record_key, 'data.tfrecords', force=True)
    tf_record_writer = tf.python_io.TFRecordWriter(record_path)

    errors = 0
    log_message("Building {} dataset...".format(dataset))
    total_num_examples = 0
    for _, index in tqdm(enumerate(images_index)):
        img_path = image_path_from_index(index, self.image_path, '.jpg')
        feature_dict = self._load_pascal_annotation(index)

        # Load and encode the image
        image = encode_jpeg(load_image(img_path))
        if image is None:
            errors += 1
            log_warning(
                'Error loading image: {}. {} Errors so far.'.format(
                    img_path, errors))
            continue

        # Load and encode both segmentation maps before checking either
        seg_cls_path = image_path_from_index(index, self.seg_class_path, '.png')
        seg_class = encode_png(load_image(seg_cls_path))
        seg_obj_path = image_path_from_index(index, self.seg_obj_path, '.png')
        seg_obj = encode_png(load_image(seg_obj_path))

        # The class map is checked first, then the object map
        missing_path = None
        if seg_class is None:
            missing_path = seg_cls_path
        elif seg_obj is None:
            missing_path = seg_obj_path
        if missing_path is not None:
            errors += 1
            log_warning(
                'Error loading image: {}. {} Errors so far.'.format(
                    missing_path, errors))
            continue

        feature_dict["image"] = _bytes_feature(tf.compat.as_bytes(image))
        feature_dict["seg_class"] = _bytes_feature(tf.compat.as_bytes(seg_class))
        feature_dict["seg_obj"] = _bytes_feature(tf.compat.as_bytes(seg_obj))

        example = tf.train.Example(
            features=tf.train.Features(feature=feature_dict))
        tf_record_writer.write(example.SerializeToString())
        total_num_examples += 1

    tf_record_writer.close()
    DATA_STORE.update_hash(tf_record_key)
    return total_num_examples
def _build_dataset(self, dataset: str, shuffle: bool) -> None: if dataset not in ['train', 'val']: raise ValueError( "Must be building either training or validation dataset") record_root = os.path.join(self.root_key, "tfrecord") # Open the TFRecordWriter if dataset == 'train': record_root = os.path.join(record_root, "train") data_size = self._num_examples * TRAIN_PARTITION else: record_root = os.path.join(record_root, "val") data_size = self._num_examples * VAL_PARTITION # Construct the record reader tf_record_writer = tf.python_io.TFRecordWriter( DATA_STORE.create_key( record_root, 'shuffle.tfrecords' if shuffle else "data.tfrecords", force=True)) # Loop over the data and parse errors = 0 log_message('Building {} dataset...'.format(dataset)) img_path = DATA_STORE[self.keys[0]] for i in tqdm.tqdm(range(int(data_size))): img_meta = self._img_meta[i].strip("\n").split(" ") file_name = os.path.join(img_path, img_meta[0]) values = img_meta[1:] label = [] for attr_name in self.selected_attrs: idx = self.attr2idx[attr_name] if values[idx] == '1': label.append(1.0) else: label.append(0.0) assert (len(label) == self.num_attr ) # All labels should have 40 items. (One hot) label = np.array(label, dtype=np.float32) # Load the image image = load_image(file_name) if image is None: errors += 1 log_warning( 'Error loading image: {}. {} Errors so far.'.format( file_name, errors)) continue # Add the image data feature = { "label": _float_feature(label), 'image_shape': _int64_feature(image.shape), 'image': _bytes_feature(tf.compat.as_bytes(image.tostring())), } # Write the TF-Record example = tf.train.Example(features=tf.train.Features( feature=feature)) tf_record_writer.write(example.SerializeToString()) tf_record_writer.close() DATA_STORE.update_hash(record_root)
def _build_dataset(self, ) -> None:
    """Write the question/answer records to train, val and test TFRecord files.

    Every entry of ``self._json['images']`` is routed by its ``split`` field
    to one of three writers ("val", "test", anything else is train). One
    record is written per QA pair. ``self.data_type`` selects the record
    layout: "telling" parses answers and choices as text, any other value
    resolves them through ``self.get_boxes`` and adds a "coordinate" feature.
    """
    # NOTE(review): block nesting was reconstructed from a whitespace-mangled
    # source -- confirm against the original file.

    # Define the Record Root
    # Open the TFRecordWriter
    train_record_root = os.path.join(self.train_fpath, "data")
    val_record_root = os.path.join(self.val_fpath, "data")
    test_record_root = os.path.join(self.test_fpath, "data")
    # Construct the record writers
    train_record_writer = tf.python_io.TFRecordWriter(
        DATA_STORE.create_key(train_record_root, 'data.tfrecords', force=True))
    val_record_writer = tf.python_io.TFRecordWriter(
        DATA_STORE.create_key(val_record_root, 'data.tfrecords', force=True))
    test_record_writer = tf.python_io.TFRecordWriter(
        DATA_STORE.create_key(test_record_root, 'data.tfrecords', force=True))
    # Loop over the data and parse
    errors = 0  # never incremented or read in this method
    log_message('Building the dataset...')
    images = self._json['images']
    # boxes_dict is only defined for "pointing"; the non-"telling" branch
    # below relies on it.
    if self.data_type == "pointing":
        boxes = self._json['boxes']
        boxes_dict = {d["box_id"]: d for d in boxes}
    total_num_examples = len(images)
    for idx, entry in tqdm.tqdm(enumerate(images), total=total_num_examples):
        # Load the image
        # Split the dataset
        # The per-split counters are incremented once per image entry, not
        # once per QA pair written.
        split = entry["split"]
        if split == "val":
            tf_record_writer = val_record_writer
            self.num_val_examples += 1
        elif split == "test":
            tf_record_writer = test_record_writer
            self.num_test_examples += 1
        else:
            tf_record_writer = train_record_writer
            self.num_train_examples += 1
        image_id = entry['image_id']
        qa_pairs = entry['qa_pairs']
        for qa in qa_pairs:
            question_raw = qa['question']
            question_type = qa['type']
            qa_id = qa['qa_id']
            mlt_choice = qa["multiple_choices"]
            answer = qa['answer']
            # Exactly three choices are expected per question
            assert len(mlt_choice) == 3
            question_dense, question_len = self.dictionary.dense_parse(
                question_raw, word_padding=self.max_word_length, char_padding=self.max_char_length)
            if self.data_type == "telling":
                # The answer and the three choices are parsed as text
                answer_dense, answer_len = self.dictionary.dense_parse(
                    answer, word_padding=self.max_word_length, char_padding=self.max_char_length)
                m1_dense, m1_len = self.dictionary.dense_parse(
                    mlt_choice[0], word_padding=self.max_word_length, char_padding=self.max_char_length)
                m2_dense, m2_len = self.dictionary.dense_parse(
                    mlt_choice[1], word_padding=self.max_word_length, char_padding=self.max_char_length)
                m3_dense, m3_len = self.dictionary.dense_parse(
                    mlt_choice[2], word_padding=self.max_word_length, char_padding=self.max_char_length)
                # Assemble the record features
                feature = {
                    'question_word_embedding': _int64_feature(
                        np.ravel(question_dense[0]).astype(np.int64)),
                    'question_char_embedding': _int64_feature(
                        np.ravel(question_dense[1]).astype(np.int64)),
                    'question_length': _int64_feature([question_len]),
                    'ans_word_embedding': _int64_feature(
                        np.ravel(answer_dense[0]).astype(np.int64)),
                    'ans_char_embedding': _int64_feature(
                        np.ravel(answer_dense[1]).astype(np.int64)),
                    'ans_length': _int64_feature([answer_len]),
                    'm1_embedding': _int64_feature(np.ravel(m1_dense[0]).astype(np.int64)),
                    'm1_char_embedding': _int64_feature(np.ravel(m1_dense[1]).astype(np.int64)),
                    'm2_embedding': _int64_feature(np.ravel(m2_dense[0]).astype(np.int64)),
                    'm2_char_embedding': _int64_feature(np.ravel(m2_dense[1]).astype(np.int64)),
                    'm3_embedding': _int64_feature(np.ravel(m3_dense[0]).astype(np.int64)),
                    'm3_char_embedding': _int64_feature(np.ravel(m3_dense[1]).astype(np.int64)),
                    'mc_len': _int64_feature([m1_len, m2_len, m3_len]),
                    "q_type": _bytes_feature(tf.compat.as_bytes(question_type)),
                    'qa_id': _int64_feature([qa_id]),
                    'image_id': _int64_feature([image_id]),
                }
            else:
                # The answer and the choices are resolved through the box
                # lookup; get_boxes returns (location, dense, length).
                answer_loc, answer_dense, answer_len = self.get_boxes(
                    answer, boxes_dict)
                m1_loc, m1_dense, m1_len = self.get_boxes(
                    mlt_choice[0], boxes_dict)
                m2_loc, m2_dense, m2_len = self.get_boxes(
                    mlt_choice[1], boxes_dict)
                m3_loc, m3_dense, m3_len = self.get_boxes(
                    mlt_choice[2], boxes_dict)
                # Combined with "+" in answer, m1, m2, m3 order
                coord = answer_loc + m1_loc + m2_loc + m3_loc
                # Same features as the "telling" layout plus "coordinate"
                feature = {
                    'question_word_embedding': _int64_feature(
                        np.ravel(question_dense[0]).astype(np.int64)),
                    'question_char_embedding': _int64_feature(
                        np.ravel(question_dense[1]).astype(np.int64)),
                    'question_length': _int64_feature([question_len]),
                    'ans_word_embedding': _int64_feature(
                        np.ravel(answer_dense[0]).astype(np.int64)),
                    'ans_char_embedding': _int64_feature(
                        np.ravel(answer_dense[1]).astype(np.int64)),
                    'ans_length': _int64_feature([answer_len]),
                    "coordinate": _int64_feature(coord),
                    'm1_embedding': _int64_feature(np.ravel(m1_dense[0]).astype(np.int64)),
                    'm1_char_embedding': _int64_feature(np.ravel(m1_dense[1]).astype(np.int64)),
                    'm2_embedding': _int64_feature(np.ravel(m2_dense[0]).astype(np.int64)),
                    'm2_char_embedding': _int64_feature(np.ravel(m2_dense[1]).astype(np.int64)),
                    'm3_embedding': _int64_feature(np.ravel(m3_dense[0]).astype(np.int64)),
                    'm3_char_embedding': _int64_feature(np.ravel(m3_dense[1]).astype(np.int64)),
                    'mc_len': _int64_feature([m1_len, m2_len, m3_len]),
                    'qa_id': _int64_feature([qa_id]),
                    "q_type": _bytes_feature(tf.compat.as_bytes(question_type)),
                    'image_id': _int64_feature([image_id]),
                }
            # Serialize and write the record to the writer chosen for this split
            example = tf.train.Example(features=tf.train.Features(
                feature=feature))
            tf_record_writer.write(example.SerializeToString())
    val_record_writer.close()
    train_record_writer.close()
    test_record_writer.close()
    # Refresh the stored hashes for all three record sets
    DATA_STORE.update_hash(test_record_root)
    DATA_STORE.update_hash(train_record_root)
    DATA_STORE.update_hash(val_record_root)
def __init__(self, version='0.3', num_parallel_reads: Optional[int]=None, force_rebuild: bool=False, mask: bool=False, add_start_tokens: bool=False, add_stop_tokens: bool=False, use_qam: bool=False) -> None:
    """Download the NewsLens QA JSON and make sure its TF-Records exist.

    Keyword Arguments:
        version {str} -- Dataset release, one of '0.1', '0.2' or '0.3' (default: {'0.3'})
        num_parallel_reads {Optional[int]} -- Stored on the instance; not used in this method (default: {None})
        force_rebuild {bool} -- Rebuild the dictionary and the TF-Records even when the stored copies are valid (default: {False})
        mask {bool} -- Read the 'masked_*' JSON fields and store under 'newslens/masked/' (default: {False})
        add_start_tokens {bool} -- Prefix the answer text with '<START> ' (default: {False})
        add_stop_tokens {bool} -- Suffix the answer text with ' <STOP>' (default: {False})
        use_qam {bool} -- Stored on the instance; not used in this method (default: {False})

    Raises:
        ValueError -- If version is not one of the known releases
    """
    self.version = version
    self.num_parallel_reads = num_parallel_reads
    self.mask = mask
    self.add_start_tokens = add_start_tokens
    self.add_stop_tokens = add_stop_tokens
    self.use_qam = use_qam

    # We keep one copy of masked data, and one copy of unmasked data
    self.stem = 'newslens/masked/' if self.mask else 'newslens/'

    # Padding sizes per release: (mwl, mcl, mql).
    padding_by_version = {
        '0.1': (766, 37, 766),
        '0.2': (595, 16, 766),
        '0.3': (600, 16, 20),
    }
    if not isinstance(self.version, str) or self.version not in padding_by_version:
        raise ValueError("Invalid version for NLQA dataset")

    # We don't use the stem here, because the json files are the same
    self.json_key = maybe_download_and_store_single_file(
        url='https://newslens.berkeley.edu/QA_dataset{}.json'.format(self.version),
        key='newslens/json_{}'.format(self.version))
    self.mwl, self.mcl, self.mql = padding_by_version[self.version]

    # Read the JSON
    with open(DATA_STORE[self.json_key], 'r') as json_file:
        self.json = json.load(json_file)

    dictionary_key = self.stem + 'dictionary_{}'.format(self.version)
    train_key = self.stem + 'tfrecord/train/data_{}'.format(self.version)
    val_key = self.stem + 'tfrecord/val/data_{}'.format(self.version)

    # Reuse the stored dictionary when possible, otherwise start a new one
    dictionary_cached = not force_rebuild and DATA_STORE.is_valid(dictionary_key)
    if dictionary_cached:
        with open(DATA_STORE[dictionary_key], 'rb') as pkl_file:
            self.dictionary = pickle.load(pkl_file)
    else:
        self.dictionary = NLPDictionary(tokenizer='space', dtype=np.int32)

    # Build the tf-records when they are missing. A dictionary that had to
    # be recreated (which includes force_rebuild) also forces a build:
    # otherwise the existing records would be paired with an empty
    # vocabulary.
    if (not dictionary_cached
            or not DATA_STORE.is_valid(train_key)
            or not DATA_STORE.is_valid(val_key)):
        log_message('Building dataset...')

        def int64_feature(values):
            return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

        answer_field = 'masked_answer' if self.mask else 'real_answer'
        document_field = 'masked_document' if self.mask else 'unmasked_document'

        # Create the tf-record writers
        train_record_writer = tf.python_io.TFRecordWriter(
            DATA_STORE.create_key(train_key, 'data.tfrecords', force=force_rebuild))
        val_record_writer = tf.python_io.TFRecordWriter(
            DATA_STORE.create_key(val_key, 'data.tfrecords', force=force_rebuild))

        try:
            # Parse the data into tf-records
            for record in tqdm.tqdm(self.json):
                # Handle start and stop tokens on the answer
                answer_text = record[answer_field]
                if self.add_stop_tokens:
                    answer_text = answer_text.strip() + ' <STOP>'
                if self.add_start_tokens:
                    answer_text = '<START> ' + answer_text

                # The question+answer sequence always ends with a stop
                # token. It is appended as its own space-separated token
                # (previously it was glued onto the last answer word).
                qa_text = record['question'].strip() + ' ' + answer_text.strip()
                if not self.add_stop_tokens:
                    qa_text = qa_text.rstrip() + ' <STOP>'

                # NOTE: the dense_parse calls stay in their original order
                # (qa, context, answer, question) in case the dictionary
                # assigns ids in first-seen order.
                question_answer_dense, qa_len = self.dictionary.dense_parse(
                    qa_text, word_padding=self.mwl, char_padding=self.mcl)

                document = record[document_field]
                tokens = document.split(' ')
                context_dense, context_len = self.dictionary.dense_parse(
                    document, word_padding=self.mwl, char_padding=self.mcl)
                label = record[answer_field].split(' ')

                answer_dense, answer_len = self.dictionary.dense_parse(
                    answer_text, word_padding=self.mql, char_padding=self.mcl)
                question_dense, question_len = self.dictionary.dense_parse(
                    record['question'], word_padding=self.mql, char_padding=self.mcl)

                # Candidate answer spans: every position matching the first
                # label token, paired with the first later (or same)
                # position matching the last label token.
                spans = []
                for start, token in enumerate(tokens):
                    if token != label[0]:
                        continue
                    end = next((pos for pos in range(start, len(tokens))
                                if tokens[pos] == label[-1]), None)
                    if end is not None:
                        spans.append((start, end))

                # Roughly 5% of the records go to the validation split
                is_val = np.random.random() >= 0.95

                for span_start, span_end in spans:
                    # Build the dataset/tf-records
                    feature_dict = {
                        'context_word_embedding': int64_feature(np.ravel(context_dense[0])),
                        'context_char_embedding': int64_feature(np.ravel(context_dense[1])),
                        'question_word_embedding': int64_feature(np.ravel(question_dense[0])),
                        'question_char_embedding': int64_feature(np.ravel(question_dense[1])),
                        'answer_word_embedding': int64_feature(np.ravel(answer_dense[0])),
                        'question_answer_word_embedding': int64_feature(np.ravel(question_answer_dense[0])),
                        'word_maxlen': int64_feature([self.mwl]),
                        'char_maxlen': int64_feature([self.mcl]),
                        'token_label_start': int64_feature([span_start]),
                        'token_label_end': int64_feature([span_end]),
                        'context_word_len': int64_feature([context_len]),
                        'question_word_len': int64_feature([question_len]),
                        'question_answer_word_len': int64_feature([qa_len]),
                        'answer_word_len': int64_feature([answer_len]),
                    }
                    example = tf.train.Example(
                        features=tf.train.Features(feature=feature_dict))
                    if is_val:
                        val_record_writer.write(example.SerializeToString())
                    else:
                        train_record_writer.write(example.SerializeToString())
        finally:
            # Always release the file handles, even if a record fails
            train_record_writer.close()
            val_record_writer.close()

        DATA_STORE.update_hash(train_key)
        DATA_STORE.update_hash(val_key)

        # Save the dictionary
        with open(DATA_STORE.create_key(dictionary_key, 'dict.pkl', force=True), 'wb') as pkl_file:
            pickle.dump(self.dictionary, pkl_file)
        DATA_STORE.update_hash(dictionary_key)

    # Compute the number of examples in each split
    self.num_val_examples = sum(
        1 for _ in tf.python_io.tf_record_iterator(DATA_STORE[val_key]))
    self.num_train_examples = sum(
        1 for _ in tf.python_io.tf_record_iterator(DATA_STORE[train_key]))

    self.word_vocab_size = len(self.dictionary.word_dictionary)
    self.char_vocab_size = len(self.dictionary.char_dictionary)

    self._dev_db = None
    self._train_db = None
def __init__(self, num_parallel_reads: int = 1, force_rebuild: bool = False, ignore_hashes=False, image_shape: Sequence[int] = [224, 224], read_codes=False, code_shape: Sequence[int] = [7, 7, 2048], merge_qa=False) -> None:
    """Download the VQA v2 / COCO 2014 data and make sure the TF-Records exist.

    Keyword Arguments:
        num_parallel_reads {int} -- Stored on the instance; not used in this method (default: {1})
        force_rebuild {bool} -- Rebuild the dictionary, class map and both TF-Record splits (default: {False})
        ignore_hashes {bool} -- Skip the data-store validity checks and trust what is stored, unless force_rebuild is set (default: {False})
        image_shape {Sequence[int]} -- Stored as image_resize_shape (default: {[224, 224]})
        read_codes {bool} -- Stored on the instance; not used in this method (default: {False})
        code_shape {Sequence[int]} -- Stored on the instance; not used in this method (default: {[7, 7, 2048]})
        merge_qa {bool} -- Stored on the instance; not used in this method (default: {False})
    """
    # NOTE: the list defaults are shared between calls; they are only stored
    # here, never mutated.
    self.image_resize_shape = image_shape
    self.read_codes = read_codes
    self.code_shape = code_shape
    self.merge_qa = merge_qa

    # Get all of the necessary data
    self.train_a_json_key = maybe_download_and_store_zip(
        'http://visualqa.org/data/mscoco/vqa/v2_Annotations_Train_mscoco.zip',
        'coco2014/data/train/annotations')[0]
    self.val_a_json_key = maybe_download_and_store_zip(
        'http://visualqa.org/data/mscoco/vqa/v2_Annotations_Val_mscoco.zip',
        'coco2014/data/val/annotations')[0]
    self.train_q_json_key = maybe_download_and_store_zip(
        'http://visualqa.org/data/mscoco/vqa/v2_Questions_Train_mscoco.zip',
        'coco2014/data/train/questions')[0]
    self.val_q_json_key = maybe_download_and_store_zip(
        'http://visualqa.org/data/mscoco/vqa/v2_Questions_Val_mscoco.zip',
        'coco2014/data/val/questions')[0]
    maybe_download_and_store_zip(
        'http://images.cocodataset.org/zips/train2014.zip',
        'coco2014/data/train/images', use_subkeys=False)
    maybe_download_and_store_zip(
        'http://images.cocodataset.org/zips/val2014.zip',
        'coco2014/data/val/images', use_subkeys=False)

    # Compute the size of the datasets
    self.num_train_examples = 443757
    self.num_val_examples = 214654
    self.num_classes = 29332

    def is_stale(key: str) -> bool:
        # force_rebuild always wins; otherwise consult the store unless the
        # caller asked to ignore hashes.
        return force_rebuild or (not ignore_hashes and not DATA_STORE.is_valid(key))

    # Decide everything that has to be rebuilt BEFORE loading any JSON, so
    # that a stale dictionary or class map also loads the annotations the
    # record builder reads.
    dictionary_stale = is_stale('vqa/dictionary')
    class_map_stale = is_stale('vqa/class_map')
    vocab_stale = dictionary_stale or class_map_stale
    need_rebuild_train = vocab_stale or is_stale('vqa/tfrecord/train')
    need_rebuild_val = vocab_stale or is_stale('vqa/tfrecord/val')

    # Now that we have the data, load and parse the JSON files
    if need_rebuild_train:
        log_message('Need to rebuild training data. Loading JSON annotations.')
        with open(DATA_STORE[self.train_a_json_key], 'r') as annotation_file:
            self.train_a_json = json.load(annotation_file)
        with open(DATA_STORE[self.train_q_json_key], 'r') as annotation_file:
            self.train_q_json = json.load(annotation_file)

    if need_rebuild_val:
        log_message('Need to rebuild validation data. Loading JSON annotations.')
        with open(DATA_STORE[self.val_a_json_key], 'r') as annotation_file:
            self.val_a_json = json.load(annotation_file)
        with open(DATA_STORE[self.val_q_json_key], 'r') as annotation_file:
            self.val_q_json = json.load(annotation_file)

    # Load the vocab files
    if dictionary_stale:
        self.dictionary = NLPDictionary()
    else:
        with open(DATA_STORE['vqa/dictionary'], 'rb') as dict_file:
            self.dictionary = pickle.load(dict_file)

    if class_map_stale:
        self.class_map: Dict[str, int] = {}
    else:
        with open(DATA_STORE['vqa/class_map'], 'rb') as class_map_file:
            self.class_map = pickle.load(class_map_file)

    # Setup some default options for the dataset
    self.max_word_length = 50
    self.max_char_length = 16
    self._val_db = None
    self._train_db = None
    self.num_parallel_reads = num_parallel_reads

    # Build the tfrecord dataset from the JSON
    if need_rebuild_train:
        self._build_dataset('train')
    if need_rebuild_val:
        self._build_dataset('val')

    self.train_fpath = DATA_STORE['vqa/tfrecord/train']
    self.val_fpath = DATA_STORE['vqa/tfrecord/val']

    # Save the vocab (written on every construction, rebuilt or not)
    with open(DATA_STORE.create_key('vqa/dictionary', 'dict.pkl', force=True), 'wb') as pkl_file:
        pickle.dump(self.dictionary, pkl_file)
    DATA_STORE.update_hash('vqa/dictionary')

    with open(DATA_STORE.create_key('vqa/class_map', 'class_map.pkl', force=True), 'wb') as pkl_file:
        pickle.dump(self.class_map, pkl_file)
    DATA_STORE.update_hash('vqa/class_map')

    self.word_vocab_size = len(self.dictionary.word_dictionary)
    self.char_vocab_size = len(self.dictionary.char_dictionary)
def __init__(self, version: str='wikipedia', dimension: int=300) -> None:
    """Load a GloVe word -> vector table, downloading and caching it if needed.

    Keyword Arguments:
        version {str} -- Which GloVe release to use: 'wikipedia', 'common-small', 'common-large' or 'twitter' (default: {'wikipedia'})
        dimension {int} -- Vector size; must be listed for the chosen release (default: {300})

    Raises:
        ValueError -- If the version is unknown or the dimension is not available for it
    """
    self.version = version
    self.dimension = dimension
    self.embedding_matrix: Optional[np.ndarray] = None

    # version -> (name used in error messages, allowed dimensions,
    #             archive/folder stem, vector-file stem)
    sources = {
        'wikipedia': ('Wikipedia', GloveEmbedding.wikipedia_dimensions,
                      'glove.6B', 'glove.6B'),
        'common-small': ('Common-Crawl Small', GloveEmbedding.common_small_dimensions,
                         'glove.42B.300d', 'glove.42B'),
        'common-large': ('Common-Crawl Large', GloveEmbedding.common_large_dimensions,
                         'glove.840B.300d', 'glove.840B'),
        'twitter': ('Twitter', GloveEmbedding.twitter_dimensions,
                    'glove.twitter.27B', 'glove.twitter.27B'),
    }
    if self.version not in sources:
        raise ValueError('Error: Invalid GLoVe Version: {}, Must be one of {}'.format(
            version, GloveEmbedding.valid_versions))
    dataset_name, valid_dimensions, archive_stem, file_stem = sources[self.version]

    # Make sure that the dimension is valid
    if self.dimension not in valid_dimensions:
        raise ValueError('Error: Invalid GLoVe dimension ({}) for {} dataset. Must be one of {}'.format(
            self.dimension, dataset_name, valid_dimensions))

    store_key = 'glove/{}/dim{}'.format(self.version, self.dimension)
    if DATA_STORE.is_valid(store_key):
        # A parsed encoder is already cached in the data store
        with open(DATA_STORE[store_key], 'rb') as pkl_file:
            self.encoder = pickle.load(pkl_file)
    else:
        # Download the file into the working directory
        archive_name = archive_stem + '.zip'
        maybe_download(file_name=archive_name,
                       source_url='http://nlp.stanford.edu/data/' + archive_name,
                       work_directory=DATA_STORE.working_directory,
                       postprocess=unzip)

        # Read the data keys from the file
        log_message('Loading vectors...')
        self.encoder: Dict[str, np.ndarray] = {}
        vectors_path = os.path.join(
            DATA_STORE.working_directory,
            '{}/{}.{}d.txt'.format(archive_stem, file_stem, self.dimension))
        with open(vectors_path, 'r', encoding='utf-8') as glove_file:
            for line in glove_file:
                # Take the vector from the right-hand side: a few GloVe
                # tokens contain whitespace themselves, which breaks a
                # plain "first field is the word" split.
                fields = line.rstrip().split(' ')
                word = ' '.join(fields[:-self.dimension])
                self.encoder[word] = np.array(
                    [float(x) for x in fields[-self.dimension:]])

        # Save the encoder
        with open(DATA_STORE.create_key(store_key, 'encoder.pkl', force=True), 'wb') as pkl_file:
            pickle.dump(self.encoder, pkl_file)
        DATA_STORE.update_hash(store_key)
def __init__(self, data_type="pointing", num_parallel_reads: int = 1, force_rebuild: bool = False, ignore_hashes=False, image_shape: Sequence[int] = [448, 448], read_codes=False, code_shape: Sequence[int] = [7, 7, 2048], merge_qa=False) -> None:
    """Download the Visual7W data and make sure the TF-Records exist.

    Keyword Arguments:
        data_type {str} -- Dataset variant; used in the download URL and in the store keys (default: {"pointing"})
        num_parallel_reads {int} -- Stored on the instance; not used in this method (default: {1})
        force_rebuild {bool} -- Rebuild the dictionary and the TF-Records (default: {False})
        ignore_hashes {bool} -- Skip the dictionary validity check and load the stored dictionary (default: {False})
        image_shape {Sequence[int]} -- Stored as image_resize_shape (default: {[448, 448]})
        read_codes {bool} -- Stored on the instance; not used in this method (default: {False})
        code_shape {Sequence[int]} -- Stored on the instance; not used in this method (default: {[7, 7, 2048]})
        merge_qa {bool} -- Stored on the instance; not used in this method (default: {False})
    """
    log_message("Building Dataset " + data_type)
    self.image_resize_shape = image_shape
    self.read_codes = read_codes
    self.code_shape = code_shape
    self.merge_qa = merge_qa

    # Get all of the necessary data
    self.images_key = maybe_download_and_store_zip(
        'http://vision.stanford.edu/yukezhu/visual7w_images.zip',
        'visual7w/data/images', use_subkeys=False)
    # Resolve the image folder only after the download above, so the key
    # exists on a first run.
    self.image_root_path = DATA_STORE["visual7w/data/images"]

    self.dataset_key = maybe_download_and_store_zip(
        "http://web.stanford.edu/~yukez/papers/resources/dataset_v7w_{0}.zip"
        .format(data_type),
        'visual7w/{0}/data/json'.format(data_type), use_subkeys=True)

    # Get the grounding data
    # NOTE(review): this key starts with "visual/", not "visual7w/" like the
    # others. Possibly a typo, but kept so existing stores stay valid.
    self.grounding_key = maybe_download_and_store_zip(
        "http://web.stanford.edu/~yukez/papers/resources/dataset_v7w_grounding_annotations.zip",
        "visual/data/grounding", use_subkeys=True)

    # Compute the size of the datasets
    self.num_train_examples = 0
    self.num_val_examples = 0
    self.num_test_examples = 0

    self.max_word_length = 44
    self.max_char_length = 26
    self.data_type = data_type

    root_key = "visual7w/{0}".format(data_type)
    dict_key = os.path.join(root_key, "dictionary")

    # Load the vocab files
    dictionary_is_new = not ignore_hashes and (
        force_rebuild or not DATA_STORE.is_valid(dict_key))
    self.dictionary = NLPDictionary()
    if not dictionary_is_new:
        # load() is called for its in-place effect, as elsewhere in this file
        self.dictionary.load(DATA_STORE[dict_key])

    # A brand-new (empty) dictionary is only useful if the records are
    # rebuilt with it, so it triggers a build just like force_rebuild.
    need_rebuild = force_rebuild or dictionary_is_new

    self.train_fpath = os.path.join(root_key, 'tfrecord/train')
    self.val_fpath = os.path.join(root_key, 'tfrecord/val')
    self.test_fpath = os.path.join(root_key, 'tfrecord/test')

    if need_rebuild:
        # Now that we have the data, load and parse the JSON file
        file_ = DATA_STORE[self.dataset_key[0]]
        with open(file_, 'r') as ptr:
            self._json = json.load(ptr)
        self._build_images()
        self._build_dataset()
    else:
        # Compute the size of the datasets
        # NOTE(review): num_test_examples is left at 0 on this path.
        self.num_train_examples = sum(
            1 for _ in tf.python_io.tf_record_iterator(DATA_STORE[
                os.path.join(self.train_fpath, "images")]))
        self.num_val_examples = sum(
            1 for _ in tf.python_io.tf_record_iterator(DATA_STORE[
                os.path.join(self.val_fpath, "images")]))

    # Setup some default options for the dataset
    self._val_db = None
    self._train_db = None
    self._test_db = None
    self.num_parallel_reads = num_parallel_reads

    # Save the vocab
    if need_rebuild:
        self.dictionary.save(
            DATA_STORE.create_key(dict_key, 'dict.pkl', force=True))
        DATA_STORE.update_hash(dict_key)

    self.word_vocab_size = len(self.dictionary.word_dictionary)
    self.char_vocab_size = len(self.dictionary.char_dictionary)
import os
from pathlib import Path
import shutil
from flux.backend.datastore import DataStore
from flux.util.logging import log_message
from datetime import datetime

# Guard: the lookup of `initialized` raises NameError the first time this
# code runs, in which case the flag is set and a message is logged.
try:
    initialized
except NameError:
    initialized = True
    log_message('Initializing...')


# Get the values from the path
def get_var(var_name: str, default: str) -> str:
    """Return the environment variable var_name as a string.

    Arguments:
        var_name {str} -- Name of the environment variable to read
        default {str} -- Value returned when the variable is not set

    Returns:
        str -- The variable's value, or default if it is unset
    """
    if os.environ.get(var_name) is None:
        return default
    else:
        return str(os.environ.get(var_name))


# Root folder for flux data; defaults to ~/.flux
ROOT_FPATH = get_var('FLUX_ROOT', os.path.join(str(Path.home()), '.flux'))
# Name of the configuration folder
CONFIG_FOLDER = get_var('FLUX_CONFIG_FOLDER', '.flux_config')
# Name of the active configuration file
CURR_CONFIG = get_var('FLUX_CONFIG', 'flux_config.json')
CONFIG_FILE = CURR_CONFIG


def retrack_config(reset=False):
    # NOTE(review): only the global declaration is visible here; the rest of
    # this function's body (and any use of `reset`) is not shown.
    global CURR_CONFIG, DATA_STORE, CONFIG_FILE
def __init__(self, version: str = None, num_parallel_reads: Optional[int] = None, force_rebuild=False, nohashcheck=False) -> None:
    """Download a Stanford NMT corpus and build its train/val/test datasets.

    Keyword Arguments:
        version {str} -- Language pair, 'en-vi' or 'en-de'. When None a hint is logged and construction stops early (default: {None})
        num_parallel_reads {Optional[int]} -- Stored on the instance; not used in this method (default: {None})
        force_rebuild {bool} -- Start from empty dictionaries and pass force_rebuild on to _build_dataset (default: {False})
        nohashcheck {bool} -- Accepted but not used in this method (default: {False})

    Raises:
        ValueError -- If version is given but is not a supported language pair
    """
    log_message("Building NMT...")
    if not Dataset.has_space(NMT.REQ_SIZE):
        return
    if version is None:
        log_message(
            "Please Select From following translation: en-vi, en-de")
        return

    # version -> (corpus folder, foreign-language suffix,
    #             validation file stem, test file stem, vocab file stem)
    #
    # File sizes noted by the original author:
    #   en-vi: train.en 13603614, train.vi 18074646, val.en 140250,
    #          val.vi 188396, test.en 132264, test.vi 183855,
    #          vocab.en 139741, vocab.vi 46767
    #   en-de: train.en 644874240, train.de 717225984, val.en 406528,
    #          val.de 470016, test.en 355328, test.de 405504,
    #          vocab.en 404480, vocab.de 504832
    corpora = {
        'en-vi': ("iwslt15.en-vi", "vi", "tst2012", "tst2013", "vocab"),
        'en-de': ("wmt14.en-de", "de", "newstest2012", "newstest2013", "vocab.50K"),
    }
    if version not in corpora:
        # Previously an unknown version fell through and failed later on an
        # unbound local variable.
        raise ValueError(
            "Invalid NMT version: {!r}. Must be one of: en-vi, en-de".format(version))
    folder, foreign, val_stem, test_stem, vocab_stem = corpora[version]

    self.num_parallel_reads = num_parallel_reads
    self.num_val_examples = None
    self.num_train_examples = None
    self.num_test_examples = None
    self.mwl = 40
    self.qwl = 40

    self.root_key = os.path.join("nmt", version)

    # These are URLs, so they are assembled with plain string formatting
    corpus_url = "https://nlp.stanford.edu/projects/nmt/data/" + folder + "/"
    train_eng_file = corpus_url + "train.en"
    train_for_file = corpus_url + "train." + foreign
    val_eng_file = corpus_url + val_stem + ".en"
    val_for_file = corpus_url + val_stem + "." + foreign
    test_eng_file = corpus_url + test_stem + ".en"
    test_for_file = corpus_url + test_stem + "." + foreign
    vocab_eng_file = corpus_url + vocab_stem + ".en"
    vocab_for_file = corpus_url + vocab_stem + "." + foreign

    # Download Files
    self.train_eng = maybe_download_and_store_single_file(
        train_eng_file, os.path.join(self.root_key, "train-en"))
    self.train_for = maybe_download_and_store_single_file(
        train_for_file, os.path.join(self.root_key, "train-for"))
    self.val_eng = maybe_download_and_store_single_file(
        val_eng_file, os.path.join(self.root_key, "val-en"))
    self.val_for = maybe_download_and_store_single_file(
        val_for_file, os.path.join(self.root_key, "val-for"))
    self.test_eng = maybe_download_and_store_single_file(
        test_eng_file, os.path.join(self.root_key, "test-en"))
    self.test_for = maybe_download_and_store_single_file(
        test_for_file, os.path.join(self.root_key, "test-for"))
    self.vocab_eng = maybe_download_and_store_single_file(
        vocab_eng_file, os.path.join(self.root_key, "vocab-en"))
    self.vocab_for = maybe_download_and_store_single_file(
        vocab_for_file, os.path.join(self.root_key, "vocab-for"))

    # Load the vocab files
    src_dictionary_key = os.path.join(self.root_key, "dictionary", "en")
    for_dictionary_key = os.path.join(self.root_key, "dictionary", "for")

    self.src_dictionary = NLPDictionary()
    self.dst_dictionary = NLPDictionary()
    if (DATA_STORE.is_valid(src_dictionary_key)
            and DATA_STORE.is_valid(for_dictionary_key)
            and not force_rebuild):
        # NOTE(review): the dictionaries are written below with pickle.dump
        # but read here with NLPDictionary.load -- confirm the two formats
        # agree.
        self.src_dictionary.load(DATA_STORE[src_dictionary_key])
        self.dst_dictionary.load(DATA_STORE[for_dictionary_key])

    self.num_train_examples = self._build_dataset(
        "train", force_rebuild=force_rebuild)
    self.num_val_examples = self._build_dataset(
        "val", force_rebuild=force_rebuild)
    self.num_test_examples = self._build_dataset(
        "test", force_rebuild=force_rebuild)

    # Persist both dictionaries
    with open(DATA_STORE.create_key(src_dictionary_key, 'dict.pkl', force=True), 'wb') as pkl_file:
        pickle.dump(self.src_dictionary, pkl_file)
    DATA_STORE.update_hash(src_dictionary_key)

    with open(DATA_STORE.create_key(for_dictionary_key, 'dict.pkl', force=True), 'wb') as pkl_file:
        pickle.dump(self.dst_dictionary, pkl_file)
    DATA_STORE.update_hash(for_dictionary_key)

    self.word_vocab_size = len(self.src_dictionary.word_dictionary)
    # TODO: Add current vocab size from vocab file

    self._train_db = None
    self._val_db = None
def _build_images(self, ) -> None:
    """Encode every image listed in self._json into per-split TF-Records.

    Each entry of self._json['images'] is written to the train, val or
    test record file according to its "split" field (anything other than
    "val" or "test" goes to train). Images that cannot be loaded or encoded
    are counted, logged and skipped.
    """
    # Define the Record Root
    train_record_root = os.path.join(self.train_fpath, "images")
    val_record_root = os.path.join(self.val_fpath, "images")
    test_record_root = os.path.join(self.test_fpath, "images")

    # Open the TFRecordWriters
    train_record_writer = tf.python_io.TFRecordWriter(
        DATA_STORE.create_key(train_record_root, 'data.tfrecords', force=True))
    val_record_writer = tf.python_io.TFRecordWriter(
        DATA_STORE.create_key(val_record_root, 'data.tfrecords', force=True))
    test_record_writer = tf.python_io.TFRecordWriter(
        DATA_STORE.create_key(test_record_root, 'data.tfrecords', force=True))
    writer_by_split = {"val": val_record_writer, "test": test_record_writer}

    # Loop over the data and parse
    errors = 0
    log_message('Building the image...')
    images = self._json['images']
    total_num_examples = len(images)

    try:
        for entry in tqdm.tqdm(images, total=total_num_examples):
            # Load the image
            filename = entry['filename']
            image_path = os.path.join(self.image_root_path, "images", filename)
            assert os.path.exists(image_path)

            image = load_image(image_path)
            encoded_image = None
            if image is not None:
                # Read the shape only once we know the load succeeded
                image_shape = list(image.shape)
                encoded_image = encode_jpeg(image)
            if encoded_image is None:
                errors += 1
                log_warning(
                    'Error loading image: {}. {} Errors so far.'.format(
                        image_path, errors))
                continue

            # Split the dataset
            tf_record_writer = writer_by_split.get(entry["split"], train_record_writer)

            image_id = entry['image_id']
            feature = {
                'image_size': _int64_feature(image_shape),
                'image_id': _int64_feature([image_id]),
                'image': _bytes_feature(tf.compat.as_bytes(encoded_image)),
            }

            # Write the TF-Record
            example = tf.train.Example(features=tf.train.Features(
                feature=feature))
            tf_record_writer.write(example.SerializeToString())
    finally:
        # Release the file handles even if an entry raised
        val_record_writer.close()
        train_record_writer.close()
        test_record_writer.close()

    # Only mark the records as valid after a complete, successful pass
    DATA_STORE.update_hash(test_record_root)
    DATA_STORE.update_hash(train_record_root)
    DATA_STORE.update_hash(val_record_root)
def __init__(self, num_parallel_reads: int=1, force_rebuild: bool=False, nohashcheck=False) -> None:
    """Download COCO 2014 captions data and make sure the TF-Records exist.

    Keyword Arguments:
        num_parallel_reads {int} -- Stored on the instance; not used in this method (default: {1})
        force_rebuild {bool} -- Re-download the archives and rebuild the dictionary and TF-Records (default: {False})
        nohashcheck {bool} -- Forwarded to DATA_STORE.is_valid for the archive and TF-Record keys (default: {False})
    """
    # Amount of Space Check.
    if not Dataset.has_space(COCOCaptions.AMT_REQUIRED):
        return

    # Fetch the raw archives that are missing (or all of them when forced)
    raw_archives = (
        ('coco2014/data/annotations',
         'http://images.cocodataset.org/annotations/annotations_trainval2014.zip'),
        ('coco2014/data/train/images',
         'http://images.cocodataset.org/zips/train2014.zip'),
        ('coco2014/data/val/images',
         'http://images.cocodataset.org/zips/val2014.zip'),
    )
    for store_key, url in raw_archives:
        if not DATA_STORE.is_valid(store_key, nohashcheck=nohashcheck) or force_rebuild:
            maybe_download_and_store_zip(url, store_key, use_subkeys=False)

    # TODO: Need to make sure that this works - there could be download issues, but it's hard to say
    self.train_json_key = 'coco2014/data/annotations'
    self.val_json_key = 'coco2014/data/annotations'
    log_message("Finished Downloading")

    # Decide what has to be rebuilt BEFORE loading the caption JSON, so a
    # stale dictionary also loads the annotations for both splits.
    dictionary_stale = not DATA_STORE.is_valid('coco2014/captions/dictionary') or force_rebuild
    need_rebuild_train = dictionary_stale or not DATA_STORE.is_valid(
        'coco2014/tfrecord/train', nohashcheck=nohashcheck)
    need_rebuild_val = dictionary_stale or not DATA_STORE.is_valid(
        'coco2014/tfrecord/val', nohashcheck=nohashcheck)

    # Now that we have the data, load and parse the JSON files
    # (presumably read by _build_dataset -- it is not visible here)
    if need_rebuild_train:
        with open(os.path.join(DATA_STORE[self.train_json_key], 'annotations/captions_train2014.json'), 'r') as annotation_file:
            self.train_json = json.load(annotation_file)
    if need_rebuild_val:
        with open(os.path.join(DATA_STORE[self.val_json_key], 'annotations/captions_val2014.json'), 'r') as annotation_file:
            self.val_json = json.load(annotation_file)

    # Load the vocab files
    self.dictionary = NLPDictionary()
    if not dictionary_stale:
        self.dictionary.load(DATA_STORE['coco2014/captions/dictionary'])

    # Setup some default options for the dataset
    self.max_word_length = 50
    self.max_char_length = 16
    self._val_db = None
    self._train_db = None
    self.num_parallel_reads = num_parallel_reads

    # Build the tfrecord dataset from the JSON
    if need_rebuild_train:
        self._build_dataset('train')
    if need_rebuild_val:
        self._build_dataset('val')

    self.train_fpath = DATA_STORE['coco2014/tfrecord/train']
    self.val_fpath = DATA_STORE['coco2014/tfrecord/val']
    log_message("Finished building tfrecords.")

    # Compute the size of the datasets
    self.num_train_examples = sum(
        1 for _ in tf.python_io.tf_record_iterator(DATA_STORE['coco2014/tfrecord/train']))
    self.num_val_examples = sum(
        1 for _ in tf.python_io.tf_record_iterator(DATA_STORE['coco2014/tfrecord/val']))

    # Save the vocab
    dict_path = DATA_STORE.create_key('coco2014/captions/dictionary', 'dict.pkl', force=True)
    self.dictionary.save(dict_path)
    DATA_STORE.update_hash('coco2014/captions/dictionary')

    self.word_vocab_size = len(self.dictionary.word_dictionary)
    self.char_vocab_size = len(self.dictionary.char_dictionary)
def _build_dataset(self, dataset: str) -> None:
    """Serialize one split of the VQA data into a TFRecord file

    Each question is paired with the annotation at the same index, parsed
    with the instance dictionary, and written together with the JPEG-encoded
    image. Entries whose image cannot be loaded/encoded are skipped with a
    warning. New answers are added to `self.class_map` as they are seen.

    Arguments:
        dataset {str} -- 'train' builds the training split; any other value
            builds the validation split
    """
    # Select the inputs for the requested split
    if dataset == 'train':
        record_root = 'vqa/tfrecord/train'
        json_a = self.train_a_json
        json_q = self.train_q_json
        root_fpath = DATA_STORE['coco2014/data/train/images']
    else:
        record_root = 'vqa/tfrecord/val'
        json_a = self.val_a_json
        json_q = self.val_q_json
        root_fpath = DATA_STORE['coco2014/data/val/images']

    questions = json_q['questions']

    # Construct the record writer
    tf_record_writer = tf.python_io.TFRecordWriter(
        DATA_STORE.create_key(record_root, 'data.tfrecords', force=True))

    # The writer is closed in `finally` so that a failure part-way through
    # (bad image, mismatched ids, parse error) does not leak the file handle.
    try:
        # Loop over the data and parse
        errors = 0
        log_message('Building {} dataset...'.format(dataset))
        # The progress total is the number of questions actually iterated
        for idx, entry in tqdm.tqdm(enumerate(questions), total=len(questions)):
            # Load the image
            image_fpath = build_fpath_from_image_id(
                root_fpath, entry['image_id'], dataset)
            image = load_image(image_fpath)
            image = encode_jpeg(image)
            if image is None:
                errors += 1
                log_warning(
                    'Error loading image: {}. {} Errors so far.'.format(
                        image_fpath, errors))
                continue

            # Questions and annotations are matched purely by position, so
            # make sure the two files really line up
            annotation = json_a['annotations'][idx]
            assert entry['question_id'] == annotation['question_id']

            # Parse the question and the answer
            question_raw = entry['question']
            question_dense, question_len = self.dictionary.dense_parse(
                question_raw,
                word_padding=self.max_word_length,
                char_padding=self.max_char_length)
            answer_raw = annotation['multiple_choice_answer']
            answer_dense, answer_len = self.dictionary.dense_parse(
                answer_raw,
                word_padding=self.max_word_length,
                char_padding=self.max_char_length)

            # Add the class mapping (ids are handed out in first-seen order)
            if answer_raw not in self.class_map:
                self.class_map[answer_raw] = len(self.class_map)
            answer_class = self.class_map[answer_raw]

            # Assemble the features; element 0 of a dense parse is stored as
            # the word embedding and element 1 as the char embedding
            feature = {
                'question_word_embedding': _int64_feature(np.ravel(question_dense[0]).astype(np.int64)),
                'question_char_embedding': _int64_feature(np.ravel(question_dense[1]).astype(np.int64)),
                'question_length': _int64_feature([question_len]),
                'answer_word_embedding': _int64_feature(np.ravel(answer_dense[0]).astype(np.int64)),
                'answer_char_embedding': _int64_feature(np.ravel(answer_dense[1]).astype(np.int64)),
                'answer_length': _int64_feature([answer_len]),
                'answer_class': _int64_feature([answer_class]),
                'image': _bytes_feature(tf.compat.as_bytes(image)),
            }

            # Write the TF-Record
            example = tf.train.Example(features=tf.train.Features(
                feature=feature))
            tf_record_writer.write(example.SerializeToString())
    finally:
        tf_record_writer.close()

    # Only mark the record as valid once it has been written completely
    DATA_STORE.update_hash(record_root)